From fa84ae26d62c7ac2ad6dca18b2d8b12ab83bc900 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 15:46:46 +0100 Subject: [PATCH] Reformat source code with black. This is the result of: $ black --line-length 119 examples templates transformers utils hubconf.py setup.py There's a lot of fairly long lines in the project. As a consequence, I'm picking the longest widely accepted line length, 119 characters. This is also Thomas' preference, because it allows for explicit variable names, to make the code easier to understand. --- examples/benchmarks.py | 169 +-- examples/contrib/run_camembert.py | 37 +- examples/contrib/run_openai_gpt.py | 163 +-- examples/contrib/run_swag.py | 450 ++++---- examples/contrib/run_transfo_xl.py | 92 +- examples/distillation/distiller.py | 344 +++--- .../distillation/grouped_batch_sampler.py | 13 +- examples/distillation/lm_seqs_dataset.py | 32 +- .../distillation/run_squad_w_distillation.py | 656 +++++++----- .../distillation/scripts/binarized_data.py | 67 +- examples/distillation/scripts/extract.py | 94 +- .../scripts/extract_distilbert.py | 85 +- examples/distillation/scripts/token_counts.py | 34 +- examples/distillation/train.py | 300 +++--- examples/distillation/utils.py | 32 +- examples/mm-imdb/run_mmimdb.py | 413 +++++--- examples/mm-imdb/utils_mmimdb.py | 58 +- examples/pplm/pplm_classification_head.py | 1 + examples/pplm/run_pplm.py | 466 ++++----- examples/pplm/run_pplm_discrim_train.py | 293 ++---- examples/run_bertology.py | 246 +++-- examples/run_generation.py | 64 +- examples/run_glue.py | 475 +++++---- examples/run_lm_finetuning.py | 440 +++++--- examples/run_multiple_choice.py | 454 +++++--- examples/run_ner.py | 376 ++++--- examples/run_squad.py | 678 +++++++----- examples/run_tf_glue.py | 66 +- examples/run_tf_ner.py | 510 ++++----- examples/run_xnli.py | 440 +++++--- ...ert_bertabs_original_pytorch_checkpoint.py | 45 +- examples/summarization/modeling_bertabs.py | 218 +--- examples/summarization/run_summarization.py | 67 +- examples/summarization/utils_summarization.py | 12 +- .../summarization/utils_summarization_test.py | 24 +- examples/test_examples.py | 88 +- examples/utils_multiple_choice.py | 135 +-- examples/utils_ner.py | 60 +- hubconf.py | 11 +- setup.py | 47 +- .../adding_a_new_example_script/run_xxx.py | 573 ++++++---- .../adding_a_new_example_script/utils_xxx.py | 287 ++--- .../adding_a_new_model/configuration_xxx.py | 42 +- ...t_xxx_original_tf_checkpoint_to_pytorch.py | 36 +- .../adding_a_new_model/modeling_tf_xxx.py | 100 +- templates/adding_a_new_model/modeling_xxx.py | 208 ++-- .../tests/modeling_tf_xxx_test.py | 173 +-- .../tests/modeling_xxx_test.py | 166 +-- .../tests/tokenization_xxx_test.py | 30 +- .../adding_a_new_model/tokenization_xxx.py | 65 +- transformers/__init__.py | 415 +++++--- transformers/__main__.py | 13 +- transformers/commands/__init__.py | 1 + transformers/commands/convert.py | 121 ++- transformers/commands/download.py | 15 +- transformers/commands/run.py | 77 +- transformers/commands/serving.py | 81 +- transformers/commands/train.py | 138 +-- transformers/commands/user.py | 72 +- transformers/configuration_albert.py | 52 +- transformers/configuration_auto.py | 81 +- transformers/configuration_bert.py | 72 +- transformers/configuration_camembert.py | 5 +- transformers/configuration_ctrl.py | 4 +- transformers/configuration_distilbert.py | 45 +- transformers/configuration_gpt2.py | 16 +- transformers/configuration_mmbt.py | 4 +- transformers/configuration_openai.py | 4 +- transformers/configuration_roberta.py | 15 +- transformers/configuration_t5.py | 38 +- transformers/configuration_transfo_xl.py | 66 +- transformers/configuration_utils.py | 107 +- transformers/configuration_xlm.py | 89 +- transformers/configuration_xlm_roberta.py | 15 +- transformers/configuration_xlnet.py | 55 +- ...lbert_original_tf_checkpoint_to_pytorch.py | 36 +- ..._bert_original_tf_checkpoint_to_pytorch.py | 36 +- ..._bert_pytorch_checkpoint_to_original_tf.py | 66 +- ..._gpt2_original_tf_checkpoint_to_pytorch.py | 42 +- ...penai_original_tf_checkpoint_to_pytorch.py | 48 +- .../convert_pytorch_checkpoint_to_tf2.py | 463 +++++--- ..._original_pytorch_checkpoint_to_pytorch.py | 103 +- ...rt_t5_original_tf_checkpoint_to_pytorch.py | 36 +- ...fo_xl_original_tf_checkpoint_to_pytorch.py | 78 +- ..._original_pytorch_checkpoint_to_pytorch.py | 38 +- ...xlnet_original_tf_checkpoint_to_pytorch.py | 73 +- transformers/data/__init__.py | 9 +- transformers/data/metrics/__init__.py | 7 +- transformers/data/metrics/squad_metrics.py | 165 ++- transformers/data/processors/__init__.py | 2 +- transformers/data/processors/glue.py | 256 +++-- transformers/data/processors/squad.py | 107 +- transformers/data/processors/utils.py | 110 +- transformers/data/processors/xnli.py | 14 +- transformers/file_utils.py | 140 ++- transformers/hf_api.py | 46 +- transformers/modelcard.py | 80 +- transformers/modeling_albert.py | 206 ++-- transformers/modeling_auto.py | 304 ++++-- transformers/modeling_bert.py | 460 +++++--- transformers/modeling_camembert.py | 49 +- transformers/modeling_ctrl.py | 121 ++- transformers/modeling_distilbert.py | 238 +++-- transformers/modeling_encoder_decoder.py | 25 +- transformers/modeling_gpt2.py | 173 +-- transformers/modeling_mmbt.py | 148 ++- transformers/modeling_openai.py | 147 ++- transformers/modeling_roberta.py | 203 ++-- transformers/modeling_t5.py | 260 +++-- transformers/modeling_tf_albert.py | 270 ++--- transformers/modeling_tf_auto.py | 264 +++-- transformers/modeling_tf_bert.py | 367 ++++--- transformers/modeling_tf_ctrl.py | 137 ++- transformers/modeling_tf_distilbert.py | 244 +++-- transformers/modeling_tf_gpt2.py | 174 ++-- transformers/modeling_tf_openai.py | 167 +-- transformers/modeling_tf_pytorch_utils.py | 107 +- transformers/modeling_tf_roberta.py | 102 +- transformers/modeling_tf_t5.py | 294 +++--- transformers/modeling_tf_transfo_xl.py | 305 +++--- .../modeling_tf_transfo_xl_utilities.py | 87 +- transformers/modeling_tf_utils.py | 179 ++-- transformers/modeling_tf_xlm.py | 236 +++-- transformers/modeling_tf_xlnet.py | 330 +++--- transformers/modeling_transfo_xl.py | 314 +++--- transformers/modeling_transfo_xl_utilities.py | 77 +- transformers/modeling_utils.py | 421 +++++--- transformers/modeling_xlm.py | 304 ++++-- transformers/modeling_xlm_roberta.py | 59 +- transformers/modeling_xlnet.py | 600 +++++++---- transformers/optimization.py | 60 +- transformers/optimization_tf.py | 281 +++-- transformers/pipelines.py | 466 +++++---- .../tests/configuration_common_test.py | 11 +- transformers/tests/hf_api_test.py | 28 +- transformers/tests/model_card_test.py | 64 +- transformers/tests/modeling_albert_test.py | 143 +-- transformers/tests/modeling_auto_test.py | 23 +- transformers/tests/modeling_bert_test.py | 310 ++++-- transformers/tests/modeling_common_test.py | 271 ++--- transformers/tests/modeling_ctrl_test.py | 110 +- .../tests/modeling_distilbert_test.py | 128 ++- .../tests/modeling_encoder_decoder_test.py | 6 +- transformers/tests/modeling_gpt2_test.py | 150 +-- transformers/tests/modeling_openai_test.py | 117 +-- transformers/tests/modeling_roberta_test.py | 185 ++-- transformers/tests/modeling_t5_test.py | 137 ++- transformers/tests/modeling_tf_albert_test.py | 153 ++- transformers/tests/modeling_tf_auto_test.py | 28 +- transformers/tests/modeling_tf_bert_test.py | 233 +++-- transformers/tests/modeling_tf_common_test.py | 90 +- transformers/tests/modeling_tf_ctrl_test.py | 103 +- .../tests/modeling_tf_distilbert_test.py | 128 +-- transformers/tests/modeling_tf_gpt2_test.py | 140 +-- .../tests/modeling_tf_openai_gpt_test.py | 142 +-- .../tests/modeling_tf_roberta_test.py | 166 ++- transformers/tests/modeling_tf_t5_test.py | 93 +- .../tests/modeling_tf_transfo_xl_test.py | 97 +- transformers/tests/modeling_tf_xlm_test.py | 239 +++-- transformers/tests/modeling_tf_xlnet_test.py | 269 +++-- .../tests/modeling_transfo_xl_test.py | 89 +- transformers/tests/modeling_xlm_test.py | 299 ++++-- transformers/tests/modeling_xlnet_test.py | 347 ++++-- transformers/tests/optimization_test.py | 33 +- transformers/tests/optimization_tf_test.py | 10 +- transformers/tests/pipelines_test.py | 122 +-- .../tests/tokenization_albert_test.py | 35 +- transformers/tests/tokenization_auto_test.py | 1 + .../tests/tokenization_bert_japanese_test.py | 129 +-- transformers/tests/tokenization_bert_test.py | 104 +- transformers/tests/tokenization_ctrl_test.py | 20 +- .../tests/tokenization_distilbert_test.py | 10 +- transformers/tests/tokenization_gpt2_test.py | 41 +- .../tests/tokenization_openai_test.py | 41 +- .../tests/tokenization_roberta_test.py | 52 +- transformers/tests/tokenization_t5_test.py | 88 +- .../tests/tokenization_tests_commons.py | 177 ++-- .../tests/tokenization_transfo_xl_test.py | 38 +- transformers/tests/tokenization_utils_test.py | 3 +- transformers/tests/tokenization_xlm_test.py | 42 +- transformers/tests/tokenization_xlnet_test.py | 154 ++- transformers/tests/utils.py | 1 + transformers/tokenization_albert.py | 118 ++- transformers/tokenization_auto.py | 46 +- transformers/tokenization_bert.py | 190 ++-- transformers/tokenization_bert_japanese.py | 142 +-- transformers/tokenization_camembert.py | 60 +- transformers/tokenization_ctrl.py | 68 +- transformers/tokenization_distilbert.py | 21 +- transformers/tokenization_gpt2.py | 120 ++- transformers/tokenization_openai.py | 92 +- transformers/tokenization_roberta.py | 90 +- transformers/tokenization_t5.py | 79 +- transformers/tokenization_transfo_xl.py | 265 ++--- transformers/tokenization_utils.py | 444 ++++---- transformers/tokenization_xlm.py | 985 +++++++++--------- transformers/tokenization_xlm_roberta.py | 75 +- transformers/tokenization_xlnet.py | 102 +- utils/download_glue_data.py | 77 +- utils/link_tester.py | 2 +- 200 files changed, 17452 insertions(+), 12594 deletions(-) diff --git a/examples/benchmarks.py b/examples/benchmarks.py index 26c260b9ec..20b62112b4 100644 --- a/examples/benchmarks.py +++ b/examples/benchmarks.py @@ -247,16 +247,18 @@ the wall, slowly on into the Social Predestination Room. as they entered.""" -def create_setup_and_compute(model_names: List[str], - gpu: bool = True, - tensorflow: bool = False, - average_over: int = 3, - torchscript: bool = False, - xla: bool = False, - amp: bool = False, - fp16: bool = False, - save_to_csv: bool = False, - csv_filename: str = f"results_{round(time())}.csv"): +def create_setup_and_compute( + model_names: List[str], + gpu: bool = True, + tensorflow: bool = False, + average_over: int = 3, + torchscript: bool = False, + xla: bool = False, + amp: bool = False, + fp16: bool = False, + save_to_csv: bool = False, + csv_filename: str = f"results_{round(time())}.csv", +): if xla: tf.config.optimizer.set_jit(True) if amp: @@ -266,7 +268,7 @@ def create_setup_and_compute(model_names: List[str], dictionary = {model_name: {} for model_name in model_names} results = _compute_tensorflow(model_names, dictionary, average_over, amp) else: - device = 'cuda' if (gpu and torch.cuda.is_available()) else 'cpu' + device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu" dictionary = {model_name: {} for model_name in model_names} results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16) @@ -276,34 +278,52 @@ def create_setup_and_compute(model_names: List[str], for batch_size in results[model_name]["bs"]: print("\t\t" + f"===== BATCH SIZE: {batch_size} =====") for slice_size in results[model_name]["ss"]: - result = results[model_name]['results'][batch_size][slice_size] + result = results[model_name]["results"][batch_size][slice_size] if isinstance(result, str): - print(f"\t\t{model_name}/{batch_size}/{slice_size}: " - f"{result}") + print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result}") else: - print(f"\t\t{model_name}/{batch_size}/{slice_size}: " - f"{(round(1000 * result) / 1000)}" - f"s") + print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{(round(1000 * result) / 1000)}" f"s") if save_to_csv: - with open(csv_filename, mode='w') as csv_file: - fieldnames = ['model', - '1x8', '1x64', '1x128', '1x256', '1x512', '1x1024', - '2x8', '2x64', '2x128', '2x256', '2x512', '2x1024', - '4x8', '4x64', '4x128', '4x256', '4x512', '4x1024', - '8x8', '8x64', '8x128', '8x256', '8x512', '8x1024', - ] + with open(csv_filename, mode="w") as csv_file: + fieldnames = [ + "model", + "1x8", + "1x64", + "1x128", + "1x256", + "1x512", + "1x1024", + "2x8", + "2x64", + "2x128", + "2x256", + "2x512", + "2x1024", + "4x8", + "4x64", + "4x128", + "4x256", + "4x512", + "4x1024", + "8x8", + "8x64", + "8x128", + "8x256", + "8x512", + "8x1024", + ] writer = csv.DictWriter(csv_file, fieldnames=fieldnames) writer.writeheader() for model_name in model_names: model_results = { - f'{bs}x{ss}': results[model_name]['results'][bs][ss] + f"{bs}x{ss}": results[model_name]["results"][bs][ss] for bs in results[model_name]["results"] - for ss in results[model_name]['results'][bs] + for ss in results[model_name]["results"][bs] } - writer.writerow({'model': model_name, **model_results}) + writer.writerow({"model": model_name, **model_results}) def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16): @@ -343,7 +363,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, print("Going through model with sequence of shape", sequence.shape) runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3) - average_time = sum(runtimes)/float(len(runtimes)) / 3.0 + average_time = sum(runtimes) / float(len(runtimes)) / 3.0 dictionary[model_name]["results"][batch_size][slice_size] = average_time except RuntimeError as e: print("Doesn't fit on GPU.", e) @@ -379,7 +399,9 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp): if max_input_size is not None and slice_size > max_input_size: dictionary[model_name]["results"][batch_size][slice_size] = "N/A" else: - sequence = tf.stack([tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size) + sequence = tf.stack( + [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size + ) try: print("Going through model with sequence of shape", sequence.shape) @@ -387,7 +409,7 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp): inference(sequence) runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3) - average_time = sum(runtimes)/float(len(runtimes)) / 3.0 + average_time = sum(runtimes) / float(len(runtimes)) / 3.0 dictionary[model_name]["results"][batch_size][slice_size] = average_time except tf.errors.ResourceExhaustedError as e: print("Doesn't fit on GPU.", e) @@ -399,33 +421,64 @@ def _compute_tensorflow(model_names, dictionary, average_over, amp): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--models", required=False, type=str, default='all', help="Model checkpoints to be provided " - "to the AutoModel classes. Leave " - "blank to benchmark the base version " - "of all available model " - "architectures.") - parser.add_argument("--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " - "models") - parser.add_argument("--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " - "cuda devices") - parser.add_argument("--torchscript", required=False, action="store_true", help="Pytorch only: trace the models " - "using torchscript") - parser.add_argument("--tensorflow", required=False, action="store_true", help="Benchmark the TensorFlow version " - "of the models. Will run on GPU if " - "the correct dependencies are " - "installed") + parser.add_argument( + "--models", + required=False, + type=str, + default="all", + help="Model checkpoints to be provided " + "to the AutoModel classes. Leave " + "blank to benchmark the base version " + "of all available model " + "architectures.", + ) + parser.add_argument( + "--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the " "models" + ) + parser.add_argument( + "--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available " "cuda devices" + ) + parser.add_argument( + "--torchscript", + required=False, + action="store_true", + help="Pytorch only: trace the models " "using torchscript", + ) + parser.add_argument( + "--tensorflow", + required=False, + action="store_true", + help="Benchmark the TensorFlow version " + "of the models. Will run on GPU if " + "the correct dependencies are " + "installed", + ) parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.") - parser.add_argument("--amp", required=False, action="store_true", help="TensorFlow only: use automatic mixed precision acceleration.") - parser.add_argument("--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference.") - parser.add_argument("--keras_predict", required=False, action="store_true", help="Whether to use model.predict " - "instead of model() to do a " - "forward pass.") + parser.add_argument( + "--amp", + required=False, + action="store_true", + help="TensorFlow only: use automatic mixed precision acceleration.", + ) + parser.add_argument( + "--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference." + ) + parser.add_argument( + "--keras_predict", + required=False, + action="store_true", + help="Whether to use model.predict " "instead of model() to do a " "forward pass.", + ) parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.") - parser.add_argument("--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv.") - parser.add_argument("--average_over", required=False, default=30, type=int, help="Times an experiment will be run.") + parser.add_argument( + "--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv." + ) + parser.add_argument( + "--average_over", required=False, default=30, type=int, help="Times an experiment will be run." + ) args = parser.parse_args() - if args.models == 'all': + if args.models == "all": args.models = [ "gpt2", "bert-base-cased", @@ -436,7 +489,7 @@ def main(): "distilbert-base-uncased", "distilgpt2", "roberta-base", - "ctrl" + "ctrl", ] else: args.models = args.models.split() @@ -453,7 +506,7 @@ def main(): fp16=args.fp16, save_to_csv=args.save_to_csv, csv_filename=args.csv_filename, - average_over=args.average_over + average_over=args.average_over, ) else: raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.") @@ -467,11 +520,11 @@ def main(): amp=args.amp, save_to_csv=args.save_to_csv, csv_filename=args.csv_filename, - average_over=args.average_over + average_over=args.average_over, ) else: raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.") -if __name__ == '__main__': - main() +if __name__ == "__main__": + main() diff --git a/examples/contrib/run_camembert.py b/examples/contrib/run_camembert.py index 28144d5167..99f54f5442 100644 --- a/examples/contrib/run_camembert.py +++ b/examples/contrib/run_camembert.py @@ -10,38 +10,37 @@ from transformers.modeling_camembert import CamembertForMaskedLM def fill_mask(masked_input, model, tokenizer, topk=5): # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py - assert masked_input.count('') == 1 + assert masked_input.count("") == 1 input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1 logits = model(input_ids)[0] # The last hidden-state is the first element of the output tuple masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item() logits = logits[0, masked_index, :] prob = logits.softmax(dim=0) values, indices = prob.topk(k=topk, dim=0) - topk_predicted_token_bpe = ' '.join([tokenizer.convert_ids_to_tokens(indices[i].item()) - for i in range(len(indices))]) + topk_predicted_token_bpe = " ".join( + [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))] + ) masked_token = tokenizer.mask_token topk_filled_outputs = [] - for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(' ')): - predicted_token = predicted_token_bpe.replace('\u2581', ' ') + for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")): + predicted_token = predicted_token_bpe.replace("\u2581", " ") if " {0}".format(masked_token) in masked_input: - topk_filled_outputs.append(( - masked_input.replace( - ' {0}'.format(masked_token), predicted_token - ), - values[index].item(), - predicted_token, - )) + topk_filled_outputs.append( + ( + masked_input.replace(" {0}".format(masked_token), predicted_token), + values[index].item(), + predicted_token, + ) + ) else: - topk_filled_outputs.append(( - masked_input.replace(masked_token, predicted_token), - values[index].item(), - predicted_token, - )) + topk_filled_outputs.append( + (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,) + ) return topk_filled_outputs -tokenizer = CamembertTokenizer.from_pretrained('camembert-base') -model = CamembertForMaskedLM.from_pretrained('camembert-base') +tokenizer = CamembertTokenizer.from_pretrained("camembert-base") +model = CamembertForMaskedLM.from_pretrained("camembert-base") model.eval() masked_input = "Le camembert est :)" diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py index bc5695becd..f6431c80be 100644 --- a/examples/contrib/run_openai_gpt.py +++ b/examples/contrib/run_openai_gpt.py @@ -36,34 +36,42 @@ from tqdm import tqdm, trange import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset -from transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, - AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME, - get_linear_schedule_with_warmup) +from transformers import ( + OpenAIGPTDoubleHeadsModel, + OpenAIGPTTokenizer, + AdamW, + cached_path, + WEIGHTS_NAME, + CONFIG_NAME, + get_linear_schedule_with_warmup, +) ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz" -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) logger = logging.getLogger(__name__) + def accuracy(out, labels): outputs = np.argmax(out, axis=1) return np.sum(outputs == labels) + def load_rocstories_dataset(dataset_path): """ Output a list of tuples(story, 1st continuation, 2nd continuation, label) """ - with open(dataset_path, encoding='utf_8') as f: + with open(dataset_path, encoding="utf_8") as f: f = csv.reader(f) output = [] - next(f) # skip the first line + next(f) # skip the first line for line in tqdm(f): - output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1)) + output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1)) return output + def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token): """ Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label) @@ -80,56 +88,68 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d for i, (story, cont1, cont2, mc_label), in enumerate(dataset): with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token] with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token] - input_ids[i, 0, :len(with_cont1)] = with_cont1 - input_ids[i, 1, :len(with_cont2)] = with_cont2 + input_ids[i, 0, : len(with_cont1)] = with_cont1 + input_ids[i, 1, : len(with_cont2)] = with_cont2 mc_token_ids[i, 0] = len(with_cont1) - 1 mc_token_ids[i, 1] = len(with_cont2) - 1 - lm_labels[i, 0, :len(with_cont1)] = with_cont1 - lm_labels[i, 1, :len(with_cont2)] = with_cont2 + lm_labels[i, 0, : len(with_cont1)] = with_cont1 + lm_labels[i, 1, : len(with_cont2)] = with_cont2 mc_labels[i] = mc_label all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels) tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs)) return tensor_datasets + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--model_name', type=str, default='openai-gpt', - help='pretrained model name') - parser.add_argument("--do_train", action='store_true', help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") - parser.add_argument('--train_dataset', type=str, default='') - parser.add_argument('--eval_dataset', type=str, default='') - parser.add_argument('--seed', type=int, default=42) - parser.add_argument('--num_train_epochs', type=int, default=3) - parser.add_argument('--train_batch_size', type=int, default=8) - parser.add_argument('--eval_batch_size', type=int, default=16) - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument('--max_grad_norm', type=int, default=1) - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training \ - steps to perform. Override num_train_epochs.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before\ - performing a backward/update pass.") - parser.add_argument('--learning_rate', type=float, default=6.25e-5) - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - parser.add_argument('--lr_schedule', type=str, default='warmup_linear') - parser.add_argument('--weight_decay', type=float, default=0.01) - parser.add_argument('--lm_coef', type=float, default=0.9) - parser.add_argument('--n_valid', type=int, default=374) + parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name") + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--train_dataset", type=str, default="") + parser.add_argument("--eval_dataset", type=str, default="") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--num_train_epochs", type=int, default=3) + parser.add_argument("--train_batch_size", type=int, default=8) + parser.add_argument("--eval_batch_size", type=int, default=16) + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", type=int, default=1) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training \ + steps to perform. Override num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before\ + performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", type=float, default=6.25e-5) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + parser.add_argument("--lr_schedule", type=str, default="warmup_linear") + parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--lm_coef", type=float, default=0.9) + parser.add_argument("--n_valid", type=int, default=374) - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -152,7 +172,7 @@ def main(): # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset - special_tokens = ['_start_', '_delimiter_', '_classify_'] + special_tokens = ["_start_", "_delimiter_", "_classify_"] tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name) tokenizer.add_tokens(special_tokens) special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens) @@ -163,6 +183,7 @@ def main(): # Load and encode the datasets if not args.train_dataset and not args.eval_dataset: roc_stories = cached_path(ROCSTORIES_URL) + def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): @@ -170,6 +191,7 @@ def main(): elif isinstance(obj, int): return obj return list(tokenize_and_encode(o) for o in obj) + logger.info("Encoding dataset...") train_dataset = load_rocstories_dataset(args.train_dataset) eval_dataset = load_rocstories_dataset(args.eval_dataset) @@ -178,8 +200,11 @@ def main(): # Compute the max input length for the Transformer max_length = model.config.n_positions // 2 - 2 - input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \ - for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) + input_length = max( + len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 + for dataset in encoded_datasets + for story, cont1, cont2, _ in dataset + ) input_length = min(input_length, model.config.n_positions) # Max size of input for the pre-trained model # Prepare inputs tensors and dataloaders @@ -198,20 +223,23 @@ def main(): if args.do_train: if args.max_steps > 0: t_total = args.max_steps - args.num_train_epochs = args.max_steps //\ - (len(train_dataloader) // args.gradient_accumulation_steps) + 1 + args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: - t_total = len(train_dataloader)\ - // args.gradient_accumulation_steps * args.num_train_epochs + t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None @@ -230,14 +258,16 @@ def main(): optimizer.step() optimizer.zero_grad() tr_loss += loss.item() - exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item() + exp_average_loss = ( + loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item() + ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0]) # Save a trained model if args.do_train: # Save a trained model, configuration and tokenizer - model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself + model_to_save = model.module if hasattr(model, "module") else model # Only save the model itself # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) @@ -260,10 +290,12 @@ def main(): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch with torch.no_grad(): - _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels) + _, mc_loss, _, mc_logits = model( + input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels + ) mc_logits = mc_logits.detach().cpu().numpy() - mc_labels = mc_labels.to('cpu').numpy() + mc_labels = mc_labels.to("cpu").numpy() tmp_eval_accuracy = accuracy(mc_logits, mc_labels) eval_loss += mc_loss.mean().item() @@ -274,10 +306,8 @@ def main(): eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples - train_loss = tr_loss/nb_tr_steps if args.do_train else None - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'train_loss': train_loss} + train_loss = tr_loss / nb_tr_steps if args.do_train else None + result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: @@ -286,5 +316,6 @@ def main(): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index 5de93db7fe..d03d1aacec 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -28,8 +28,7 @@ import glob import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -39,31 +38,23 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForMultipleChoice, BertTokenizer) +from transformers import WEIGHTS_NAME, BertConfig, BertForMultipleChoice, BertTokenizer from transformers import AdamW, get_linear_schedule_with_warmup logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ - for conf in [BertConfig]), ()) +ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in [BertConfig]), ()) MODEL_CLASSES = { - 'bert': (BertConfig, BertForMultipleChoice, BertTokenizer), + "bert": (BertConfig, BertForMultipleChoice, BertTokenizer), } + class SwagExample(object): """A single training/test example for the SWAG dataset.""" - def __init__(self, - swag_id, - context_sentence, - start_ending, - ending_0, - ending_1, - ending_2, - ending_3, - label = None): + + def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None): self.swag_id = swag_id self.context_sentence = context_sentence self.start_ending = start_ending @@ -94,57 +85,49 @@ class SwagExample(object): return ", ".join(l) -class InputFeatures(object): - def __init__(self, - example_id, - choices_features, - label - ): +class InputFeatures(object): + def __init__(self, example_id, choices_features, label): self.example_id = example_id self.choices_features = [ - { - 'input_ids': input_ids, - 'input_mask': input_mask, - 'segment_ids': segment_ids - } + {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids} for _, input_ids, input_mask, segment_ids in choices_features ] self.label = label + def read_swag_examples(input_file, is_training=True): - with open(input_file, 'r', encoding='utf-8') as f: + with open(input_file, "r", encoding="utf-8") as f: reader = csv.reader(f) lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) + line = list(unicode(cell, "utf-8") for cell in line) lines.append(line) - if is_training and lines[0][-1] != 'label': - raise ValueError( - "For training, the input file must contain a label column." - ) + if is_training and lines[0][-1] != "label": + raise ValueError("For training, the input file must contain a label column.") examples = [ SwagExample( - swag_id = line[2], - context_sentence = line[4], - start_ending = line[5], # in the swag dataset, the - # common beginning of each - # choice is stored in "sent2". - ending_0 = line[7], - ending_1 = line[8], - ending_2 = line[9], - ending_3 = line[10], - label = int(line[11]) if is_training else None - ) for line in lines[1:] # we skip the line with the column names + swag_id=line[2], + context_sentence=line[4], + start_ending=line[5], # in the swag dataset, the + # common beginning of each + # choice is stored in "sent2". + ending_0=line[7], + ending_1=line[8], + ending_2=line[9], + ending_3=line[10], + label=int(line[11]) if is_training else None, + ) + for line in lines[1:] # we skip the line with the column names ] return examples -def convert_examples_to_features(examples, tokenizer, max_seq_length, - is_training): + +def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training): """Loads a data file into a list of `InputBatch`s.""" # Swag is a multiple choice task. To perform this task using Bert, @@ -204,23 +187,18 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, logger.info("swag_id: {}".format(example.swag_id)) for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features): logger.info("choice: {}".format(choice_idx)) - logger.info("tokens: {}".format(' '.join(tokens))) - logger.info("input_ids: {}".format(' '.join(map(str, input_ids)))) - logger.info("input_mask: {}".format(' '.join(map(str, input_mask)))) - logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids)))) + logger.info("tokens: {}".format(" ".join(tokens))) + logger.info("input_ids: {}".format(" ".join(map(str, input_ids)))) + logger.info("input_mask: {}".format(" ".join(map(str, input_mask)))) + logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids)))) if is_training: logger.info("label: {}".format(label)) - features.append( - InputFeatures( - example_id = example.swag_id, - choices_features = choices_features, - label = label - ) - ) + features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label)) return features + def _truncate_seq_pair(tokens_a, tokens_b, max_length): """Truncates a sequence pair in place to the maximum length.""" @@ -237,18 +215,14 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): else: tokens_b.pop() + def accuracy(out, labels): outputs = np.argmax(out, axis=1) return np.sum(outputs == labels) + def select_field(features, field): - return [ - [ - choice[field] - for choice in feature.choices_features - ] - for feature in features - ] + return [[choice[field] for choice in feature.choices_features] for feature in features] def set_seed(args): @@ -258,24 +232,28 @@ def set_seed(args): if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) + def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False): if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Load data features from cache or dataset file input_file = args.predict_file if evaluate else args.train_file - cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length))) + cached_features_file = os.path.join( + os.path.dirname(input_file), + "cached_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", input_file) examples = read_swag_examples(input_file) - features = convert_examples_to_features( - examples, tokenizer, args.max_seq_length, not evaluate) + features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) @@ -285,21 +263,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Convert to Tensors and build dataset - all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long) - all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long) - all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long) + all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long) + all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long) + all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long) all_label = torch.tensor([f.label for f in features], dtype=torch.long) if evaluate: - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_label) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) else: - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_label) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if output_examples: return dataset, examples, features return dataset + + def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: @@ -316,13 +294,18 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -336,17 +319,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -360,11 +347,13 @@ def train(args, train_dataset, model, tokenizer): for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - #'token_type_ids': None if args.model_type == 'xlm' else batch[2], - 'token_type_ids': batch[2], - 'labels': batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + #'token_type_ids': None if args.model_type == 'xlm' else batch[2], + "token_type_ids": batch[2], + "labels": batch[3], + } # if args.model_type in ['xlnet', 'xlm']: # inputs.update({'cls_index': batch[5], # 'p_mask': batch[6]}) @@ -372,7 +361,7 @@ def train(args, train_dataset, model, tokenizer): loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training + loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -393,23 +382,27 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_vocabulary(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -424,6 +417,7 @@ def train(args, train_dataset, model, tokenizer): return global_step, tr_loss / global_step + def evaluate(args, model, tokenizer, prefix=""): dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) @@ -440,7 +434,6 @@ def evaluate(args, model, tokenizer, prefix=""): logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) - eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 @@ -448,11 +441,13 @@ def evaluate(args, model, tokenizer, prefix=""): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - # 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids - 'token_type_ids': batch[2], - 'labels': batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + # 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids + "token_type_ids": batch[2], + "labels": batch[3], + } # if args.model_type in ['xlnet', 'xlm']: # inputs.update({'cls_index': batch[4], @@ -462,17 +457,16 @@ def evaluate(args, model, tokenizer, prefix=""): eval_loss += tmp_eval_loss.mean().item() logits = logits.detach().cpu().numpy() - label_ids = inputs['labels'].to('cpu').numpy() + label_ids = inputs["labels"].to("cpu").numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_accuracy += tmp_eval_accuracy nb_eval_steps += 1 - nb_eval_examples += inputs['input_ids'].size(0) + nb_eval_examples += inputs["input_ids"].size(0) eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy} + result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: @@ -483,92 +477,144 @@ def evaluate(args, model, tokenizer, prefix=""): return result + def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--train_file", default=None, type=str, required=True, - help="SWAG csv for training. E.g., train.csv") - parser.add_argument("--predict_file", default=None, type=str, required=True, - help="SWAG csv for predictions. E.g., val.csv or test.csv") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model checkpoints and predictions will be written.") + parser.add_argument( + "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv" + ) + parser.add_argument( + "--predict_file", + default=None, + type=str, + required=True, + help="SWAG csv for predictions. E.g., val.csv or test.csv", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints and predictions will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--max_seq_length", default=384, type=int, - help="The maximum total input sequence length after tokenization. Sequences " - "longer than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--max_seq_length", + default=384, + type=int, + help="The maximum total input sequence length after tokenization. Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument("--local_rank", type=int, default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -580,16 +626,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -601,8 +655,12 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) - model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case + ) + model = model_class.from_pretrained( + args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -617,7 +675,6 @@ def main(): global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Save the trained model and the tokenizer if args.local_rank == -1 or torch.distributed.get_rank() == 0: # Create output directory if needed @@ -627,19 +684,20 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory results = {} if args.do_eval and args.local_rank in [-1, 0]: @@ -650,14 +708,16 @@ def main(): checkpoints = [args.model_name_or_path] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: # Reload the model - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) tokenizer = tokenizer_class.from_pretrained(checkpoint) model.to(args.device) @@ -665,7 +725,7 @@ def main(): # Evaluate result = evaluate(args, model, tokenizer, prefix=global_step) - result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items()) + result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items()) results.update(result) logger.info("Results: {}".format(results)) diff --git a/examples/contrib/run_transfo_xl.py b/examples/contrib/run_transfo_xl.py index f5375269b8..1ef66bef1e 100644 --- a/examples/contrib/run_transfo_xl.py +++ b/examples/contrib/run_transfo_xl.py @@ -30,44 +30,36 @@ import torch from transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) logger = logging.getLogger(__name__) + def main(): - parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') - parser.add_argument('--model_name', type=str, default='transfo-xl-wt103', - help='pretrained model name') - parser.add_argument('--split', type=str, default='test', - choices=['all', 'valid', 'test'], - help='which split to evaluate') - parser.add_argument('--batch_size', type=int, default=10, - help='batch size') - parser.add_argument('--tgt_len', type=int, default=128, - help='number of tokens to predict') - parser.add_argument('--ext_len', type=int, default=0, - help='length of the extended context') - parser.add_argument('--mem_len', type=int, default=1600, - help='length of the retained previous heads') - parser.add_argument('--clamp_len', type=int, default=1000, - help='max positional embedding index') - parser.add_argument('--no_cuda', action='store_true', - help='Do not use CUDA even though CUA is available') - parser.add_argument('--work_dir', type=str, required=True, - help='path to the work_dir') - parser.add_argument('--no_log', action='store_true', - help='do not log the eval result') - parser.add_argument('--same_length', action='store_true', - help='set same length attention with masking') - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model") + parser.add_argument("--model_name", type=str, default="transfo-xl-wt103", help="pretrained model name") + parser.add_argument( + "--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate" + ) + parser.add_argument("--batch_size", type=int, default=10, help="batch size") + parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict") + parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context") + parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads") + parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index") + parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUA is available") + parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir") + parser.add_argument("--no_log", action="store_true", help="do not log the eval result") + parser.add_argument("--same_length", action="store_true", help="set same length attention with masking") + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() - assert args.ext_len >= 0, 'extended context length must be non-negative' + assert args.ext_len >= 0, "extended context length must be non-negative" if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -84,17 +76,18 @@ def main(): corpus = TransfoXLCorpus.from_pretrained(args.model_name) ntokens = len(corpus.vocab) - va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len, - device=device, ext_len=args.ext_len) - te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len, - device=device, ext_len=args.ext_len) + va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len) + te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len) # Load a pre-trained model model = TransfoXLLMHeadModel.from_pretrained(args.model_name) model = model.to(device) - logger.info('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( - args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) + logger.info( + "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format( + args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len + ) + ) model.reset_length(args.tgt_len, args.ext_len, args.mem_len) if args.clamp_len > 0: @@ -108,7 +101,7 @@ def main(): def evaluate(eval_iter): # Turn on evaluation mode which disables dropout. model.eval() - total_len, total_loss = 0, 0. + total_len, total_loss = 0, 0.0 start_time = time.time() with torch.no_grad(): mems = None @@ -119,35 +112,34 @@ def main(): total_loss += seq_len * loss.item() total_len += seq_len total_time = time.time() - start_time - logger.info('Time : {:.2f}s, {:.2f}ms/segment'.format( - total_time, 1000 * total_time / (idx+1))) + logger.info("Time : {:.2f}s, {:.2f}ms/segment".format(total_time, 1000 * total_time / (idx + 1))) return total_loss / total_len # Run on test data. - if args.split == 'all': + if args.split == "all": test_loss = evaluate(te_iter) valid_loss = evaluate(va_iter) - elif args.split == 'valid': + elif args.split == "valid": valid_loss = evaluate(va_iter) test_loss = None - elif args.split == 'test': + elif args.split == "test": test_loss = evaluate(te_iter) valid_loss = None def format_log(loss, split): - log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( - split, loss, math.exp(loss)) + log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss)) return log_str - log_str = '' + log_str = "" if valid_loss is not None: - log_str += format_log(valid_loss, 'valid') + log_str += format_log(valid_loss, "valid") if test_loss is not None: - log_str += format_log(test_loss, 'test') + log_str += format_log(test_loss, "test") - logger.info('=' * 100) + logger.info("=" * 100) logger.info(log_str) - logger.info('=' * 100) + logger.info("=" * 100) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index d5a86247a8..e3bf0d443e 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -40,14 +40,12 @@ from utils import logger from lm_seqs_dataset import LmSeqsDataset from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups + class Distiller: - def __init__(self, - params: dict, - dataset: LmSeqsDataset, - token_probs: torch.tensor, - student: nn.Module, - teacher: nn.Module): - logger.info('Initializing Distiller') + def __init__( + self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module + ): + logger.info("Initializing Distiller") self.params = params self.dump_path = params.dump_path self.multi_gpu = params.multi_gpu @@ -70,12 +68,10 @@ class Distiller: else: sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False) - self.dataloader = DataLoader(dataset=dataset, - batch_sampler=sampler, - collate_fn=dataset.batch_sequences) + self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences) self.temperature = params.temperature - assert self.temperature > 0. + assert self.temperature > 0.0 self.alpha_ce = params.alpha_ce self.alpha_mlm = params.alpha_mlm @@ -85,18 +81,18 @@ class Distiller: self.mlm = params.mlm if self.mlm: - logger.info(f'Using MLM loss for LM step.') + logger.info(f"Using MLM loss for LM step.") self.mlm_mask_prop = params.mlm_mask_prop assert 0.0 <= self.mlm_mask_prop <= 1.0 assert params.word_mask + params.word_keep + params.word_rand == 1.0 self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand]) - self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs - self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs + self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs + self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs if self.fp16: self.pred_probs = self.pred_probs.half() self.token_probs = self.token_probs.half() else: - logger.info(f'Using CLM loss for LM step.') + logger.info(f"Using CLM loss for LM step.") self.epoch = 0 self.n_iter = 0 @@ -107,38 +103,54 @@ class Distiller: self.last_loss_ce = 0 self.last_loss_mlm = 0 self.last_loss_clm = 0 - if self.alpha_mse > 0.: self.last_loss_mse = 0 - if self.alpha_cos > 0.: self.last_loss_cos = 0 + if self.alpha_mse > 0.0: + self.last_loss_mse = 0 + if self.alpha_cos > 0.0: + self.last_loss_cos = 0 self.last_log = 0 - self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean') + self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean") self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100) - if self.alpha_mse > 0.: - self.mse_loss_fct = nn.MSELoss(reduction='sum') - if self.alpha_cos > 0.: - self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean') + if self.alpha_mse > 0.0: + self.mse_loss_fct = nn.MSELoss(reduction="sum") + if self.alpha_cos > 0.0: + self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean") - logger.info('--- Initializing model optimizer') + logger.info("--- Initializing model optimizer") assert params.gradient_accumulation_steps >= 1 self.num_steps_epoch = len(self.dataloader) - num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1 + num_train_optimization_steps = ( + int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1 + ) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': params.weight_decay}, - {'params': [p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0} + { + "params": [ + p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad + ], + "weight_decay": params.weight_decay, + }, + { + "params": [ + p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad + ], + "weight_decay": 0.0, + }, ] - logger.info("------ Number of trainable parameters (student): %i" % sum([p.numel() for p in self.student.parameters() if p.requires_grad])) + logger.info( + "------ Number of trainable parameters (student): %i" + % sum([p.numel() for p in self.student.parameters() if p.requires_grad]) + ) logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()])) - self.optimizer = AdamW(optimizer_grouped_parameters, - lr=params.learning_rate, - eps=params.adam_epsilon, - betas=(0.9, 0.98)) + self.optimizer = AdamW( + optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98) + ) warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop) - self.scheduler = get_linear_schedule_with_warmup(self.optimizer, - num_warmup_steps=warmup_steps, - num_training_steps=num_train_optimization_steps) + self.scheduler = get_linear_schedule_with_warmup( + self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps + ) if self.fp16: try: @@ -146,33 +158,36 @@ class Distiller: except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level") - self.student, self.optimizer = amp.initialize(self.student, - self.optimizer, - opt_level=self.params.fp16_opt_level) + self.student, self.optimizer = amp.initialize( + self.student, self.optimizer, opt_level=self.params.fp16_opt_level + ) self.teacher = self.teacher.half() if self.multi_gpu: if self.fp16: from apex.parallel import DistributedDataParallel + logger.info("Using apex.parallel.DistributedDataParallel for distributed training.") self.student = DistributedDataParallel(self.student) else: from torch.nn.parallel import DistributedDataParallel + logger.info("Using nn.parallel.DistributedDataParallel for distributed training.") - self.student = DistributedDataParallel(self.student, - device_ids=[params.local_rank], - output_device=params.local_rank, - find_unused_parameters=True) + self.student = DistributedDataParallel( + self.student, + device_ids=[params.local_rank], + output_device=params.local_rank, + find_unused_parameters=True, + ) self.is_master = params.is_master if self.is_master: - logger.info('--- Initializing Tensorboard') - self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train')) - self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0) - self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0) + logger.info("--- Initializing Tensorboard") + self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train")) + self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0) + self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0) - def prepare_batch_mlm(self, - batch): + def prepare_batch_mlm(self, batch): """ Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM. @@ -192,7 +207,7 @@ class Distiller: token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths) assert token_ids.size(0) == lengths.size(0) - attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]) + attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None] bs, max_seq_len = token_ids.size() mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids) @@ -200,11 +215,13 @@ class Distiller: x_prob = self.token_probs[token_ids.flatten()] n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item()) tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False) - pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility + pred_mask = torch.zeros( + bs * max_seq_len, dtype=torch.bool, device=token_ids.device + ) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility pred_mask[tgt_ids] = 1 pred_mask = pred_mask.view(bs, max_seq_len) - pred_mask[token_ids == self.params.special_tok_ids['pad_token']] = 0 + pred_mask[token_ids == self.params.special_tok_ids["pad_token"]] = 0 # mask a number of words == 0 [8] (faster with fp16) if self.fp16: @@ -213,26 +230,29 @@ class Distiller: pred_mask = pred_mask.view(-1) n2 = max(n1 % 8, 8 * (n1 // 8)) if n2 != n1: - pred_mask[torch.nonzero(pred_mask).view(-1)[:n1-n2]] = 0 + pred_mask[torch.nonzero(pred_mask).view(-1)[: n1 - n2]] = 0 pred_mask = pred_mask.view(bs, max_seq_len) assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item() _token_ids_real = token_ids[pred_mask] _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size) - _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token']) + _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids["mask_token"]) probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True) - _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long() + _token_ids = ( + _token_ids_mask * (probs == 0).long() + + _token_ids_real * (probs == 1).long() + + _token_ids_rand * (probs == 2).long() + ) token_ids = token_ids.masked_scatter(pred_mask, _token_ids) - mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility + mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size return token_ids, attn_mask, mlm_labels - def prepare_batch_clm(self, - batch): + def prepare_batch_clm(self, batch): """ Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM. @@ -252,18 +272,16 @@ class Distiller: token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths) assert token_ids.size(0) == lengths.size(0) - attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]) + attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None] clm_labels = token_ids.new(token_ids.size()).copy_(token_ids) - clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility + clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size return token_ids, attn_mask, clm_labels - def round_batch(self, - x: torch.tensor, - lengths: torch.tensor): + def round_batch(self, x: torch.tensor, lengths: torch.tensor): """ For float16 only. Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8. @@ -299,9 +317,9 @@ class Distiller: pad = 8 - (ml1 % 8) ml2 = ml1 + pad if self.mlm: - pad_id = self.params.special_tok_ids['pad_token'] + pad_id = self.params.special_tok_ids["pad_token"] else: - pad_id = self.params.special_tok_ids['unk_token'] + pad_id = self.params.special_tok_ids["unk_token"] padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id) x = torch.cat([x, padding_tensor], 1) assert x.size() == (bs2, ml2) @@ -314,20 +332,22 @@ class Distiller: """ The real training loop. """ - if self.is_master: logger.info('Starting training') + if self.is_master: + logger.info("Starting training") self.last_log = time.time() self.student.train() self.teacher.eval() for _ in range(self.params.n_epoch): - if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}') + if self.is_master: + logger.info(f"--- Starting epoch {self.epoch}/{self.params.n_epoch-1}") if self.multi_gpu: torch.distributed.barrier() iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0]) for batch in iter_bar: if self.params.n_gpu > 0: - batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch) + batch = tuple(t.to(f"cuda:{self.params.local_rank}") for t in batch) if self.mlm: token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch) @@ -336,22 +356,21 @@ class Distiller: self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels) iter_bar.update() - iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}', - 'Avg_cum_loss': f'{self.total_loss_epoch/self.n_iter:.2f}'}) + iter_bar.set_postfix( + {"Last_loss": f"{self.last_loss:.2f}", "Avg_cum_loss": f"{self.total_loss_epoch/self.n_iter:.2f}"} + ) iter_bar.close() - if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}') + if self.is_master: + logger.info(f"--- Ending epoch {self.epoch}/{self.params.n_epoch-1}") self.end_epoch() if self.is_master: - logger.info(f'Save very last checkpoint as `pytorch_model.bin`.') - self.save_checkpoint(checkpoint_name=f'pytorch_model.bin') - logger.info('Training is finished') + logger.info(f"Save very last checkpoint as `pytorch_model.bin`.") + self.save_checkpoint(checkpoint_name=f"pytorch_model.bin") + logger.info("Training is finished") - def step(self, - input_ids: torch.tensor, - attention_mask: torch.tensor, - lm_labels: torch.tensor): + def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor): """ One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation), and possibly a parameter update (depending on the gradient accumulation). @@ -363,78 +382,91 @@ class Distiller: lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM). """ if self.mlm: - s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size) + s_logits, s_hidden_states = self.student( + input_ids=input_ids, attention_mask=attention_mask + ) # (bs, seq_length, voc_size) with torch.no_grad(): - t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size) + t_logits, t_hidden_states = self.teacher( + input_ids=input_ids, attention_mask=attention_mask + ) # (bs, seq_length, voc_size) else: - s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size) + s_logits, _, s_hidden_states = self.student( + input_ids=input_ids, attention_mask=None + ) # (bs, seq_length, voc_size) with torch.no_grad(): - t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size) + t_logits, _, t_hidden_states = self.teacher( + input_ids=input_ids, attention_mask=None + ) # (bs, seq_length, voc_size) assert s_logits.size() == t_logits.size() - #https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 - #https://github.com/peterliht/knowledge-distillation-pytorch/issues/2 + # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 + # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2 if self.params.restrict_ce_to_mask: - mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size) + mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size) else: - mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size) - s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask - s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask - t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask - t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask + mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size) + s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask + s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask + t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask + t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask assert t_logits_slct.size() == s_logits_slct.size() - loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1), - F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2 - loss = self.alpha_ce*loss_ce + loss_ce = ( + self.ce_loss_fct( + F.log_softmax(s_logits_slct / self.temperature, dim=-1), + F.softmax(t_logits_slct / self.temperature, dim=-1), + ) + * (self.temperature) ** 2 + ) + loss = self.alpha_ce * loss_ce - if self.alpha_mlm > 0.: + if self.alpha_mlm > 0.0: loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1)) loss += self.alpha_mlm * loss_mlm - if self.alpha_clm > 0.: + if self.alpha_clm > 0.0: shift_logits = s_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() - loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) loss += self.alpha_clm * loss_clm - if self.alpha_mse > 0.: - loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction + if self.alpha_mse > 0.0: + loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size( + 0 + ) # Reproducing batchmean reduction loss += self.alpha_mse * loss_mse - if self.alpha_cos > 0.: - s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim) - t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim) - mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim) + if self.alpha_cos > 0.0: + s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim) + t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim) + mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim) assert s_hidden_states.size() == t_hidden_states.size() dim = s_hidden_states.size(-1) - - s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim) - s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim) - t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim) - t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim) - - target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,) + + s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim) + s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim) + t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim) + t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim) + + target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,) loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target) loss += self.alpha_cos * loss_cos self.total_loss_epoch += loss.item() self.last_loss = loss.item() self.last_loss_ce = loss_ce.item() - if self.alpha_mlm > 0.: + if self.alpha_mlm > 0.0: self.last_loss_mlm = loss_mlm.item() - if self.alpha_clm > 0.: + if self.alpha_clm > 0.0: self.last_loss_clm = loss_clm.item() - if self.alpha_mse > 0.: + if self.alpha_mse > 0.0: self.last_loss_mse = loss_mse.item() - if self.alpha_cos > 0.: + if self.alpha_cos > 0.0: self.last_loss_cos = loss_cos.item() self.optimize(loss) self.n_sequences_epoch += input_ids.size(0) - def optimize(self, - loss): + def optimize(self, loss): """ Normalization on the loss (gradient accumulation or distributed training), followed by backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation). @@ -442,7 +474,7 @@ class Distiller: """ # Check for NaN if (loss != loss).data.any(): - logger.error('NaN detected') + logger.error("NaN detected") exit() if self.multi_gpu: @@ -452,6 +484,7 @@ class Distiller: if self.fp16: from apex import amp + with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: @@ -488,53 +521,84 @@ class Distiller: return for param_name, param in self.student.named_parameters(): - self.tensorboard.add_scalar(tag='parameter_mean/' + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter) - self.tensorboard.add_scalar(tag='parameter_std/' + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter) + self.tensorboard.add_scalar( + tag="parameter_mean/" + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter + ) + self.tensorboard.add_scalar( + tag="parameter_std/" + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter + ) if param.grad is None: continue - self.tensorboard.add_scalar(tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(),global_step=self.n_total_iter) - self.tensorboard.add_scalar(tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter) + self.tensorboard.add_scalar( + tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter + ) + self.tensorboard.add_scalar( + tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter + ) - self.tensorboard.add_scalar(tag="losses/cum_avg_loss_epoch", scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.n_total_iter) + self.tensorboard.add_scalar( + tag="losses/cum_avg_loss_epoch", + scalar_value=self.total_loss_epoch / self.n_iter, + global_step=self.n_total_iter, + ) self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter) - self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter) - if self.alpha_mlm > 0.: - self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter) - if self.alpha_clm > 0.: - self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter) - if self.alpha_mse > 0.: - self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter) - if self.alpha_cos > 0.: - self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter) - self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter) - - self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter) - self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter) + self.tensorboard.add_scalar( + tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter + ) + if self.alpha_mlm > 0.0: + self.tensorboard.add_scalar( + tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter + ) + if self.alpha_clm > 0.0: + self.tensorboard.add_scalar( + tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter + ) + if self.alpha_mse > 0.0: + self.tensorboard.add_scalar( + tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter + ) + if self.alpha_cos > 0.0: + self.tensorboard.add_scalar( + tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter + ) + self.tensorboard.add_scalar( + tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter + ) + + self.tensorboard.add_scalar( + tag="global/memory_usage", + scalar_value=psutil.virtual_memory()._asdict()["used"] / 1_000_000, + global_step=self.n_total_iter, + ) + self.tensorboard.add_scalar( + tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter + ) def end_epoch(self): """ Finally arrived at the end of epoch (full pass on dataset). Do some tensorboard logging and checkpoint saving. """ - logger.info(f'{self.n_sequences_epoch} sequences have been trained during this epoch.') + logger.info(f"{self.n_sequences_epoch} sequences have been trained during this epoch.") if self.is_master: - self.save_checkpoint(checkpoint_name=f'model_epoch_{self.epoch}.pth') - self.tensorboard.add_scalar(tag='epoch/loss', scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.epoch) + self.save_checkpoint(checkpoint_name=f"model_epoch_{self.epoch}.pth") + self.tensorboard.add_scalar( + tag="epoch/loss", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch + ) self.epoch += 1 self.n_sequences_epoch = 0 self.n_iter = 0 self.total_loss_epoch = 0 - def save_checkpoint(self, - checkpoint_name: str = 'checkpoint.pth'): + def save_checkpoint(self, checkpoint_name: str = "checkpoint.pth"): """ Save the current state. Only by the master process. """ if not self.is_master: return - mdl_to_save = self.student.module if hasattr(self.student, 'module') else self.student + mdl_to_save = self.student.module if hasattr(self.student, "module") else self.student mdl_to_save.config.save_pretrained(self.dump_path) state_dict = mdl_to_save.state_dict() torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name)) diff --git a/examples/distillation/grouped_batch_sampler.py b/examples/distillation/grouped_batch_sampler.py index 46d943a3d4..1132fdb582 100644 --- a/examples/distillation/grouped_batch_sampler.py +++ b/examples/distillation/grouped_batch_sampler.py @@ -23,12 +23,14 @@ from torch.utils.data.sampler import BatchSampler, Sampler from utils import logger + def _quantize(x, bins): bins = copy.deepcopy(bins) bins = sorted(bins) quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) return quantized + def create_lengths_groups(lengths, k=0): bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10] groups = _quantize(lengths, bins) @@ -39,6 +41,7 @@ def create_lengths_groups(lengths, k=0): logger.info("Count of instances per bin: {}".format(counts)) return groups + class GroupedBatchSampler(BatchSampler): """ Wraps another sampler to yield a mini-batch of indices. @@ -53,11 +56,11 @@ class GroupedBatchSampler(BatchSampler): 0, i.e. they must be in the range [0, num_groups). batch_size (int): Size of mini-batch. """ + def __init__(self, sampler, group_ids, batch_size): if not isinstance(sampler, Sampler): raise ValueError( - "sampler should be an instance of " - "torch.utils.data.Sampler, but got sampler={}".format(sampler) + "sampler should be an instance of " "torch.utils.data.Sampler, but got sampler={}".format(sampler) ) self.sampler = sampler self.group_ids = group_ids @@ -73,7 +76,7 @@ class GroupedBatchSampler(BatchSampler): buffer_per_group[group_id].append(idx) samples_per_group[group_id].append(idx) if len(buffer_per_group[group_id]) == self.batch_size: - yield buffer_per_group[group_id] #TODO + yield buffer_per_group[group_id] # TODO num_batches += 1 del buffer_per_group[group_id] assert len(buffer_per_group[group_id]) < self.batch_size @@ -90,8 +93,8 @@ class GroupedBatchSampler(BatchSampler): for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]): batch_idx.extend(idxs) if len(batch_idx) >= self.batch_size: - yield batch_idx[:self.batch_size] - batch_idx = batch_idx[self.batch_size:] + yield batch_idx[: self.batch_size] + batch_idx = batch_idx[self.batch_size :] num_remaining -= 1 if len(batch_idx) > 0: yield batch_idx diff --git a/examples/distillation/lm_seqs_dataset.py b/examples/distillation/lm_seqs_dataset.py index 54e9742ce8..bb0d80f38b 100644 --- a/examples/distillation/lm_seqs_dataset.py +++ b/examples/distillation/lm_seqs_dataset.py @@ -21,6 +21,7 @@ from torch.utils.data import Dataset import numpy as np from utils import logger + class LmSeqsDataset(Dataset): """Custom Dataset wrapping language modeling sequences. @@ -32,9 +33,7 @@ class LmSeqsDataset(Dataset): data: `List[np.array[int]] """ - def __init__(self, - params, - data): + def __init__(self, params, data): self.params = params self.token_ids = np.array(data) @@ -57,7 +56,7 @@ class LmSeqsDataset(Dataset): Some sanity checks """ assert len(self.token_ids) == len(self.lengths) - assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths))) + assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths))) def remove_long_sequences(self): """ @@ -65,17 +64,17 @@ class LmSeqsDataset(Dataset): """ max_len = self.params.max_model_input_size indices = self.lengths > max_len - logger.info(f'Splitting {sum(indices)} too long sequences.') + logger.info(f"Splitting {sum(indices)} too long sequences.") def divide_chunks(l, n): - return [l[i:i + n] for i in range(0, len(l), n)] + return [l[i : i + n] for i in range(0, len(l), n)] new_tok_ids = [] new_lengths = [] if self.params.mlm: - cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token'] + cls_id, sep_id = self.params.special_tok_ids["cls_token"], self.params.special_tok_ids["sep_token"] else: - cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token'] + cls_id, sep_id = self.params.special_tok_ids["bos_token"], self.params.special_tok_ids["eos_token"] for seq_, len_ in zip(self.token_ids, self.lengths): assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_ @@ -84,7 +83,7 @@ class LmSeqsDataset(Dataset): new_lengths.append(len_) else: sub_seqs = [] - for sub_s in divide_chunks(seq_, max_len-2): + for sub_s in divide_chunks(seq_, max_len - 2): if sub_s[0] != cls_id: sub_s = np.insert(sub_s, 0, cls_id) if sub_s[-1] != sep_id: @@ -108,7 +107,7 @@ class LmSeqsDataset(Dataset): self.token_ids = self.token_ids[indices] self.lengths = self.lengths[indices] new_size = len(self) - logger.info(f'Remove {init_size - new_size} too short (<=11 tokens) sequences.') + logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.") def print_statistics(self): """ @@ -116,7 +115,7 @@ class LmSeqsDataset(Dataset): """ if not self.params.is_master: return - logger.info(f'{len(self)} sequences') + logger.info(f"{len(self)} sequences") # data_len = sum(self.lengths) # nb_unique_tokens = len(Counter(list(chain(*self.token_ids)))) # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)') @@ -125,8 +124,7 @@ class LmSeqsDataset(Dataset): # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids]) # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)') - def batch_sequences(self, - batch): + def batch_sequences(self, batch): """ Do the padding and transform into torch.tensor. """ @@ -139,13 +137,13 @@ class LmSeqsDataset(Dataset): # Pad token ids if self.params.mlm: - pad_idx = self.params.special_tok_ids['pad_token'] + pad_idx = self.params.special_tok_ids["pad_token"] else: - pad_idx = self.params.special_tok_ids['unk_token'] - tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids] + pad_idx = self.params.special_tok_ids["unk_token"] + tk_ = [list(t.astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids] assert len(tk_) == len(token_ids) assert all(len(t) == max_seq_len_ for t in tk_) - tk_t = torch.tensor(tk_) # (bs, max_seq_len_) + tk_t = torch.tensor(tk_) # (bs, max_seq_len_) lg_t = torch.tensor(lengths) # (bs) return tk_t, lg_t diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py index 70b65dc1b8..0d5a004eb3 100644 --- a/examples/distillation/run_squad_w_distillation.py +++ b/examples/distillation/run_squad_w_distillation.py @@ -25,8 +25,7 @@ import glob import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler import torch.nn.functional as F import torch.nn as nn @@ -38,19 +37,32 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForQuestionAnswering, BertTokenizer, - XLMConfig, XLMForQuestionAnswering, - XLMTokenizer, XLNetConfig, - XLNetForQuestionAnswering, - XLNetTokenizer, - DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForQuestionAnswering, + BertTokenizer, + XLMConfig, + XLMForQuestionAnswering, + XLMTokenizer, + XLNetConfig, + XLNetForQuestionAnswering, + XLNetTokenizer, + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup -from ..utils_squad import (read_squad_examples, convert_examples_to_features, - RawResult, write_predictions, - RawResultExtended, write_predictions_extended) +from ..utils_squad import ( + read_squad_examples, + convert_examples_to_features, + RawResult, + write_predictions, + RawResultExtended, + write_predictions_extended, +) # The follwing import is the official SQuAD evaluation script (2.0). # You can remove it from the dependencies if you are using this script outside of the library @@ -59,16 +71,18 @@ from ..utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ - for conf in (BertConfig, XLNetConfig, XLMConfig)), ()) +ALL_MODELS = sum( + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), () +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) + "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), + "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), + "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), } + def set_seed(args): random.seed(args.seed) np.random.seed(args.seed) @@ -76,9 +90,11 @@ def set_seed(args): if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) + def to_list(tensor): return tensor.detach().cpu().tolist() + def train(args, train_dataset, model, tokenizer, teacher=None): """ Train the model """ if args.local_rank in [-1, 0]: @@ -95,13 +111,18 @@ def train(args, train_dataset, model, tokenizer, teacher=None): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -115,17 +136,21 @@ def train(args, train_dataset, model, tokenizer, teacher=None): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -141,40 +166,47 @@ def train(args, train_dataset, model, tokenizer, teacher=None): if teacher is not None: teacher.eval() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'start_positions': batch[3], - 'end_positions': batch[4]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[5], - 'p_mask': batch[6]}) + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "start_positions": batch[3], + "end_positions": batch[4], + } + if args.model_type != "distilbert": + inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2] + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) outputs = model(**inputs) loss, start_logits_stu, end_logits_stu = outputs # Distillation loss if teacher is not None: - if 'token_type_ids' not in inputs: - inputs['token_type_ids'] = None if args.teacher_type == 'xlm' else batch[2] + if "token_type_ids" not in inputs: + inputs["token_type_ids"] = None if args.teacher_type == "xlm" else batch[2] with torch.no_grad(): - start_logits_tea, end_logits_tea = teacher(input_ids=inputs['input_ids'], - token_type_ids=inputs['token_type_ids'], - attention_mask=inputs['attention_mask']) + start_logits_tea, end_logits_tea = teacher( + input_ids=inputs["input_ids"], + token_type_ids=inputs["token_type_ids"], + attention_mask=inputs["attention_mask"], + ) assert start_logits_tea.size() == start_logits_stu.size() assert end_logits_tea.size() == end_logits_stu.size() - - loss_fct = nn.KLDivLoss(reduction='batchmean') - loss_start = loss_fct(F.log_softmax(start_logits_stu/args.temperature, dim=-1), - F.softmax(start_logits_tea/args.temperature, dim=-1)) * (args.temperature**2) - loss_end = loss_fct(F.log_softmax(end_logits_stu/args.temperature, dim=-1), - F.softmax(end_logits_tea/args.temperature, dim=-1)) * (args.temperature**2) - loss_ce = (loss_start + loss_end)/2. - loss = args.alpha_ce*loss_ce + args.alpha_squad*loss + loss_fct = nn.KLDivLoss(reduction="batchmean") + loss_start = loss_fct( + F.log_softmax(start_logits_stu / args.temperature, dim=-1), + F.softmax(start_logits_tea / args.temperature, dim=-1), + ) * (args.temperature ** 2) + loss_end = loss_fct( + F.log_softmax(end_logits_stu / args.temperature, dim=-1), + F.softmax(end_logits_tea / args.temperature, dim=-1), + ) * (args.temperature ** 2) + loss_ce = (loss_start + loss_end) / 2.0 + + loss = args.alpha_ce * loss_ce + args.alpha_squad * loss if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training + loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -195,22 +227,26 @@ def train(args, train_dataset, model, tokenizer, teacher=None): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -246,32 +282,31 @@ def evaluate(args, model, tokenizer, prefix=""): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1] - } - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2] # XLM don't use segment_ids example_indices = batch[3] - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[4], - 'p_mask': batch[5]}) + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - if args.model_type in ['xlnet', 'xlm']: + if args.model_type in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - result = RawResultExtended(unique_id = unique_id, - start_top_log_probs = to_list(outputs[0][i]), - start_top_index = to_list(outputs[1][i]), - end_top_log_probs = to_list(outputs[2][i]), - end_top_index = to_list(outputs[3][i]), - cls_logits = to_list(outputs[4][i])) + result = RawResultExtended( + unique_id=unique_id, + start_top_log_probs=to_list(outputs[0][i]), + start_top_index=to_list(outputs[1][i]), + end_top_log_probs=to_list(outputs[2][i]), + end_top_index=to_list(outputs[3][i]), + cls_logits=to_list(outputs[4][i]), + ) else: - result = RawResult(unique_id = unique_id, - start_logits = to_list(outputs[0][i]), - end_logits = to_list(outputs[1][i])) + result = RawResult( + unique_id=unique_id, start_logits=to_list(outputs[0][i]), end_logits=to_list(outputs[1][i]) + ) all_results.append(result) # Compute predictions @@ -282,23 +317,44 @@ def evaluate(args, model, tokenizer, prefix=""): else: output_null_log_odds_file = None - if args.model_type in ['xlnet', 'xlm']: + if args.model_type in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - write_predictions_extended(examples, features, all_results, args.n_best_size, - args.max_answer_length, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.predict_file, - model.config.start_n_top, model.config.end_n_top, - args.version_2_with_negative, tokenizer, args.verbose_logging) + write_predictions_extended( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + args.predict_file, + model.config.start_n_top, + model.config.end_n_top, + args.version_2_with_negative, + tokenizer, + args.verbose_logging, + ) else: - write_predictions(examples, features, all_results, args.n_best_size, - args.max_answer_length, args.do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold) + write_predictions( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + args.do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + args.verbose_logging, + args.version_2_with_negative, + args.null_score_diff_threshold, + ) # Evaluate with the official SQuAD script - evaluate_options = EVAL_OPTS(data_file=args.predict_file, - pred_file=output_prediction_file, - na_prob_file=output_null_log_odds_file) + evaluate_options = EVAL_OPTS( + data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file + ) results = evaluate_on_squad(evaluate_options) return results @@ -309,24 +365,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal # Load data features from cache or dataset file input_file = args.predict_file if evaluate else args.train_file - cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length))) + cached_features_file = os.path.join( + os.path.dirname(input_file), + "cached_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", input_file) - examples = read_squad_examples(input_file=input_file, - is_training=not evaluate, - version_2_with_negative=args.version_2_with_negative) - features = convert_examples_to_features(examples=examples, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=not evaluate) + examples = read_squad_examples( + input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative + ) + features = convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) @@ -342,14 +404,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) if evaluate: all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_example_index, all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_cls_index, all_p_mask + ) else: all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions, - all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, + all_input_mask, + all_segment_ids, + all_start_positions, + all_end_positions, + all_cls_index, + all_p_mask, + ) if output_examples: return dataset, examples, features @@ -360,121 +429,213 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--train_file", default=None, type=str, required=True, - help="SQuAD json for training. E.g., train-v1.1.json") - parser.add_argument("--predict_file", default=None, type=str, required=True, - help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model checkpoints and predictions will be written.") + parser.add_argument( + "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json" + ) + parser.add_argument( + "--predict_file", + default=None, + type=str, + required=True, + help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints and predictions will be written.", + ) # Distillation parameters (optional) - parser.add_argument('--teacher_type', default=None, type=str, - help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.") - parser.add_argument('--teacher_name_or_path', default=None, type=str, - help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.") - parser.add_argument('--alpha_ce', default=0.5, type=float, - help="Distillation loss linear weight. Only for distillation.") - parser.add_argument('--alpha_squad', default=0.5, type=float, - help="True SQuAD loss linear weight. Only for distillation.") - parser.add_argument('--temperature', default=2.0, type=float, - help="Distillation temperature. Only for distillation.") + parser.add_argument( + "--teacher_type", + default=None, + type=str, + help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.", + ) + parser.add_argument( + "--teacher_name_or_path", + default=None, + type=str, + help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.", + ) + parser.add_argument( + "--alpha_ce", default=0.5, type=float, help="Distillation loss linear weight. Only for distillation." + ) + parser.add_argument( + "--alpha_squad", default=0.5, type=float, help="True SQuAD loss linear weight. Only for distillation." + ) + parser.add_argument( + "--temperature", default=2.0, type=float, help="Distillation temperature. Only for distillation." + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) - parser.add_argument('--version_2_with_negative', action='store_true', - help='If true, the SQuAD examples contain some that do not have an answer.') - parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, - help="If null_score - best_non_null is greater than the threshold predict null.") + parser.add_argument( + "--version_2_with_negative", + action="store_true", + help="If true, the SQuAD examples contain some that do not have an answer.", + ) + parser.add_argument( + "--null_score_diff_threshold", + type=float, + default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.", + ) - parser.add_argument("--max_seq_length", default=384, type=int, - help="The maximum total input sequence length after WordPiece tokenization. Sequences " - "longer than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--doc_stride", default=128, type=int, - help="When splitting up a long document into chunks, how much stride to take between chunks.") - parser.add_argument("--max_query_length", default=64, type=int, - help="The maximum number of tokens for the question. Questions longer than this will " - "be truncated to this length.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--max_seq_length", + default=384, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.", + ) + parser.add_argument( + "--doc_stride", + default=128, + type=int, + help="When splitting up a long document into chunks, how much stride to take between chunks.", + ) + parser.add_argument( + "--max_query_length", + default=64, + type=int, + help="The maximum number of tokens for the question. Questions longer than this will " + "be truncated to this length.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - parser.add_argument("--n_best_size", default=20, type=int, - help="The total number of n-best predictions to generate in the nbest_predictions.json output file.") - parser.add_argument("--max_answer_length", default=30, type=int, - help="The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another.") - parser.add_argument("--verbose_logging", action='store_true', - help="If true, all of the warnings related to data processing will be printed. " - "A number of warnings are expected for a normal SQuAD evaluation.") + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + parser.add_argument( + "--n_best_size", + default=20, + type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json output file.", + ) + parser.add_argument( + "--max_answer_length", + default=30, + type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.", + ) + parser.add_argument( + "--verbose_logging", + action="store_true", + help="If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.", + ) - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument("--local_rank", type=int, default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -486,16 +647,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -506,27 +675,34 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.teacher_type is not None: assert args.teacher_name_or_path is not None - assert args.alpha_ce > 0. - assert args.alpha_ce + args.alpha_squad > 0. - assert args.teacher_type != 'distilbert', "We constraint teachers not to be of type DistilBERT." + assert args.alpha_ce > 0.0 + assert args.alpha_ce + args.alpha_squad > 0.0 + assert args.teacher_type != "distilbert", "We constraint teachers not to be of type DistilBERT." teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type] - teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - teacher = teacher_model_class.from_pretrained(args.teacher_name_or_path, - config=teacher_config, - cache_dir=args.cache_dir if args.cache_dir else None) + teacher_config = teacher_config_class.from_pretrained( + args.teacher_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None + ) + teacher = teacher_model_class.from_pretrained( + args.teacher_name_or_path, config=teacher_config, cache_dir=args.cache_dir if args.cache_dir else None + ) teacher.to(args.device) else: teacher = None @@ -544,7 +720,6 @@ def main(): global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Save the trained model and the tokenizer if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -554,41 +729,44 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir, cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.output_dir, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) + tokenizer = tokenizer_class.from_pretrained( + args.output_dir, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None + ) model.to(args.device) - # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory results = {} if args.do_eval and args.local_rank in [-1, 0]: checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: # Reload the model - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint, cache_dir=args.cache_dir if args.cache_dir else None) model.to(args.device) # Evaluate result = evaluate(args, model, tokenizer, prefix=global_step) - result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items()) + result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items()) results.update(result) logger.info("Results: {}".format(results)) diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py index 681cc2de34..40bde7d152 100644 --- a/examples/distillation/scripts/binarized_data.py +++ b/examples/distillation/scripts/binarized_data.py @@ -23,68 +23,65 @@ import numpy as np from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer import logging -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) logger = logging.getLogger(__name__) + def main(): - parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).") - parser.add_argument('--file_path', type=str, default='data/dump.txt', - help='The path to the data.') - parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2']) - parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased', - help="The tokenizer to use.") - parser.add_argument('--dump_file', type=str, default='data/dump', - help='The dump file prefix.') + parser = argparse.ArgumentParser( + description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)." + ) + parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.") + parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"]) + parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.") + parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.") args = parser.parse_args() - - logger.info(f'Loading Tokenizer ({args.tokenizer_name})') - if args.tokenizer_type == 'bert': + logger.info(f"Loading Tokenizer ({args.tokenizer_name})") + if args.tokenizer_type == "bert": tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name) - bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]` - sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]` - elif args.tokenizer_type == 'roberta': + bos = tokenizer.special_tokens_map["cls_token"] # `[CLS]` + sep = tokenizer.special_tokens_map["sep_token"] # `[SEP]` + elif args.tokenizer_type == "roberta": tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name) - bos = tokenizer.special_tokens_map['cls_token'] # `` - sep = tokenizer.special_tokens_map['sep_token'] # `` - elif args.tokenizer_type == 'gpt2': + bos = tokenizer.special_tokens_map["cls_token"] # `` + sep = tokenizer.special_tokens_map["sep_token"] # `` + elif args.tokenizer_type == "gpt2": tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name) - bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>` - sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>` + bos = tokenizer.special_tokens_map["bos_token"] # `<|endoftext|>` + sep = tokenizer.special_tokens_map["eos_token"] # `<|endoftext|>` - logger.info(f'Loading text from {args.file_path}') - with open(args.file_path, 'r', encoding='utf8') as fp: + logger.info(f"Loading text from {args.file_path}") + with open(args.file_path, "r", encoding="utf8") as fp: data = fp.readlines() - - logger.info(f'Start encoding') - logger.info(f'{len(data)} examples to process.') + logger.info(f"Start encoding") + logger.info(f"{len(data)} examples to process.") rslt = [] iter = 0 interval = 10000 start = time.time() for text in data: - text = f'{bos} {text.strip()} {sep}' + text = f"{bos} {text.strip()} {sep}" token_ids = tokenizer.encode(text, add_special_tokens=False) rslt.append(token_ids) iter += 1 if iter % interval == 0: end = time.time() - logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl') + logger.info(f"{iter} examples processed. - {(end-start)/interval:.2f}s/expl") start = time.time() - logger.info('Finished binarization') - logger.info(f'{len(data)} examples processed.') + logger.info("Finished binarization") + logger.info(f"{len(data)} examples processed.") - - dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle' + dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle" rslt_ = [np.uint16(d) for d in rslt] random.shuffle(rslt_) - logger.info(f'Dump to {dp_file}') - with open(dp_file, 'wb') as handle: + logger.info(f"Dump to {dp_file}") + with open(dp_file, "wb") as handle: pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/examples/distillation/scripts/extract.py b/examples/distillation/scripts/extract.py index 5ae1607f3f..9610f8f17a 100644 --- a/examples/distillation/scripts/extract.py +++ b/examples/distillation/scripts/extract.py @@ -20,70 +20,80 @@ from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel import torch import argparse -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation") +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation" + ) parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"]) - parser.add_argument("--model_name", default='roberta-large', type=str) - parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str) - parser.add_argument("--vocab_transform", action='store_true') + parser.add_argument("--model_name", default="roberta-large", type=str) + parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_roberta_048131723.pth", type=str) + parser.add_argument("--vocab_transform", action="store_true") args = parser.parse_args() - - if args.model_type == 'roberta': + if args.model_type == "roberta": model = RobertaForMaskedLM.from_pretrained(args.model_name) - prefix = 'roberta' - elif args.model_type == 'gpt2': + prefix = "roberta" + elif args.model_type == "gpt2": model = GPT2LMHeadModel.from_pretrained(args.model_name) - prefix = 'transformer' + prefix = "transformer" state_dict = model.state_dict() compressed_sd = {} ### Embeddings ### - if args.model_type == 'gpt2': - for param_name in ['wte.weight', 'wpe.weight']: - compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}'] + if args.model_type == "gpt2": + for param_name in ["wte.weight", "wpe.weight"]: + compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"] else: - for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']: - param_name = f'{prefix}.embeddings.{w}.weight' + for w in ["word_embeddings", "position_embeddings", "token_type_embeddings"]: + param_name = f"{prefix}.embeddings.{w}.weight" compressed_sd[param_name] = state_dict[param_name] - for w in ['weight', 'bias']: - param_name = f'{prefix}.embeddings.LayerNorm.{w}' + for w in ["weight", "bias"]: + param_name = f"{prefix}.embeddings.LayerNorm.{w}" compressed_sd[param_name] = state_dict[param_name] ### Transformer Blocks ### std_idx = 0 for teacher_idx in [0, 2, 4, 7, 9, 11]: - if args.model_type == 'gpt2': - for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']: - for w in ['weight', 'bias']: - compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \ - state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}'] - compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias'] + if args.model_type == "gpt2": + for layer in ["ln_1", "attn.c_attn", "attn.c_proj", "ln_2", "mlp.c_fc", "mlp.c_proj"]: + for w in ["weight", "bias"]: + compressed_sd[f"{prefix}.h.{std_idx}.{layer}.{w}"] = state_dict[ + f"{prefix}.h.{teacher_idx}.{layer}.{w}" + ] + compressed_sd[f"{prefix}.h.{std_idx}.attn.bias"] = state_dict[f"{prefix}.h.{teacher_idx}.attn.bias"] else: - for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value', - 'attention.output.dense', 'attention.output.LayerNorm', - 'intermediate.dense', 'output.dense', 'output.LayerNorm']: - for w in ['weight', 'bias']: - compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}'] + for layer in [ + "attention.self.query", + "attention.self.key", + "attention.self.value", + "attention.output.dense", + "attention.output.LayerNorm", + "intermediate.dense", + "output.dense", + "output.LayerNorm", + ]: + for w in ["weight", "bias"]: + compressed_sd[f"{prefix}.encoder.layer.{std_idx}.{layer}.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}" + ] std_idx += 1 ### Language Modeling Head ###s - if args.model_type == 'roberta': - for layer in ['lm_head.decoder.weight', 'lm_head.bias']: - compressed_sd[f'{layer}'] = state_dict[f'{layer}'] + if args.model_type == "roberta": + for layer in ["lm_head.decoder.weight", "lm_head.bias"]: + compressed_sd[f"{layer}"] = state_dict[f"{layer}"] if args.vocab_transform: - for w in ['weight', 'bias']: - compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}'] - compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}'] - elif args.model_type == 'gpt2': - for w in ['weight', 'bias']: - compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}'] - compressed_sd[f'lm_head.weight'] = state_dict[f'lm_head.weight'] + for w in ["weight", "bias"]: + compressed_sd[f"lm_head.dense.{w}"] = state_dict[f"lm_head.dense.{w}"] + compressed_sd[f"lm_head.layer_norm.{w}"] = state_dict[f"lm_head.layer_norm.{w}"] + elif args.model_type == "gpt2": + for w in ["weight", "bias"]: + compressed_sd[f"{prefix}.ln_f.{w}"] = state_dict[f"{prefix}.ln_f.{w}"] + compressed_sd[f"lm_head.weight"] = state_dict[f"lm_head.weight"] - print(f'N layers selected for distillation: {std_idx}') - print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}') + print(f"N layers selected for distillation: {std_idx}") + print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}") - print(f'Save transfered checkpoint to {args.dump_checkpoint}.') + print(f"Save transfered checkpoint to {args.dump_checkpoint}.") torch.save(compressed_sd, args.dump_checkpoint) diff --git a/examples/distillation/scripts/extract_distilbert.py b/examples/distillation/scripts/extract_distilbert.py index fdb0662ca7..8e58db5552 100644 --- a/examples/distillation/scripts/extract_distilbert.py +++ b/examples/distillation/scripts/extract_distilbert.py @@ -20,63 +20,70 @@ from transformers import BertForMaskedLM, RobertaForMaskedLM import torch import argparse -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation") +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation" + ) parser.add_argument("--model_type", default="bert", choices=["bert"]) - parser.add_argument("--model_name", default='bert-base-uncased', type=str) - parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str) - parser.add_argument("--vocab_transform", action='store_true') + parser.add_argument("--model_name", default="bert-base-uncased", type=str) + parser.add_argument("--dump_checkpoint", default="serialization_dir/tf_bert-base-uncased_0247911.pth", type=str) + parser.add_argument("--vocab_transform", action="store_true") args = parser.parse_args() - - if args.model_type == 'bert': + if args.model_type == "bert": model = BertForMaskedLM.from_pretrained(args.model_name) - prefix = 'bert' + prefix = "bert" else: raise ValueError(f'args.model_type should be "bert".') state_dict = model.state_dict() compressed_sd = {} - for w in ['word_embeddings', 'position_embeddings']: - compressed_sd[f'distilbert.embeddings.{w}.weight'] = \ - state_dict[f'{prefix}.embeddings.{w}.weight'] - for w in ['weight', 'bias']: - compressed_sd[f'distilbert.embeddings.LayerNorm.{w}'] = \ - state_dict[f'{prefix}.embeddings.LayerNorm.{w}'] + for w in ["word_embeddings", "position_embeddings"]: + compressed_sd[f"distilbert.embeddings.{w}.weight"] = state_dict[f"{prefix}.embeddings.{w}.weight"] + for w in ["weight", "bias"]: + compressed_sd[f"distilbert.embeddings.LayerNorm.{w}"] = state_dict[f"{prefix}.embeddings.LayerNorm.{w}"] std_idx = 0 for teacher_idx in [0, 2, 4, 7, 9, 11]: - for w in ['weight', 'bias']: - compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}'] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}'] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}'] + for w in ["weight", "bias"]: + compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.q_lin.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.attention.self.query.{w}" + ] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.k_lin.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.attention.self.key.{w}" + ] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.v_lin.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.attention.self.value.{w}" + ] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}'] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}'] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.attention.out_lin.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.attention.output.dense.{w}" + ] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.sa_layer_norm.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.attention.output.LayerNorm.{w}" + ] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}'] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}'] - compressed_sd[f'distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}'] = \ - state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}'] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin1.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.intermediate.dense.{w}" + ] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.ffn.lin2.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.output.dense.{w}" + ] + compressed_sd[f"distilbert.transformer.layer.{std_idx}.output_layer_norm.{w}"] = state_dict[ + f"{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}" + ] std_idx += 1 - compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight'] - compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias'] + compressed_sd[f"vocab_projector.weight"] = state_dict[f"cls.predictions.decoder.weight"] + compressed_sd[f"vocab_projector.bias"] = state_dict[f"cls.predictions.bias"] if args.vocab_transform: - for w in ['weight', 'bias']: - compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}'] - compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}'] + for w in ["weight", "bias"]: + compressed_sd[f"vocab_transform.{w}"] = state_dict[f"cls.predictions.transform.dense.{w}"] + compressed_sd[f"vocab_layer_norm.{w}"] = state_dict[f"cls.predictions.transform.LayerNorm.{w}"] - print(f'N layers selected for distillation: {std_idx}') - print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}') + print(f"N layers selected for distillation: {std_idx}") + print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}") - print(f'Save transfered checkpoint to {args.dump_checkpoint}.') + print(f"Save transfered checkpoint to {args.dump_checkpoint}.") torch.save(compressed_sd, args.dump_checkpoint) diff --git a/examples/distillation/scripts/token_counts.py b/examples/distillation/scripts/token_counts.py index d9de17da4e..623caad4b1 100644 --- a/examples/distillation/scripts/token_counts.py +++ b/examples/distillation/scripts/token_counts.py @@ -20,32 +20,36 @@ import argparse import pickle import logging -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO +) logger = logging.getLogger(__name__) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)") - parser.add_argument("--data_file", type=str, default="data/dump.bert-base-uncased.pickle", - help="The binarized dataset.") - parser.add_argument("--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", - help="The dump file.") +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Token Counts for smoothing the masking probabilities in MLM (cf XLM/word2vec)" + ) + parser.add_argument( + "--data_file", type=str, default="data/dump.bert-base-uncased.pickle", help="The binarized dataset." + ) + parser.add_argument( + "--token_counts_dump", type=str, default="data/token_counts.bert-base-uncased.pickle", help="The dump file." + ) parser.add_argument("--vocab_size", default=30522, type=int) args = parser.parse_args() - logger.info(f'Loading data from {args.data_file}') - with open(args.data_file, 'rb') as fp: + logger.info(f"Loading data from {args.data_file}") + with open(args.data_file, "rb") as fp: data = pickle.load(fp) - logger.info('Counting occurences for MLM.') + logger.info("Counting occurences for MLM.") counter = Counter() for tk_ids in data: counter.update(tk_ids) - counts = [0]*args.vocab_size + counts = [0] * args.vocab_size for k, v in counter.items(): counts[k] = v - logger.info(f'Dump to {args.token_counts_dump}') - with open(args.token_counts_dump, 'wb') as handle: + logger.info(f"Dump to {args.token_counts_dump}") + with open(args.token_counts_dump, "wb") as handle: pickle.dump(counts, handle, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/examples/distillation/train.py b/examples/distillation/train.py index 311f0580ff..37c49ae7b2 100644 --- a/examples/distillation/train.py +++ b/examples/distillation/train.py @@ -35,166 +35,200 @@ from lm_seqs_dataset import LmSeqsDataset MODEL_CLASSES = { - 'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer), - 'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), - 'bert': (BertConfig, BertForMaskedLM, BertTokenizer), - 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer) + "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer), + "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), + "bert": (BertConfig, BertForMaskedLM, BertTokenizer), + "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), } + def sanity_checks(args): """ A bunch of args sanity checks to perform even starting... """ - assert (args.mlm and args.alpha_mlm > 0.) or (not args.mlm and args.alpha_mlm == 0.) - assert (args.alpha_mlm > 0. and args.alpha_clm == 0.) or (args.alpha_mlm == 0. and args.alpha_clm > 0.) + assert (args.mlm and args.alpha_mlm > 0.0) or (not args.mlm and args.alpha_mlm == 0.0) + assert (args.alpha_mlm > 0.0 and args.alpha_clm == 0.0) or (args.alpha_mlm == 0.0 and args.alpha_clm > 0.0) if args.mlm: assert os.path.isfile(args.token_counts) - assert (args.student_type in ['roberta', 'distilbert']) and (args.teacher_type in ['roberta', 'bert']) + assert (args.student_type in ["roberta", "distilbert"]) and (args.teacher_type in ["roberta", "bert"]) else: - assert (args.student_type in ['gpt2']) and (args.teacher_type in ['gpt2']) + assert (args.student_type in ["gpt2"]) and (args.teacher_type in ["gpt2"]) - assert args.teacher_type == args.student_type or (args.student_type=='distilbert' and args.teacher_type=='bert') + assert args.teacher_type == args.student_type or ( + args.student_type == "distilbert" and args.teacher_type == "bert" + ) assert os.path.isfile(args.student_config) if args.student_pretrained_weights is not None: assert os.path.isfile(args.student_pretrained_weights) - if args.freeze_token_type_embds: assert args.student_type in ['roberta'] + if args.freeze_token_type_embds: + assert args.student_type in ["roberta"] + + assert args.alpha_ce >= 0.0 + assert args.alpha_mlm >= 0.0 + assert args.alpha_clm >= 0.0 + assert args.alpha_mse >= 0.0 + assert args.alpha_cos >= 0.0 + assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.0 - assert args.alpha_ce >= 0. - assert args.alpha_mlm >= 0. - assert args.alpha_clm >= 0. - assert args.alpha_mse >= 0. - assert args.alpha_cos >= 0. - assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0. def freeze_pos_embeddings(student, args): - if args.student_type == 'roberta': + if args.student_type == "roberta": student.roberta.embeddings.position_embeddings.weight.requires_grad = False - elif args.student_type == 'gpt2': + elif args.student_type == "gpt2": student.transformer.wpe.weight.requires_grad = False + def freeze_token_type_embeddings(student, args): - if args.student_type == 'roberta': + if args.student_type == "roberta": student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False + def main(): parser = argparse.ArgumentParser(description="Training") - parser.add_argument("--force", action='store_true', - help="Overwrite dump_path if it already exists.") + parser.add_argument("--force", action="store_true", help="Overwrite dump_path if it already exists.") - parser.add_argument("--dump_path", type=str, required=True, - help="The output directory (log, checkpoints, parameters, etc.)") - parser.add_argument("--data_file", type=str, required=True, - help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.") + parser.add_argument( + "--dump_path", type=str, required=True, help="The output directory (log, checkpoints, parameters, etc.)" + ) + parser.add_argument( + "--data_file", + type=str, + required=True, + help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.", + ) - parser.add_argument("--student_type", type=str, choices=["distilbert", "roberta", "gpt2"], required=True, - help="The student type (DistilBERT, RoBERTa).") - parser.add_argument("--student_config", type=str, required=True, - help="Path to the student configuration.") - parser.add_argument("--student_pretrained_weights", default=None, type=str, - help="Load student initialization checkpoint.") + parser.add_argument( + "--student_type", + type=str, + choices=["distilbert", "roberta", "gpt2"], + required=True, + help="The student type (DistilBERT, RoBERTa).", + ) + parser.add_argument("--student_config", type=str, required=True, help="Path to the student configuration.") + parser.add_argument( + "--student_pretrained_weights", default=None, type=str, help="Load student initialization checkpoint." + ) - parser.add_argument("--teacher_type", choices=["bert", "roberta", "gpt2"], required=True, - help="Teacher type (BERT, RoBERTa).") - parser.add_argument("--teacher_name", type=str, required=True, - help="The teacher model.") + parser.add_argument( + "--teacher_type", choices=["bert", "roberta", "gpt2"], required=True, help="Teacher type (BERT, RoBERTa)." + ) + parser.add_argument("--teacher_name", type=str, required=True, help="The teacher model.") - parser.add_argument("--temperature", default=2., type=float, - help="Temperature for the softmax temperature.") - parser.add_argument("--alpha_ce", default=0.5, type=float, - help="Linear weight for the distillation loss. Must be >=0.") - parser.add_argument("--alpha_mlm", default=0.0, type=float, - help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.") - parser.add_argument("--alpha_clm", default=0.5, type=float, - help="Linear weight for the CLM loss. Must be >=0.") - parser.add_argument("--alpha_mse", default=0.0, type=float, - help="Linear weight of the MSE loss. Must be >=0.") - parser.add_argument("--alpha_cos", default=0.0, type=float, - help="Linear weight of the cosine embedding loss. Must be >=0.") + parser.add_argument("--temperature", default=2.0, type=float, help="Temperature for the softmax temperature.") + parser.add_argument( + "--alpha_ce", default=0.5, type=float, help="Linear weight for the distillation loss. Must be >=0." + ) + parser.add_argument( + "--alpha_mlm", + default=0.0, + type=float, + help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.", + ) + parser.add_argument("--alpha_clm", default=0.5, type=float, help="Linear weight for the CLM loss. Must be >=0.") + parser.add_argument("--alpha_mse", default=0.0, type=float, help="Linear weight of the MSE loss. Must be >=0.") + parser.add_argument( + "--alpha_cos", default=0.0, type=float, help="Linear weight of the cosine embedding loss. Must be >=0." + ) - parser.add_argument("--mlm", action="store_true", - help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM.") - parser.add_argument("--mlm_mask_prop", default=0.15, type=float, - help="Proportion of tokens for which we need to make a prediction.") - parser.add_argument("--word_mask", default=0.8, type=float, - help="Proportion of tokens to mask out.") - parser.add_argument("--word_keep", default=0.1, type=float, - help="Proportion of tokens to keep.") - parser.add_argument("--word_rand", default=0.1, type=float, - help="Proportion of tokens to randomly replace.") - parser.add_argument("--mlm_smoothing", default=0.7, type=float, - help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).") - parser.add_argument("--token_counts", type=str, - help="The token counts in the data_file for MLM.") + parser.add_argument( + "--mlm", action="store_true", help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM." + ) + parser.add_argument( + "--mlm_mask_prop", + default=0.15, + type=float, + help="Proportion of tokens for which we need to make a prediction.", + ) + parser.add_argument("--word_mask", default=0.8, type=float, help="Proportion of tokens to mask out.") + parser.add_argument("--word_keep", default=0.1, type=float, help="Proportion of tokens to keep.") + parser.add_argument("--word_rand", default=0.1, type=float, help="Proportion of tokens to randomly replace.") + parser.add_argument( + "--mlm_smoothing", + default=0.7, + type=float, + help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).", + ) + parser.add_argument("--token_counts", type=str, help="The token counts in the data_file for MLM.") - parser.add_argument("--restrict_ce_to_mask", action='store_true', - help="If true, compute the distilation loss only the [MLM] prediction distribution.") - parser.add_argument("--freeze_pos_embs", action="store_true", - help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.") - parser.add_argument("--freeze_token_type_embds", action="store_true", - help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.") + parser.add_argument( + "--restrict_ce_to_mask", + action="store_true", + help="If true, compute the distilation loss only the [MLM] prediction distribution.", + ) + parser.add_argument( + "--freeze_pos_embs", + action="store_true", + help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.", + ) + parser.add_argument( + "--freeze_token_type_embds", + action="store_true", + help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.", + ) - parser.add_argument("--n_epoch", type=int, default=3, - help="Number of pass on the whole dataset.") - parser.add_argument("--batch_size", type=int, default=5, - help="Batch size (for each process).") - parser.add_argument("--group_by_size", action='store_false', - help="If true, group sequences that have similar length into the same batch. Default is true.") + parser.add_argument("--n_epoch", type=int, default=3, help="Number of pass on the whole dataset.") + parser.add_argument("--batch_size", type=int, default=5, help="Batch size (for each process).") + parser.add_argument( + "--group_by_size", + action="store_false", + help="If true, group sequences that have similar length into the same batch. Default is true.", + ) - parser.add_argument("--gradient_accumulation_steps", type=int, default=50, - help="Gradient accumulation for larger training batches.") - parser.add_argument("--warmup_prop", default=0.05, type=float, - help="Linear warmup proportion.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--learning_rate", default=5e-4, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--adam_epsilon", default=1e-6, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=5.0, type=float, - help="Max gradient norm.") - parser.add_argument("--initializer_range", default=0.02, type=float, - help="Random initialization range.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=50, + help="Gradient accumulation for larger training batches.", + ) + parser.add_argument("--warmup_prop", default=0.05, type=float, help="Linear warmup proportion.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--learning_rate", default=5e-4, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=5.0, type=float, help="Max gradient norm.") + parser.add_argument("--initializer_range", default=0.02, type=float, help="Random initialization range.") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--n_gpu", type=int, default=1, - help="Number of GPUs in the node.") - parser.add_argument("--local_rank", type=int, default=-1, - help="Distributed training - Local rank") - parser.add_argument("--seed", type=int, default=56, - help="Random seed") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--n_gpu", type=int, default=1, help="Number of GPUs in the node.") + parser.add_argument("--local_rank", type=int, default=-1, help="Distributed training - Local rank") + parser.add_argument("--seed", type=int, default=56, help="Random seed") - parser.add_argument("--log_interval", type=int, default=500, - help="Tensorboard logging interval.") - parser.add_argument("--checkpoint_interval", type=int, default=4000, - help="Checkpoint interval.") + parser.add_argument("--log_interval", type=int, default=500, help="Tensorboard logging interval.") + parser.add_argument("--checkpoint_interval", type=int, default=4000, help="Checkpoint interval.") args = parser.parse_args() sanity_checks(args) - ## ARGS ## init_gpu_params(args) set_seed(args) if args.is_master: if os.path.exists(args.dump_path): if not args.force: - raise ValueError(f'Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it' - 'Use `--force` if you want to overwrite it') + raise ValueError( + f"Serialization dir {args.dump_path} already exists, but you have not precised wheter to overwrite it" + "Use `--force` if you want to overwrite it" + ) else: shutil.rmtree(args.dump_path) if not os.path.exists(args.dump_path): os.makedirs(args.dump_path) - logger.info(f'Experiment will be dumped and logged in {args.dump_path}') - + logger.info(f"Experiment will be dumped and logged in {args.dump_path}") ### SAVE PARAMS ### - logger.info(f'Param: {args}') - with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f: + logger.info(f"Param: {args}") + with open(os.path.join(args.dump_path, "parameters.json"), "w") as f: json.dump(vars(args), f, indent=4) git_log(args.dump_path) @@ -207,58 +241,50 @@ def main(): for tok_name, tok_symbol in tokenizer.special_tokens_map.items(): idx = tokenizer.all_special_tokens.index(tok_symbol) special_tok_ids[tok_name] = tokenizer.all_special_ids[idx] - logger.info(f'Special tokens {special_tok_ids}') + logger.info(f"Special tokens {special_tok_ids}") args.special_tok_ids = special_tok_ids args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name] - ## DATA LOADER ## - logger.info(f'Loading data from {args.data_file}') - with open(args.data_file, 'rb') as fp: + logger.info(f"Loading data from {args.data_file}") + with open(args.data_file, "rb") as fp: data = pickle.load(fp) - if args.mlm: - logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)') - with open(args.token_counts, 'rb') as fp: + logger.info(f"Loading token counts from {args.token_counts} (already pre-computed)") + with open(args.token_counts, "rb") as fp: counts = pickle.load(fp) - + token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing for idx in special_tok_ids.values(): - token_probs[idx] = 0. # do not predict special tokens + token_probs[idx] = 0.0 # do not predict special tokens token_probs = torch.from_numpy(token_probs) else: token_probs = None - train_lm_seq_dataset = LmSeqsDataset(params=args, data=data) - logger.info(f'Data loader created.') - + logger.info(f"Data loader created.") ## STUDENT ## - logger.info(f'Loading student config from {args.student_config}') + logger.info(f"Loading student config from {args.student_config}") stu_architecture_config = student_config_class.from_pretrained(args.student_config) stu_architecture_config.output_hidden_states = True if args.student_pretrained_weights is not None: - logger.info(f'Loading pretrained weights from {args.student_pretrained_weights}') - student = student_model_class.from_pretrained(args.student_pretrained_weights, - config=stu_architecture_config) + logger.info(f"Loading pretrained weights from {args.student_pretrained_weights}") + student = student_model_class.from_pretrained(args.student_pretrained_weights, config=stu_architecture_config) else: student = student_model_class(stu_architecture_config) - if args.n_gpu > 0: - student.to(f'cuda:{args.local_rank}') - logger.info(f'Student loaded.') - + student.to(f"cuda:{args.local_rank}") + logger.info(f"Student loaded.") ## TEACHER ## teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True) if args.n_gpu > 0: - teacher.to(f'cuda:{args.local_rank}') - logger.info(f'Teacher loaded from {args.teacher_name}.') - + teacher.to(f"cuda:{args.local_rank}") + logger.info(f"Teacher loaded from {args.teacher_name}.") ## FREEZING ## if args.freeze_pos_embs: @@ -266,7 +292,6 @@ def main(): if args.freeze_token_type_embds: freeze_token_type_embeddings(student, args) - ## SANITY CHECKS ## assert student.config.vocab_size == teacher.config.vocab_size assert student.config.hidden_size == teacher.config.hidden_size @@ -274,14 +299,11 @@ def main(): if args.mlm: assert token_probs.size(0) == stu_architecture_config.vocab_size - ## DISTILLER ## torch.cuda.empty_cache() - distiller = Distiller(params=args, - dataset=train_lm_seq_dataset, - token_probs=token_probs, - student=student, - teacher=teacher) + distiller = Distiller( + params=args, dataset=train_lm_seq_dataset, token_probs=token_probs, student=student, teacher=teacher + ) distiller.train() logger.info("Let's go get some drinks.") diff --git a/examples/distillation/utils.py b/examples/distillation/utils.py index 3d62504710..f9d7412cb5 100644 --- a/examples/distillation/utils.py +++ b/examples/distillation/utils.py @@ -23,9 +23,12 @@ import torch import numpy as np import logging -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - PID: %(process)d - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) logger = logging.getLogger(__name__) @@ -35,12 +38,12 @@ def git_log(folder_path: str): """ repo = git.Repo(search_parent_directories=True) repo_infos = { - 'repo_id': str(repo), - 'repo_sha': str(repo.head.object.hexsha), - 'repo_branch': str(repo.active_branch) + "repo_id": str(repo), + "repo_sha": str(repo.head.object.hexsha), + "repo_branch": str(repo.active_branch), } - with open(os.path.join(folder_path, 'git_log.json'), 'w') as f: + with open(os.path.join(folder_path, "git_log.json"), "w") as f: json.dump(repo_infos, f, indent=4) @@ -57,21 +60,21 @@ def init_gpu_params(params): assert torch.cuda.is_available() - logger.info('Initializing GPUs') + logger.info("Initializing GPUs") if params.n_gpu > 1: assert params.local_rank != -1 - params.world_size = int(os.environ['WORLD_SIZE']) - params.n_gpu_per_node = int(os.environ['N_GPU_NODE']) - params.global_rank = int(os.environ['RANK']) + params.world_size = int(os.environ["WORLD_SIZE"]) + params.n_gpu_per_node = int(os.environ["N_GPU_NODE"]) + params.global_rank = int(os.environ["RANK"]) # number of nodes / node ID params.n_nodes = params.world_size // params.n_gpu_per_node params.node_id = params.global_rank // params.n_gpu_per_node params.multi_gpu = True - assert params.n_nodes == int(os.environ['N_NODES']) - assert params.node_id == int(os.environ['NODE_RANK']) + assert params.n_nodes == int(os.environ["N_NODES"]) + assert params.node_id == int(os.environ["NODE_RANK"]) # local job (single GPU) else: @@ -114,8 +117,7 @@ def init_gpu_params(params): if params.multi_gpu: logger.info("Initializing PyTorch distributed") torch.distributed.init_process_group( - init_method='env://', - backend='nccl', + init_method="env://", backend="nccl", ) diff --git a/examples/mm-imdb/run_mmimdb.py b/examples/mm-imdb/run_mmimdb.py index f4a44bf62a..c92dbd3d36 100644 --- a/examples/mm-imdb/run_mmimdb.py +++ b/examples/mm-imdb/run_mmimdb.py @@ -40,29 +40,49 @@ from tqdm import tqdm, trange from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_mmimdb_labels, get_image_transforms -from transformers import (WEIGHTS_NAME, - BertConfig, BertModel, BertTokenizer, - RobertaConfig, RobertaModel, RobertaTokenizer, - XLMConfig, XLMModel, XLMTokenizer, - XLNetConfig, XLNetModel, XLNetTokenizer, - DistilBertConfig, DistilBertModel, DistilBertTokenizer, - AlbertConfig, AlbertModel, AlbertTokenizer, - MMBTForClassification, MMBTConfig) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertModel, + BertTokenizer, + RobertaConfig, + RobertaModel, + RobertaTokenizer, + XLMConfig, + XLMModel, + XLMTokenizer, + XLNetConfig, + XLNetModel, + XLNetTokenizer, + DistilBertConfig, + DistilBertModel, + DistilBertTokenizer, + AlbertConfig, + AlbertModel, + AlbertTokenizer, + MMBTForClassification, + MMBTConfig, +) from transformers import AdamW, get_linear_schedule_with_warmup logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, - RobertaConfig, DistilBertConfig)), ()) +ALL_MODELS = sum( + ( + tuple(conf.pretrained_config_archive_map.keys()) + for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig) + ), + (), +) MODEL_CLASSES = { - 'bert': (BertConfig, BertModel, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetModel, XLNetTokenizer), - 'xlm': (XLMConfig, XLMModel, XLMTokenizer), - 'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer), - 'distilbert': (DistilBertConfig, DistilBertModel, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertModel, AlbertTokenizer) + "bert": (BertConfig, BertModel, BertTokenizer), + "xlnet": (XLNetConfig, XLNetModel, XLNetTokenizer), + "xlm": (XLMConfig, XLMModel, XLMTokenizer), + "roberta": (RobertaConfig, RobertaModel, RobertaTokenizer), + "distilbert": (DistilBertConfig, DistilBertModel, DistilBertTokenizer), + "albert": (AlbertConfig, AlbertModel, AlbertTokenizer), } @@ -81,10 +101,13 @@ def train(args, train_dataset, model, tokenizer, criterion): args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader(train_dataset, sampler=train_sampler, - batch_size=args.train_batch_size, - collate_fn=collate_fn, - num_workers=args.num_workers) + train_dataloader = DataLoader( + train_dataset, + sampler=train_sampler, + batch_size=args.train_batch_size, + collate_fn=collate_fn, + num_workers=args.num_workers, + ) if args.max_steps > 0: t_total = args.max_steps @@ -93,14 +116,19 @@ def train(args, train_dataset, model, tokenizer, criterion): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -114,17 +142,21 @@ def train(args, train_dataset, model, tokenizer, criterion): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -140,17 +172,19 @@ def train(args, train_dataset, model, tokenizer, criterion): model.train() batch = tuple(t.to(args.device) for t in batch) labels = batch[5] - inputs = {'input_ids': batch[0], - 'input_modal': batch[2], - 'attention_mask': batch[1], - 'modal_start_tokens': batch[3], - 'modal_end_tokens': batch[4]} + inputs = { + "input_ids": batch[0], + "input_modal": batch[2], + "attention_mask": batch[1], + "modal_start_tokens": batch[3], + "modal_end_tokens": batch[4], + } outputs = model(**inputs) logits = outputs[0] # model outputs are always tuple in transformers (see doc) loss = criterion(logits, labels) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -174,30 +208,34 @@ def train(args, train_dataset, model, tokenizer, criterion): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer, criterion) for key, value in results.items(): - eval_key = 'eval_{}'.format(key) + eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] - logs['learning_rate'] = learning_rate_scalar - logs['loss'] = loss_scalar + logs["learning_rate"] = learning_rate_scalar + logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) - print(json.dumps({**logs, **{'step': global_step}})) + print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training torch.save(model_to_save.state_dict(), os.path.join(output_dir, WEIGHTS_NAME)) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -209,13 +247,13 @@ def train(args, train_dataset, model, tokenizer, criterion): if args.local_rank == -1: results = evaluate(args, model, tokenizer, criterion) - if results['micro_f1'] > best_f1: - best_f1 = results['micro_f1'] + if results["micro_f1"] > best_f1: + best_f1 = results["micro_f1"] n_no_improve = 0 else: n_no_improve += 1 - if n_no_improve > args.patience: + if n_no_improve > args.patience: train_iterator.close() break @@ -236,7 +274,9 @@ def evaluate(args, model, tokenizer, criterion, prefix=""): args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(eval_dataset) - eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn) + eval_dataloader = DataLoader( + eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate_fn + ) # multi-gpu eval if args.n_gpu > 1: @@ -257,11 +297,13 @@ def evaluate(args, model, tokenizer, criterion, prefix=""): with torch.no_grad(): batch = tuple(t.to(args.device) for t in batch) labels = batch[5] - inputs = {'input_ids': batch[0], - 'input_modal': batch[2], - 'attention_mask': batch[1], - 'modal_start_tokens': batch[3], - 'modal_end_tokens': batch[4]} + inputs = { + "input_ids": batch[0], + "input_modal": batch[2], + "attention_mask": batch[1], + "modal_start_tokens": batch[3], + "modal_end_tokens": batch[4], + } outputs = model(**inputs) logits = outputs[0] # model outputs are always tuple in transformers (see doc) tmp_eval_loss = criterion(logits, labels) @@ -278,7 +320,7 @@ def evaluate(args, model, tokenizer, criterion, prefix=""): result = { "loss": eval_loss, "macro_f1": f1_score(out_label_ids, preds, average="macro"), - "micro_f1": f1_score(out_label_ids, preds, average="micro") + "micro_f1": f1_score(out_label_ids, preds, average="micro"), } output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") @@ -303,94 +345,147 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .jsonl files for MMIMDB.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .jsonl files for MMIMDB.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--num_image_embeds", default=1, type=int, - help="Number of Image Embeddings from the Image Encoder") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument( + "--num_image_embeds", default=1, type=int, help="Number of Image Embeddings from the Image Encoder" + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--patience", default=5, type=int, - help="Patience for Early Stopping.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument("--patience", default=5, type=int, help="Patience for Early Stopping.") + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--num_workers', type=int, default=8, - help="number of worker threads for dataloading") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument("--num_workers", type=int, default=8, help="number of worker threads for dataloading") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -402,17 +497,25 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -426,13 +529,17 @@ def main(): num_labels = len(labels) args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - transformer_config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - transformer = model_class.from_pretrained(args.model_name_or_path, - config=transformer_config, - cache_dir=args.cache_dir if args.cache_dir else None) + transformer_config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + transformer = model_class.from_pretrained( + args.model_name_or_path, config=transformer_config, cache_dir=args.cache_dir if args.cache_dir else None + ) img_encoder = ImageEncoder(args) config = MMBTConfig(transformer_config, num_labels=num_labels) model = MMBTForClassification(config, transformer, img_encoder) @@ -449,12 +556,13 @@ def main(): train_dataset = load_examples(args, tokenizer, evaluate=False) label_frequences = train_dataset.get_label_frequencies() label_frequences = [label_frequences[l] for l in labels] - label_weights = (torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset)) ** -1 + label_weights = ( + torch.tensor(label_frequences, device=args.device, dtype=torch.float) / len(train_dataset) + ) ** -1 criterion = nn.BCEWithLogitsLoss(pos_weight=label_weights) global_step, tr_loss = train(args, train_dataset, model, tokenizer, criterion) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -464,12 +572,14 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training torch.save(model_to_save.state_dict(), os.path.join(args.output_dir, WEIGHTS_NAME)) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = MMBTForClassification(config, transformer, img_encoder) @@ -477,24 +587,25 @@ def main(): tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" model = MMBTForClassification(config, transformer, img_encoder) model.load_state_dict(torch.load(checkpoint)) model.to(args.device) result = evaluate(args, model, tokenizer, criterion, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results diff --git a/examples/mm-imdb/utils_mmimdb.py b/examples/mm-imdb/utils_mmimdb.py index c59da02642..57cee25f9d 100644 --- a/examples/mm-imdb/utils_mmimdb.py +++ b/examples/mm-imdb/utils_mmimdb.py @@ -25,17 +25,7 @@ import torchvision import torchvision.transforms as transforms from torch.utils.data import Dataset -POOLING_BREAKDOWN = { - 1: (1, 1), - 2: (2, 1), - 3: (3, 1), - 4: (2, 2), - 5: (5, 1), - 6: (3, 2), - 7: (7, 1), - 8: (4, 2), - 9: (3, 3) -} +POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (3, 2), 7: (7, 1), 8: (4, 2), 9: (3, 3)} class ImageEncoder(nn.Module): @@ -54,7 +44,6 @@ class ImageEncoder(nn.Module): return out # BxNx2048 - class JsonlDataset(Dataset): def __init__(self, data_path, tokenizer, transforms, labels, max_seq_length): self.data = [json.loads(l) for l in open(data_path)] @@ -72,7 +61,7 @@ class JsonlDataset(Dataset): def __getitem__(self, index): sentence = torch.LongTensor(self.tokenizer.encode(self.data[index]["text"], add_special_tokens=True)) start_token, sentence, end_token = sentence[0], sentence[1:-1], sentence[-1] - sentence = sentence[:self.max_seq_length] + sentence = sentence[: self.max_seq_length] label = torch.zeros(self.n_classes) label[[self.labels.index(tgt) for tgt in self.data[index]["label"]]] = 1 @@ -80,8 +69,13 @@ class JsonlDataset(Dataset): image = Image.open(os.path.join(self.data_dir, self.data[index]["img"])).convert("RGB") image = self.transforms(image) - return {"image_start_token": start_token, "image_end_token": end_token, - "sentence": sentence, "image": image, "label": label} + return { + "image_start_token": start_token, + "image_end_token": end_token, + "sentence": sentence, + "image": image, + "label": label, + } def get_label_frequencies(self): label_freqs = Counter() @@ -110,10 +104,31 @@ def collate_fn(batch): def get_mmimdb_labels(): - return ['Crime', 'Drama', 'Thriller', 'Action', 'Comedy', 'Romance', - 'Documentary', 'Short', 'Mystery', 'History', 'Family', 'Adventure', - 'Fantasy', 'Sci-Fi', 'Western', 'Horror', 'Sport', 'War', 'Music', - 'Musical', 'Animation', 'Biography', 'Film-Noir'] + return [ + "Crime", + "Drama", + "Thriller", + "Action", + "Comedy", + "Romance", + "Documentary", + "Short", + "Mystery", + "History", + "Family", + "Adventure", + "Fantasy", + "Sci-Fi", + "Western", + "Horror", + "Sport", + "War", + "Music", + "Musical", + "Animation", + "Biography", + "Film-Noir", + ] def get_image_transforms(): @@ -122,9 +137,6 @@ def get_image_transforms(): transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), - transforms.Normalize( - mean=[0.46777044, 0.44531429, 0.40661017], - std=[0.12221994, 0.12145835, 0.14380469], - ), + transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],), ] ) diff --git a/examples/pplm/pplm_classification_head.py b/examples/pplm/pplm_classification_head.py index 9aae0f17e9..05621c3bf2 100644 --- a/examples/pplm/pplm_classification_head.py +++ b/examples/pplm/pplm_classification_head.py @@ -1,5 +1,6 @@ import torch + class ClassificationHead(torch.nn.Module): """Classification Head for transformer encoders""" diff --git a/examples/pplm/run_pplm.py b/examples/pplm/run_pplm.py index 095dc39a74..37183a5121 100644 --- a/examples/pplm/run_pplm.py +++ b/examples/pplm/run_pplm.py @@ -1,19 +1,19 @@ #! /usr/bin/env python3 # coding=utf-8 -#Copyright (c) 2019 Uber Technologies, Inc. +# Copyright (c) 2019 Uber Technologies, Inc. # -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -#http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ Example command with bag of words: @@ -46,13 +46,13 @@ SMALL_CONST = 1e-15 BIG_CONST = 1e10 BAG_OF_WORDS_ARCHIVE_MAP = { - 'legal': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt", - 'military': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt", - 'politics': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt", - 'religion': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt", - 'science': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt", - 'space': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt", - 'technology': "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt", + "legal": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/legal.txt", + "military": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/military.txt", + "politics": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/politics.txt", + "religion": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/religion.txt", + "science": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/science.txt", + "space": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/space.txt", + "technology": "https://s3.amazonaws.com/models.huggingface.co/bert/pplm/bow/technology.txt", } DISCRIMINATOR_MODELS_PARAMS = { @@ -75,10 +75,10 @@ DISCRIMINATOR_MODELS_PARAMS = { } -def to_var(x, requires_grad=False, volatile=False, device='cuda'): - if torch.cuda.is_available() and device == 'cuda': +def to_var(x, requires_grad=False, volatile=False, device="cuda"): + if torch.cuda.is_available() and device == "cuda": x = x.cuda() - elif device != 'cuda': + elif device != "cuda": x = x.to(device) return Variable(x, requires_grad=requires_grad, volatile=volatile) @@ -95,49 +95,39 @@ def top_k_filter(logits, k, probs=False): values = torch.topk(logits, k)[0] batch_mins = values[:, -1].view(-1, 1).expand_as(logits) if probs: - return torch.where(logits < batch_mins, - torch.ones_like(logits) * 0.0, logits) - return torch.where(logits < batch_mins, - torch.ones_like(logits) * -BIG_CONST, - logits) + return torch.where(logits < batch_mins, torch.ones_like(logits) * 0.0, logits) + return torch.where(logits < batch_mins, torch.ones_like(logits) * -BIG_CONST, logits) def perturb_past( - past, - model, - last, - unpert_past=None, - unpert_logits=None, - accumulated_hidden=None, - grad_norms=None, - stepsize=0.01, - one_hot_bows_vectors=None, - classifier=None, - class_label=None, - loss_type=0, - num_iterations=3, - horizon_length=1, - window_length=0, - decay=False, - gamma=1.5, - kl_scale=0.01, - device='cuda', + past, + model, + last, + unpert_past=None, + unpert_logits=None, + accumulated_hidden=None, + grad_norms=None, + stepsize=0.01, + one_hot_bows_vectors=None, + classifier=None, + class_label=None, + loss_type=0, + num_iterations=3, + horizon_length=1, + window_length=0, + decay=False, + gamma=1.5, + kl_scale=0.01, + device="cuda", ): # Generate inital perturbed past - grad_accumulator = [ - (np.zeros(p.shape).astype("float32")) - for p in past - ] + grad_accumulator = [(np.zeros(p.shape).astype("float32")) for p in past] if accumulated_hidden is None: accumulated_hidden = 0 if decay: - decay_mask = torch.arange( - 0., - 1.0 + SMALL_CONST, - 1.0 / (window_length) - )[1:] + decay_mask = torch.arange(0.0, 1.0 + SMALL_CONST, 1.0 / (window_length))[1:] else: decay_mask = 1.0 @@ -146,26 +136,17 @@ def perturb_past( _, _, _, curr_length, _ = past[0].shape if curr_length > window_length and window_length > 0: - ones_key_val_shape = ( - tuple(past[0].shape[:-2]) - + tuple([window_length]) - + tuple(past[0].shape[-1:]) - ) + ones_key_val_shape = tuple(past[0].shape[:-2]) + tuple([window_length]) + tuple(past[0].shape[-1:]) zeros_key_val_shape = ( - tuple(past[0].shape[:-2]) - + tuple([curr_length - window_length]) - + tuple(past[0].shape[-1:]) + tuple(past[0].shape[:-2]) + tuple([curr_length - window_length]) + tuple(past[0].shape[-1:]) ) ones_mask = torch.ones(ones_key_val_shape) ones_mask = decay_mask * ones_mask.permute(0, 1, 2, 4, 3) ones_mask = ones_mask.permute(0, 1, 2, 4, 3) - window_mask = torch.cat( - (ones_mask, torch.zeros(zeros_key_val_shape)), - dim=-2 - ).to(device) + window_mask = torch.cat((ones_mask, torch.zeros(zeros_key_val_shape)), dim=-2).to(device) else: window_mask = torch.ones_like(past[0]).to(device) @@ -175,8 +156,7 @@ def perturb_past( for i in range(num_iterations): print("Iteration ", i + 1) curr_perturbation = [ - to_var(torch.from_numpy(p_), requires_grad=True, device=device) - for p_ in grad_accumulator + to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator ] # Compute hidden using perturbed past @@ -184,10 +164,7 @@ def perturb_past( _, _, _, curr_length, _ = curr_perturbation[0].shape all_logits, _, all_hidden = model(last, past=perturbed_past) hidden = all_hidden[-1] - new_accumulated_hidden = accumulated_hidden + torch.sum( - hidden, - dim=1 - ).detach() + new_accumulated_hidden = accumulated_hidden + torch.sum(hidden, dim=1).detach() # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth) logits = all_logits[:, -1, :] probs = F.softmax(logits, dim=-1) @@ -210,20 +187,13 @@ def perturb_past( wte = model.resize_token_embeddings() for _ in range(horizon_length): inputs_embeds = torch.matmul(curr_probs, wte.weight.data) - _, curr_unpert_past, curr_all_hidden = model( - past=curr_unpert_past, - inputs_embeds=inputs_embeds - ) + _, curr_unpert_past, curr_all_hidden = model(past=curr_unpert_past, inputs_embeds=inputs_embeds) curr_hidden = curr_all_hidden[-1] - new_accumulated_hidden = new_accumulated_hidden + torch.sum( - curr_hidden, dim=1) + new_accumulated_hidden = new_accumulated_hidden + torch.sum(curr_hidden, dim=1) - prediction = classifier(new_accumulated_hidden / - (curr_length + 1 + horizon_length)) + prediction = classifier(new_accumulated_hidden / (curr_length + 1 + horizon_length)) - label = torch.tensor(prediction.shape[0] * [class_label], - device=device, - dtype=torch.long) + label = torch.tensor(prediction.shape[0] * [class_label], device=device, dtype=torch.long) discrim_loss = ce_loss(prediction, label) print(" pplm_discrim_loss:", discrim_loss.data.cpu().numpy()) loss += discrim_loss @@ -232,21 +202,15 @@ def perturb_past( kl_loss = 0.0 if kl_scale > 0.0: unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1) - unpert_probs = ( - unpert_probs + SMALL_CONST * - (unpert_probs <= SMALL_CONST).float().to(device).detach() - ) - correction = SMALL_CONST * (probs <= SMALL_CONST).float().to( - device).detach() + unpert_probs = unpert_probs + SMALL_CONST * (unpert_probs <= SMALL_CONST).float().to(device).detach() + correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(device).detach() corrected_probs = probs + correction.detach() - kl_loss = kl_scale * ( - (corrected_probs * (corrected_probs / unpert_probs).log()).sum() - ) - print(' kl_loss', kl_loss.data.cpu().numpy()) + kl_loss = kl_scale * ((corrected_probs * (corrected_probs / unpert_probs).log()).sum()) + print(" kl_loss", kl_loss.data.cpu().numpy()) loss += kl_loss loss_per_iter.append(loss.data.cpu().numpy()) - print(' pplm_loss', (loss - kl_loss).data.cpu().numpy()) + print(" pplm_loss", (loss - kl_loss).data.cpu().numpy()) # compute gradients loss.backward() @@ -259,15 +223,12 @@ def perturb_past( ] else: grad_norms = [ - (torch.norm(p_.grad * window_mask) + SMALL_CONST) - for index, p_ in enumerate(curr_perturbation) + (torch.norm(p_.grad * window_mask) + SMALL_CONST) for index, p_ in enumerate(curr_perturbation) ] # normalize gradients grad = [ - -stepsize * - (p_.grad * window_mask / grad_norms[ - index] ** gamma).data.cpu().numpy() + -stepsize * (p_.grad * window_mask / grad_norms[index] ** gamma).data.cpu().numpy() for index, p_ in enumerate(curr_perturbation) ] @@ -285,36 +246,27 @@ def perturb_past( past = new_past # apply the accumulated perturbations to the past - grad_accumulator = [ - to_var(torch.from_numpy(p_), requires_grad=True, device=device) - for p_ in grad_accumulator - ] + grad_accumulator = [to_var(torch.from_numpy(p_), requires_grad=True, device=device) for p_ in grad_accumulator] pert_past = list(map(add, past, grad_accumulator)) return pert_past, new_accumulated_hidden, grad_norms, loss_per_iter def get_classifier( - name: Optional[str], class_label: Union[str, int], - device: str + name: Optional[str], class_label: Union[str, int], device: str ) -> Tuple[Optional[ClassificationHead], Optional[int]]: if name is None: return None, None params = DISCRIMINATOR_MODELS_PARAMS[name] - classifier = ClassificationHead( - class_size=params['class_size'], - embed_size=params['embed_size'] - ).to(device) + classifier = ClassificationHead(class_size=params["class_size"], embed_size=params["embed_size"]).to(device) if "url" in params: resolved_archive_file = cached_path(params["url"]) elif "path" in params: resolved_archive_file = params["path"] else: - raise ValueError("Either url or path have to be specified " - "in the discriminator model parameters") - classifier.load_state_dict( - torch.load(resolved_archive_file, map_location=device)) + raise ValueError("Either url or path have to be specified " "in the discriminator model parameters") + classifier.load_state_dict(torch.load(resolved_archive_file, map_location=device)) classifier.eval() if isinstance(class_label, str): @@ -341,8 +293,7 @@ def get_classifier( return classifier, label_id -def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> \ - List[List[List[int]]]: +def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> List[List[List[int]]]: bow_indices = [] for id_or_path in bag_of_words_ids_or_paths: if id_or_path in BAG_OF_WORDS_ARCHIVE_MAP: @@ -351,13 +302,11 @@ def get_bag_of_words_indices(bag_of_words_ids_or_paths: List[str], tokenizer) -> filepath = id_or_path with open(filepath, "r") as f: words = f.read().strip().split("\n") - bow_indices.append( - [tokenizer.encode(word.strip(), add_prefix_space=True) for word in - words]) + bow_indices.append([tokenizer.encode(word.strip(), add_prefix_space=True) for word in words]) return bow_indices -def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'): +def build_bows_one_hot_vectors(bow_indices, tokenizer, device="cuda"): if bow_indices is None: return None @@ -373,39 +322,34 @@ def build_bows_one_hot_vectors(bow_indices, tokenizer, device='cuda'): def full_text_generation( - model, - tokenizer, - context=None, - num_samples=1, - device="cuda", - bag_of_words=None, - discrim=None, - class_label=None, - length=100, - stepsize=0.02, - temperature=1.0, - top_k=10, - sample=False, - num_iterations=3, - grad_length=10000, - horizon_length=1, - window_length=0, - decay=False, - gamma=1.5, - gm_scale=0.9, - kl_scale=0.01, - **kwargs + model, + tokenizer, + context=None, + num_samples=1, + device="cuda", + bag_of_words=None, + discrim=None, + class_label=None, + length=100, + stepsize=0.02, + temperature=1.0, + top_k=10, + sample=False, + num_iterations=3, + grad_length=10000, + horizon_length=1, + window_length=0, + decay=False, + gamma=1.5, + gm_scale=0.9, + kl_scale=0.01, + **kwargs ): - classifier, class_id = get_classifier( - discrim, - class_label, - device - ) + classifier, class_id = get_classifier(discrim, class_label, device) bow_indices = [] if bag_of_words: - bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), - tokenizer) + bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer) if bag_of_words and classifier: print("Both PPLM-BoW and PPLM-Discrim are on. This is not optimized.") @@ -423,15 +367,9 @@ def full_text_generation( raise Exception("Specify either a bag of words or a discriminator") unpert_gen_tok_text, _, _ = generate_text_pplm( - model=model, - tokenizer=tokenizer, - context=context, - device=device, - length=length, - sample=sample, - perturb=False + model=model, tokenizer=tokenizer, context=context, device=device, length=length, sample=sample, perturb=False ) - if device == 'cuda': + if device == "cuda": torch.cuda.empty_cache() pert_gen_tok_texts = [] @@ -468,36 +406,36 @@ def full_text_generation( discrim_losses.append(discrim_loss.data.cpu().numpy()) losses_in_time.append(loss_in_time) - if device == 'cuda': + if device == "cuda": torch.cuda.empty_cache() return unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time def generate_text_pplm( - model, - tokenizer, - context=None, - past=None, - device="cuda", - perturb=True, - bow_indices=None, - classifier=None, - class_label=None, - loss_type=0, - length=100, - stepsize=0.02, - temperature=1.0, - top_k=10, - sample=False, - num_iterations=3, - grad_length=10000, - horizon_length=1, - window_length=0, - decay=False, - gamma=1.5, - gm_scale=0.9, - kl_scale=0.01, + model, + tokenizer, + context=None, + past=None, + device="cuda", + perturb=True, + bow_indices=None, + classifier=None, + class_label=None, + loss_type=0, + length=100, + stepsize=0.02, + temperature=1.0, + top_k=10, + sample=False, + num_iterations=3, + grad_length=10000, + horizon_length=1, + window_length=0, + decay=False, + gamma=1.5, + gm_scale=0.9, + kl_scale=0.01, ): output_so_far = None if context: @@ -507,8 +445,7 @@ def generate_text_pplm( output_so_far = context_t # collect one hot vectors for bags of words - one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer, - device) + one_hot_bows_vectors = build_bows_one_hot_vectors(bow_indices, tokenizer, device) grad_norms = None last = None @@ -575,13 +512,9 @@ def generate_text_pplm( if classifier is not None: ce_loss = torch.nn.CrossEntropyLoss() prediction = classifier(torch.mean(unpert_last_hidden, dim=1)) - label = torch.tensor([class_label], device=device, - dtype=torch.long) + label = torch.tensor([class_label], device=device, dtype=torch.long) unpert_discrim_loss = ce_loss(prediction, label) - print( - "unperturbed discrim loss", - unpert_discrim_loss.data.cpu().numpy() - ) + print("unperturbed discrim loss", unpert_discrim_loss.data.cpu().numpy()) else: unpert_discrim_loss = 0 @@ -590,10 +523,8 @@ def generate_text_pplm( unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1) - pert_probs = ((pert_probs ** gm_scale) * ( - unpert_probs ** (1 - gm_scale))) # + SMALL_CONST - pert_probs = top_k_filter(pert_probs, k=top_k, - probs=True) # + SMALL_CONST + pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale)) # + SMALL_CONST + pert_probs = top_k_filter(pert_probs, k=top_k, probs=True) # + SMALL_CONST # rescale if torch.sum(pert_probs) <= 1: @@ -611,10 +542,7 @@ def generate_text_pplm( _, last = torch.topk(pert_probs, k=1, dim=-1) # update context/output_so_far appending the new token - output_so_far = ( - last if output_so_far is None - else torch.cat((output_so_far, last), dim=1) - ) + output_so_far = last if output_so_far is None else torch.cat((output_so_far, last), dim=1) print(tokenizer.decode(output_so_far.tolist()[0])) @@ -623,44 +551,42 @@ def generate_text_pplm( def set_generic_model_params(discrim_weights, discrim_meta): if discrim_weights is None: - raise ValueError('When using a generic discriminator, ' - 'discrim_weights need to be specified') + raise ValueError("When using a generic discriminator, " "discrim_weights need to be specified") if discrim_meta is None: - raise ValueError('When using a generic discriminator, ' - 'discrim_meta need to be specified') + raise ValueError("When using a generic discriminator, " "discrim_meta need to be specified") - with open(discrim_meta, 'r') as discrim_meta_file: + with open(discrim_meta, "r") as discrim_meta_file: meta = json.load(discrim_meta_file) - meta['path'] = discrim_weights - DISCRIMINATOR_MODELS_PARAMS['generic'] = meta + meta["path"] = discrim_weights + DISCRIMINATOR_MODELS_PARAMS["generic"] = meta def run_pplm_example( - pretrained_model="gpt2-medium", - cond_text="", - uncond=False, - num_samples=1, - bag_of_words=None, - discrim=None, - discrim_weights=None, - discrim_meta=None, - class_label=-1, - length=100, - stepsize=0.02, - temperature=1.0, - top_k=10, - sample=False, - num_iterations=3, - grad_length=10000, - horizon_length=1, - window_length=0, - decay=False, - gamma=1.5, - gm_scale=0.9, - kl_scale=0.01, - seed=0, - no_cuda=False, - colorama=False + pretrained_model="gpt2-medium", + cond_text="", + uncond=False, + num_samples=1, + bag_of_words=None, + discrim=None, + discrim_weights=None, + discrim_meta=None, + class_label=-1, + length=100, + stepsize=0.02, + temperature=1.0, + top_k=10, + sample=False, + num_iterations=3, + grad_length=10000, + horizon_length=1, + window_length=0, + decay=False, + gamma=1.5, + gm_scale=0.9, + kl_scale=0.01, + seed=0, + no_cuda=False, + colorama=False, ): # set Random seed torch.manual_seed(seed) @@ -669,21 +595,15 @@ def run_pplm_example( # set the device device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu" - if discrim == 'generic': + if discrim == "generic": set_generic_model_params(discrim_weights, discrim_meta) if discrim is not None: - pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][ - "pretrained_model" - ] - print("discrim = {}, pretrained_model set " - "to discriminator's = {}".format(discrim, pretrained_model)) + pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim]["pretrained_model"] + print("discrim = {}, pretrained_model set " "to discriminator's = {}".format(discrim, pretrained_model)) # load pretrained model - model = GPT2LMHeadModel.from_pretrained( - pretrained_model, - output_hidden_states=True - ) + model = GPT2LMHeadModel.from_pretrained(pretrained_model, output_hidden_states=True) model.to(device) model.eval() @@ -696,9 +616,7 @@ def run_pplm_example( # figure out conditioning text if uncond: - tokenized_cond_text = tokenizer.encode( - [tokenizer.bos_token] - ) + tokenized_cond_text = tokenizer.encode([tokenizer.bos_token]) else: raw_text = cond_text while not raw_text: @@ -750,8 +668,7 @@ def run_pplm_example( bow_word_ids = set() if bag_of_words and colorama: - bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), - tokenizer) + bow_indices = get_bag_of_words_indices(bag_of_words.split(";"), tokenizer) for single_bow_list in bow_indices: # filtering all words in the list composed of more than 1 token filtered = list(filter(lambda x: len(x) <= 1, single_bow_list)) @@ -765,13 +682,11 @@ def run_pplm_example( if colorama: import colorama - pert_gen_text = '' + pert_gen_text = "" for word_id in pert_gen_tok_text.tolist()[0]: if word_id in bow_word_ids: - pert_gen_text += '{}{}{}'.format( - colorama.Fore.RED, - tokenizer.decode([word_id]), - colorama.Style.RESET_ALL + pert_gen_text += "{}{}{}".format( + colorama.Fore.RED, tokenizer.decode([word_id]), colorama.Style.RESET_ALL ) else: pert_gen_text += tokenizer.decode([word_id]) @@ -785,14 +700,12 @@ def run_pplm_example( pass # keep the prefix, perturbed seq, original seq for each index - generated_texts.append( - (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text) - ) + generated_texts.append((tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text)) return -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--pretrained_model", @@ -801,19 +714,10 @@ if __name__ == '__main__': default="gpt2-medium", help="pretrained model name or path to local checkpoint", ) + parser.add_argument("--cond_text", type=str, default="The lake", help="Prefix texts to condition on") + parser.add_argument("--uncond", action="store_true", help="Generate from end-of-text as prefix") parser.add_argument( - "--cond_text", type=str, default="The lake", - help="Prefix texts to condition on" - ) - parser.add_argument( - "--uncond", action="store_true", - help="Generate from end-of-text as prefix" - ) - parser.add_argument( - "--num_samples", - type=int, - default=1, - help="Number of samples to generate from the modified latents", + "--num_samples", type=int, default=1, help="Number of samples to generate from the modified latents", ) parser.add_argument( "--bag_of_words", @@ -821,8 +725,8 @@ if __name__ == '__main__': type=str, default=None, help="Bags of words used for PPLM-BoW. " - "Either a BOW id (see list in code) or a filepath. " - "Multiple BoWs separated by ;", + "Either a BOW id (see list in code) or a filepath. " + "Multiple BoWs separated by ;", ) parser.add_argument( "--discrim", @@ -832,48 +736,36 @@ if __name__ == '__main__': choices=("clickbait", "sentiment", "toxicity", "generic"), help="Discriminator to use", ) - parser.add_argument('--discrim_weights', type=str, default=None, - help='Weights for the generic discriminator') - parser.add_argument('--discrim_meta', type=str, default=None, - help='Meta information for the generic discriminator') + parser.add_argument("--discrim_weights", type=str, default=None, help="Weights for the generic discriminator") parser.add_argument( - "--class_label", - type=int, - default=-1, - help="Class label used for the discriminator", + "--discrim_meta", type=str, default=None, help="Meta information for the generic discriminator" + ) + parser.add_argument( + "--class_label", type=int, default=-1, help="Class label used for the discriminator", ) parser.add_argument("--length", type=int, default=100) parser.add_argument("--stepsize", type=float, default=0.02) parser.add_argument("--temperature", type=float, default=1.0) parser.add_argument("--top_k", type=int, default=10) - parser.add_argument( - "--sample", action="store_true", - help="Generate from end-of-text as prefix" - ) + parser.add_argument("--sample", action="store_true", help="Generate from end-of-text as prefix") parser.add_argument("--num_iterations", type=int, default=3) parser.add_argument("--grad_length", type=int, default=10000) parser.add_argument( "--window_length", type=int, default=0, - help="Length of past which is being optimized; " - "0 corresponds to infinite window length", + help="Length of past which is being optimized; " "0 corresponds to infinite window length", ) parser.add_argument( - "--horizon_length", - type=int, - default=1, - help="Length of future to optimize over", + "--horizon_length", type=int, default=1, help="Length of future to optimize over", ) - parser.add_argument("--decay", action="store_true", - help="whether to decay or not") + parser.add_argument("--decay", action="store_true", help="whether to decay or not") parser.add_argument("--gamma", type=float, default=1.5) parser.add_argument("--gm_scale", type=float, default=0.9) parser.add_argument("--kl_scale", type=float, default=0.01) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--no_cuda", action="store_true", help="no cuda") - parser.add_argument("--colorama", action="store_true", - help="colors keywords") + parser.add_argument("--colorama", action="store_true", help="colors keywords") args = parser.parse_args() run_pplm_example(**vars(args)) diff --git a/examples/pplm/run_pplm_discrim_train.py b/examples/pplm/run_pplm_discrim_train.py index 3055139d8c..14136c4c7a 100644 --- a/examples/pplm/run_pplm_discrim_train.py +++ b/examples/pplm/run_pplm_discrim_train.py @@ -1,19 +1,19 @@ #! /usr/bin/env python3 # coding=utf-8 -#Copyright (c) 2019 Uber Technologies, Inc. +# Copyright (c) 2019 Uber Technologies, Inc. # -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -#http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse import csv @@ -42,26 +42,15 @@ example_sentence = "This is incredible! I love it, this is the best chicken I ha max_length_seq = 100 - - class Discriminator(torch.nn.Module): """Transformer encoder followed by a Classification Head""" - def __init__( - self, - class_size, - pretrained_model="gpt2-medium", - cached_mode=False, - device='cpu' - ): + def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"): super(Discriminator, self).__init__() self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model) self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model) self.embed_size = self.encoder.transformer.config.hidden_size - self.classifier_head = ClassificationHead( - class_size=class_size, - embed_size=self.embed_size - ) + self.classifier_head = ClassificationHead(class_size=class_size, embed_size=self.embed_size) self.cached_mode = cached_mode self.device = device @@ -74,14 +63,10 @@ class Discriminator(torch.nn.Module): self.classifier_head.train() def avg_representation(self, x): - mask = x.ne(0).unsqueeze(2).repeat( - 1, 1, self.embed_size - ).float().to(self.device).detach() + mask = x.ne(0).unsqueeze(2).repeat(1, 1, self.embed_size).float().to(self.device).detach() hidden, _ = self.encoder.transformer(x) masked_hidden = hidden * mask - avg_hidden = torch.sum(masked_hidden, dim=1) / ( - torch.sum(mask, dim=1).detach() + EPSILON - ) + avg_hidden = torch.sum(masked_hidden, dim=1) / (torch.sum(mask, dim=1).detach() + EPSILON) return avg_hidden def forward(self, x): @@ -117,10 +102,7 @@ def collate_fn(data): def pad_sequences(sequences): lengths = [len(seq) for seq in sequences] - padded_sequences = torch.zeros( - len(sequences), - max(lengths) - ).long() # padding value = 0 + padded_sequences = torch.zeros(len(sequences), max(lengths)).long() # padding value = 0 for i, seq in enumerate(sequences): end = lengths[i] @@ -149,8 +131,7 @@ def cached_collate_fn(data): return x_batch, y_batch -def train_epoch(data_loader, discriminator, optimizer, - epoch=0, log_interval=10, device='cpu'): +def train_epoch(data_loader, discriminator, optimizer, epoch=0, log_interval=10, device="cpu"): samples_so_far = 0 discriminator.train_custom() for batch_idx, (input_t, target_t) in enumerate(data_loader): @@ -169,13 +150,15 @@ def train_epoch(data_loader, discriminator, optimizer, print( "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( epoch + 1, - samples_so_far, len(data_loader.dataset), - 100 * samples_so_far / len(data_loader.dataset), loss.item() + samples_so_far, + len(data_loader.dataset), + 100 * samples_so_far / len(data_loader.dataset), + loss.item(), ) ) -def evaluate_performance(data_loader, discriminator, device='cpu'): +def evaluate_performance(data_loader, discriminator, device="cpu"): discriminator.eval() test_loss = 0 correct = 0 @@ -194,13 +177,12 @@ def evaluate_performance(data_loader, discriminator, device='cpu'): print( "Performance on test set: " "Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)".format( - test_loss, correct, len(data_loader.dataset), - 100. * correct / len(data_loader.dataset) + test_loss, correct, len(data_loader.dataset), 100.0 * correct / len(data_loader.dataset) ) ) -def predict(input_sentence, model, classes, cached=False, device='cpu'): +def predict(input_sentence, model, classes, cached=False, device="cpu"): input_t = model.tokenizer.encode(input_sentence) input_t = torch.tensor([input_t], dtype=torch.long, device=device) if cached: @@ -208,17 +190,14 @@ def predict(input_sentence, model, classes, cached=False, device='cpu'): log_probs = model(input_t).data.cpu().numpy().flatten().tolist() print("Input sentence:", input_sentence) - print("Predictions:", ", ".join( - "{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in - zip(classes, log_probs) - )) + print( + "Predictions:", + ", ".join("{}: {:.4f}".format(c, math.exp(log_prob)) for c, log_prob in zip(classes, log_probs)), + ) -def get_cached_data_loader(dataset, batch_size, discriminator, - shuffle=False, device='cpu'): - data_loader = torch.utils.data.DataLoader(dataset=dataset, - batch_size=batch_size, - collate_fn=collate_fn) +def get_cached_data_loader(dataset, batch_size, discriminator, shuffle=False, device="cpu"): + data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, collate_fn=collate_fn) xs = [] ys = [] @@ -231,50 +210,44 @@ def get_cached_data_loader(dataset, batch_size, discriminator, ys += y.cpu().numpy().tolist() data_loader = torch.utils.data.DataLoader( - dataset=Dataset(xs, ys), - batch_size=batch_size, - shuffle=shuffle, - collate_fn=cached_collate_fn) + dataset=Dataset(xs, ys), batch_size=batch_size, shuffle=shuffle, collate_fn=cached_collate_fn + ) return data_loader def train_discriminator( - dataset, dataset_fp=None, pretrained_model="gpt2-medium", - epochs=10, batch_size=64, log_interval=10, - save_model=False, cached=False, no_cuda=False): + dataset, + dataset_fp=None, + pretrained_model="gpt2-medium", + epochs=10, + batch_size=64, + log_interval=10, + save_model=False, + cached=False, + no_cuda=False, +): device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu" print("Preprocessing {} dataset...".format(dataset)) start = time.time() if dataset == "SST": - idx2class = ["positive", "negative", "very positive", "very negative", - "neutral"] + idx2class = ["positive", "negative", "very positive", "very negative", "neutral"] class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator( - class_size=len(idx2class), - pretrained_model=pretrained_model, - cached_mode=cached, - device=device + class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device ).to(device) text = torchtext_data.Field() label = torchtext_data.Field(sequential=False) - train_data, val_data, test_data = datasets.SST.splits( - text, - label, - fine_grained=True, - train_subtrees=True, - ) + train_data, val_data, test_data = datasets.SST.splits(text, label, fine_grained=True, train_subtrees=True,) x = [] y = [] for i in trange(len(train_data), ascii=True): - seq = TreebankWordDetokenizer().detokenize( - vars(train_data[i])["text"] - ) + seq = TreebankWordDetokenizer().detokenize(vars(train_data[i])["text"]) seq = discriminator.tokenizer.encode(seq) seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) x.append(seq) @@ -284,9 +257,7 @@ def train_discriminator( test_x = [] test_y = [] for i in trange(len(test_data), ascii=True): - seq = TreebankWordDetokenizer().detokenize( - vars(test_data[i])["text"] - ) + seq = TreebankWordDetokenizer().detokenize(vars(test_data[i])["text"]) seq = discriminator.tokenizer.encode(seq) seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) test_x.append(seq) @@ -306,10 +277,7 @@ def train_discriminator( class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator( - class_size=len(idx2class), - pretrained_model=pretrained_model, - cached_mode=cached, - device=device + class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device ).to(device) with open("datasets/clickbait/clickbait_train_prefix.txt") as f: @@ -318,9 +286,7 @@ def train_discriminator( try: data.append(eval(line)) except: - print("Error evaluating line {}: {}".format( - i, line - )) + print("Error evaluating line {}: {}".format(i, line)) continue x = [] y = [] @@ -331,27 +297,20 @@ def train_discriminator( seq = discriminator.tokenizer.encode(d["text"]) if len(seq) < max_length_seq: - seq = torch.tensor( - [50256] + seq, device=device, dtype=torch.long - ) + seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: - print("Line {} is longer than maximum length {}".format( - i, max_length_seq - )) + print("Line {} is longer than maximum length {}".format(i, max_length_seq)) continue x.append(seq) y.append(d["label"]) except: - print("Error evaluating / tokenizing" - " line {}, skipping it".format(i)) + print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size - train_dataset, test_dataset = torch.utils.data.random_split( - full_dataset, [train_size, test_size] - ) + train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), @@ -366,10 +325,7 @@ def train_discriminator( class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator( - class_size=len(idx2class), - pretrained_model=pretrained_model, - cached_mode=cached, - device=device + class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device ).to(device) x = [] @@ -381,27 +337,20 @@ def train_discriminator( seq = discriminator.tokenizer.encode(d["text"]) if len(seq) < max_length_seq: - seq = torch.tensor( - [50256] + seq, device=device, dtype=torch.long - ) + seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: - print("Line {} is longer than maximum length {}".format( - i, max_length_seq - )) + print("Line {} is longer than maximum length {}".format(i, max_length_seq)) continue x.append(seq) y.append(int(np.sum(d["label"]) > 0)) except: - print("Error evaluating / tokenizing" - " line {}, skipping it".format(i)) + print("Error evaluating / tokenizing" " line {}, skipping it".format(i)) pass full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size - train_dataset, test_dataset = torch.utils.data.random_split( - full_dataset, [train_size, test_size] - ) + train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), @@ -416,8 +365,7 @@ def train_discriminator( # class \t text if dataset_fp is None: - raise ValueError("When generic dataset is selected, " - "dataset_fp needs to be specified aswell.") + raise ValueError("When generic dataset is selected, " "dataset_fp needs to be specified aswell.") classes = set() with open(dataset_fp) as f: @@ -430,10 +378,7 @@ def train_discriminator( class2idx = {c: i for i, c in enumerate(idx2class)} discriminator = Discriminator( - class_size=len(idx2class), - pretrained_model=pretrained_model, - cached_mode=cached, - device=device + class_size=len(idx2class), pretrained_model=pretrained_model, cached_mode=cached, device=device ).to(device) x = [] @@ -447,18 +392,11 @@ def train_discriminator( try: seq = discriminator.tokenizer.encode(text) - if (len(seq) < max_length_seq): - seq = torch.tensor( - [50256] + seq, - device=device, - dtype=torch.long - ) + if len(seq) < max_length_seq: + seq = torch.tensor([50256] + seq, device=device, dtype=torch.long) else: - print( - "Line {} is longer than maximum length {}".format( - i, max_length_seq - )) + print("Line {} is longer than maximum length {}".format(i, max_length_seq)) continue x.append(seq) @@ -471,10 +409,7 @@ def train_discriminator( full_dataset = Dataset(x, y) train_size = int(0.9 * len(full_dataset)) test_size = len(full_dataset) - train_size - train_dataset, test_dataset = torch.utils.data.random_split( - full_dataset, - [train_size, test_size] - ) + train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size]) discriminator_meta = { "class_size": len(idx2class), @@ -485,9 +420,7 @@ def train_discriminator( } end = time.time() - print("Preprocessed {} data points".format( - len(train_dataset) + len(test_dataset)) - ) + print("Preprocessed {} data points".format(len(train_dataset) + len(test_dataset))) print("Data preprocessing took: {:.3f}s".format(end - start)) if cached: @@ -495,30 +428,21 @@ def train_discriminator( start = time.time() - train_loader = get_cached_data_loader( - train_dataset, batch_size, discriminator, - shuffle=True, device=device - ) + train_loader = get_cached_data_loader(train_dataset, batch_size, discriminator, shuffle=True, device=device) - test_loader = get_cached_data_loader( - test_dataset, batch_size, discriminator, device=device - ) + test_loader = get_cached_data_loader(test_dataset, batch_size, discriminator, device=device) end = time.time() print("Building representation cache took: {:.3f}s".format(end - start)) else: - train_loader = torch.utils.data.DataLoader(dataset=train_dataset, - batch_size=batch_size, - shuffle=True, - collate_fn=collate_fn) - test_loader = torch.utils.data.DataLoader(dataset=test_dataset, - batch_size=batch_size, - collate_fn=collate_fn) + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn + ) + test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=collate_fn) if save_model: - with open("{}_classifier_head_meta.json".format(dataset), - "w") as meta_file: + with open("{}_classifier_head_meta.json".format(dataset), "w") as meta_file: json.dump(discriminator_meta, meta_file) optimizer = optim.Adam(discriminator.parameters(), lr=0.0001) @@ -533,56 +457,61 @@ def train_discriminator( optimizer=optimizer, epoch=epoch, log_interval=log_interval, - device=device - ) - evaluate_performance( - data_loader=test_loader, - discriminator=discriminator, - device=device + device=device, ) + evaluate_performance(data_loader=test_loader, discriminator=discriminator, device=device) end = time.time() print("Epoch took: {:.3f}s".format(end - start)) print("\nExample prediction") - predict(example_sentence, discriminator, idx2class, - cached=cached, device=device) + predict(example_sentence, discriminator, idx2class, cached=cached, device=device) if save_model: # torch.save(discriminator.state_dict(), # "{}_discriminator_{}.pt".format( # args.dataset, epoch + 1 # )) - torch.save(discriminator.get_classifier().state_dict(), - "{}_classifier_head_epoch_{}.pt".format(dataset, - epoch + 1)) + torch.save( + discriminator.get_classifier().state_dict(), + "{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1), + ) if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Train a discriminator on top of GPT-2 representations") - parser.add_argument("--dataset", type=str, default="SST", - choices=("SST", "clickbait", "toxic", "generic"), - help="dataset to train the discriminator on." - "In case of generic, the dataset is expected" - "to be a TSBV file with structure: class \\t text") - parser.add_argument("--dataset_fp", type=str, default="", - help="File path of the dataset to use. " - "Needed only in case of generic datadset") - parser.add_argument("--pretrained_model", type=str, default="gpt2-medium", - help="Pretrained model to use as encoder") - parser.add_argument("--epochs", type=int, default=10, metavar="N", - help="Number of training epochs") - parser.add_argument("--batch_size", type=int, default=64, metavar="N", - help="input batch size for training (default: 64)") - parser.add_argument("--log_interval", type=int, default=10, metavar="N", - help="how many batches to wait before logging training status") - parser.add_argument("--save_model", action="store_true", - help="whether to save the model") - parser.add_argument("--cached", action="store_true", - help="whether to cache the input representations") - parser.add_argument("--no_cuda", action="store_true", - help="use to turn off cuda") + parser = argparse.ArgumentParser(description="Train a discriminator on top of GPT-2 representations") + parser.add_argument( + "--dataset", + type=str, + default="SST", + choices=("SST", "clickbait", "toxic", "generic"), + help="dataset to train the discriminator on." + "In case of generic, the dataset is expected" + "to be a TSBV file with structure: class \\t text", + ) + parser.add_argument( + "--dataset_fp", + type=str, + default="", + help="File path of the dataset to use. " "Needed only in case of generic datadset", + ) + parser.add_argument( + "--pretrained_model", type=str, default="gpt2-medium", help="Pretrained model to use as encoder" + ) + parser.add_argument("--epochs", type=int, default=10, metavar="N", help="Number of training epochs") + parser.add_argument( + "--batch_size", type=int, default=64, metavar="N", help="input batch size for training (default: 64)" + ) + parser.add_argument( + "--log_interval", + type=int, + default=10, + metavar="N", + help="how many batches to wait before logging training status", + ) + parser.add_argument("--save_model", action="store_true", help="whether to save the model") + parser.add_argument("--cached", action="store_true", help="whether to cache the input representations") + parser.add_argument("--no_cuda", action="store_true", help="use to turn off cuda") args = parser.parse_args() train_discriminator(**(vars(args))) diff --git a/examples/run_bertology.py b/examples/run_bertology.py index d1d05a1073..6b4739d6bd 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -32,10 +32,18 @@ from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subse from torch.utils.data.distributed import DistributedSampler from torch.nn import CrossEntropyLoss, MSELoss -from transformers import (WEIGHTS_NAME, - BertConfig, BertForSequenceClassification, BertTokenizer, - XLMConfig, XLMForSequenceClassification, XLMTokenizer, - XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForSequenceClassification, + BertTokenizer, + XLMConfig, + XLMForSequenceClassification, + XLMTokenizer, + XLNetConfig, + XLNetForSequenceClassification, + XLNetTokenizer, +) from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES @@ -63,7 +71,9 @@ def print_2d_tensor(tensor): logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data)) -def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None): +def compute_heads_importance( + args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None +): """ This method shows how to compute: - head attention entropy - head importance scores according to http://arxiv.org/abs/1905.10650 @@ -85,8 +95,14 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, input_ids, input_mask, segment_ids, label_ids = batch # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below) - outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask) - loss, logits, all_attentions = outputs[0], outputs[1], outputs[-1] # Loss and logits are the first, attention the last + outputs = model( + input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask + ) + loss, logits, all_attentions = ( + outputs[0], + outputs[1], + outputs[-1], + ) # Loss and logits are the first, attention the last loss.backward() # Backpropagate to populate the gradients in the head mask if compute_entropy: @@ -113,15 +129,15 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, # Layerwise importance normalization if not args.dont_normalize_importance_by_layer: exponent = 2 - norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent) + norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent) head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20 if not args.dont_normalize_global_importance: head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min()) # Print/save matrices - np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy()) - np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy()) + np.save(os.path.join(args.output_dir, "attn_entropy.npy"), attn_entropy.detach().cpu().numpy()) + np.save(os.path.join(args.output_dir, "head_importance.npy"), head_importance.detach().cpu().numpy()) logger.info("Attention entropies") print_2d_tensor(attn_entropy) @@ -129,7 +145,9 @@ def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, print_2d_tensor(head_importance) logger.info("Head ranked by importance scores") head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device) - head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device) + head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange( + head_importance.numel(), device=args.device + ) head_ranks = head_ranks.view_as(head_importance) print_2d_tensor(head_ranks) @@ -150,9 +168,9 @@ def mask_heads(args, model, eval_dataloader): current_score = original_score while current_score >= original_score * args.masking_threshold: - head_mask = new_head_mask.clone() # save current head mask + head_mask = new_head_mask.clone() # save current head mask # heads from least important to most - keep only not-masked heads - head_importance[head_mask == 0.0] = float('Inf') + head_importance[head_mask == 0.0] = float("Inf") current_heads_to_mask = head_importance.view(-1).sort()[1] if len(current_heads_to_mask) <= num_to_mask: @@ -167,14 +185,21 @@ def mask_heads(args, model, eval_dataloader): print_2d_tensor(new_head_mask) # Compute metric and head importance again - _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask) + _, head_importance, preds, labels = compute_heads_importance( + args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask + ) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name] - logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100) + logger.info( + "Masking: current score: %f, remaning heads %d (%.1f percents)", + current_score, + new_head_mask.sum(), + new_head_mask.sum() / new_head_mask.numel() * 100, + ) logger.info("Final head mask") print_2d_tensor(head_mask) - np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy()) + np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy()) return head_mask @@ -186,8 +211,9 @@ def prune_heads(args, model, eval_dataloader, head_mask): # Try pruning and test time speedup # Pruning is like masking but we actually remove the masked weights before_time = datetime.now() - _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, - compute_entropy=False, compute_importance=False, head_mask=head_mask) + _, _, preds, labels = compute_heads_importance( + args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask + ) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name] original_time = datetime.now() - before_time @@ -199,73 +225,127 @@ def prune_heads(args, model, eval_dataloader, head_mask): pruned_num_params = sum(p.numel() for p in model.parameters()) before_time = datetime.now() - _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader, - compute_entropy=False, compute_importance=False, head_mask=None) + _, _, preds, labels = compute_heads_importance( + args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=None + ) preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds) score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name] new_time = datetime.now() - before_time - logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100) + logger.info( + "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", + original_num_params, + pruned_num_params, + pruned_num_params / original_num_params * 100, + ) logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning) - logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100) + logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time / new_time * 100) def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join( - ALL_MODELS)) - parser.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.", + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name_or_path") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name_or_path") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--data_subset", type=int, default=-1, - help="If > 0: limit the data to a subset of data_subset instances.") - parser.add_argument("--overwrite_output_dir", action='store_true', - help="Whether to overwrite data in output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") + parser.add_argument( + "--config_name", + default="", + type=str, + help="Pretrained config name or path if not the same as model_name_or_path", + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name_or_path", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances." + ) + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) - parser.add_argument("--dont_normalize_importance_by_layer", action='store_true', - help="Don't normalize importance score by layers") - parser.add_argument("--dont_normalize_global_importance", action='store_true', - help="Don't normalize all importance scores between 0 and 1") + parser.add_argument( + "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers" + ) + parser.add_argument( + "--dont_normalize_global_importance", + action="store_true", + help="Don't normalize all importance scores between 0 and 1", + ) - parser.add_argument("--try_masking", action='store_true', - help="Whether to try to mask head until a threshold of accuracy.") - parser.add_argument("--masking_threshold", default=0.9, type=float, - help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).") - parser.add_argument("--masking_amount", default=0.1, type=float, - help="Amount to heads to masking at each masking step.") - parser.add_argument("--metric_name", default="acc", type=str, - help="Metric to use for head masking.") + parser.add_argument( + "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy." + ) + parser.add_argument( + "--masking_threshold", + default=0.9, + type=float, + help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).", + ) + parser.add_argument( + "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step." + ) + parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after WordPiece tokenization. \n" - "Sequences longer than this will be truncated, sequences shorter padded.") + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, sequences shorter padded.", + ) parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") - parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -278,10 +358,10 @@ def main(): torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) args.n_gpu = 1 - torch.distributed.init_process_group(backend='nccl') # Initializes the distributed backend + torch.distributed.init_process_group(backend="nccl") # Initializes the distributed backend # Setup logging - logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) + logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1))) # Set seeds @@ -306,17 +386,23 @@ def main(): args.model_type = key # take the first match in model types break config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name, - output_attentions=True, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + output_attentions=True, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -324,14 +410,14 @@ def main(): # Distributed and parallel training model.to(args.device) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) # Print/save training arguments - torch.save(args, os.path.join(args.output_dir, 'run_args.bin')) + torch.save(args, os.path.join(args.output_dir, "run_args.bin")) logger.info("Training/evaluation parameters %s", args) # Prepare dataset for the GLUE task @@ -341,11 +427,9 @@ def main(): eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) - # Compute head entropy and importance score compute_heads_importance(args, model, eval_dataloader) - # Try head masking (set heads to zero until the score goes under a threshole) # and head pruning (remove masked heads and see the effect on the network) if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0: @@ -353,5 +437,5 @@ def main(): prune_heads(args, model, eval_dataloader, head_mask) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/run_generation.py b/examples/run_generation.py index 536d4a18f0..e62ccf87c6 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -33,9 +33,7 @@ from transformers import XLMWithLMHeadModel, XLMTokenizer logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger = logging.getLogger(__name__) @@ -71,6 +69,7 @@ def set_seed(args): if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) + # # Functions to prepare models' input # @@ -78,15 +77,11 @@ def set_seed(args): def prepare_ctrl_input(args, _, tokenizer, prompt_text): if args.temperature > 0.7: - logger.info( - "CTRL typically works better with lower temperatures (and lower top_k)." - ) + logger.info("CTRL typically works better with lower temperatures (and lower top_k).") encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False) if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()): - logger.info( - "WARNING! You are not starting your generation from a control code so you won't get good results" - ) + logger.info("WARNING! You are not starting your generation from a control code so you won't get good results") return prompt_text @@ -102,11 +97,7 @@ def prepare_xlm_input(args, model, tokenizer, prompt_text): else: language = None while language not in available_languages: - language = input( - "Using XLM. Select language in " - + str(list(available_languages)) - + " >>> " - ) + language = input("Using XLM. Select language in " + str(list(available_languages)) + " >>> ") # kwargs["language"] = tokenizer.lang2id[language] # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers @@ -148,17 +139,34 @@ def adjust_length_to_model(length, max_sequence_length): def main(): parser = argparse.ArgumentParser() - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys())) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) parser.add_argument("--prompt", type=str, default="") parser.add_argument("--length", type=int, default=20) parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") - parser.add_argument("--temperature", type=float, default=1.0, help="temperature of 1.0 has no effect, lower tend toward greedy sampling") - parser.add_argument("--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2") + parser.add_argument( + "--temperature", + type=float, + default=1.0, + help="temperature of 1.0 has no effect, lower tend toward greedy sampling", + ) + parser.add_argument( + "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2" + ) parser.add_argument("--k", type=int, default=0) parser.add_argument("--p", type=float, default=0.9) @@ -169,9 +177,7 @@ def main(): parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") args = parser.parse_args() - args.device = torch.device( - "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu" - ) + args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() set_seed(args) @@ -181,17 +187,13 @@ def main(): args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] except KeyError: - raise KeyError( - "the model {} you specified is not supported. You are welcome to add it and open a PR :)" - ) + raise KeyError("the model {} you specified is not supported. You are welcome to add it and open a PR :)") tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) model = model_class.from_pretrained(args.model_name_or_path) model.to(args.device) - args.length = adjust_length_to_model( - args.length, max_sequence_length=model.config.max_position_embeddings - ) + args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings) logger.info(args) prompt_text = args.prompt if args.prompt else input("Model prompt >>> ") @@ -201,7 +203,7 @@ def main(): if requires_preprocessing: prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type) prompt_text = prepare_input(args, model, tokenizer, prompt_text) - encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors='pt') + encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt") output_sequences = model.generate( input_ids=encoded_prompt, @@ -212,7 +214,7 @@ def main(): repetition_penalty=args.repetition_penalty, ) - # Batch size == 1. to add more examples please use num_return_sequences > 1 + # Batch size == 1. to add more examples please use num_return_sequences > 1 generated_sequence = output_sequences[0].tolist() text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) text = text[: t.find(args.stop_token) if args.stop_token else None] diff --git a/examples/run_glue.py b/examples/run_glue.py index c143b6205b..bbfd52ea3d 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -26,8 +26,7 @@ import json import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -37,25 +36,30 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForSequenceClassification, BertTokenizer, - RobertaConfig, - RobertaForSequenceClassification, - RobertaTokenizer, - XLMConfig, XLMForSequenceClassification, - XLMTokenizer, XLNetConfig, - XLNetForSequenceClassification, - XLNetTokenizer, - DistilBertConfig, - DistilBertForSequenceClassification, - DistilBertTokenizer, - AlbertConfig, - AlbertForSequenceClassification, - AlbertTokenizer, - XLMRobertaConfig, - XLMRobertaForSequenceClassification, - XLMRobertaTokenizer, - ) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForSequenceClassification, + BertTokenizer, + RobertaConfig, + RobertaForSequenceClassification, + RobertaTokenizer, + XLMConfig, + XLMForSequenceClassification, + XLMTokenizer, + XLNetConfig, + XLNetForSequenceClassification, + XLNetTokenizer, + DistilBertConfig, + DistilBertForSequenceClassification, + DistilBertTokenizer, + AlbertConfig, + AlbertForSequenceClassification, + AlbertTokenizer, + XLMRobertaConfig, + XLMRobertaForSequenceClassification, + XLMRobertaTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup @@ -66,17 +70,22 @@ from transformers import glue_convert_examples_to_features as convert_examples_t logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, - RobertaConfig, DistilBertConfig)), ()) +ALL_MODELS = sum( + ( + tuple(conf.pretrained_config_archive_map.keys()) + for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig, DistilBertConfig) + ), + (), +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), - 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), - 'xlmroberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer), + "bert": (BertConfig, BertForSequenceClassification, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), + "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer), + "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), + "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), + "albert": (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), + "xlmroberta": (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer), } @@ -104,20 +113,27 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): + if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( + os.path.join(args.model_name_or_path, "scheduler.pt") + ): # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt'))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt'))) + optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: @@ -132,17 +148,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -152,7 +172,7 @@ def train(args, train_dataset, model, tokenizer): # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path - global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0]) + global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) @@ -163,7 +183,9 @@ def train(args, train_dataset, model, tokenizer): tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + train_iterator = trange( + epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] + ) set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) @@ -176,16 +198,16 @@ def train(args, train_dataset, model, tokenizer): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert", "xlnet"] else None + ) # XLM, DistilBERT and RoBERTa don't use segment_ids outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -209,36 +231,40 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logs = {} - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - eval_key = 'eval_{}'.format(key) + eval_key = "eval_{}".format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] - logs['learning_rate'] = learning_rate_scalar - logs['loss'] = loss_scalar + logs["learning_rate"] = learning_rate_scalar + logs["loss"] = loss_scalar logging_loss = tr_loss for key, value in logs.items(): tb_writer.add_scalar(key, value, global_step) - print(json.dumps({**logs, **{'step': global_step}})) + print(json.dumps({**logs, **{"step": global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) - torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt')) - torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt')) + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -257,7 +283,7 @@ def train(args, train_dataset, model, tokenizer): def evaluate(args, model, tokenizer, prefix=""): # Loop to handle MNLI double evaluation (matched, mis-matched) eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) - eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) + eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,) results = {} for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): @@ -288,11 +314,11 @@ def evaluate(args, model, tokenizer, prefix=""): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert", "xlnet"] else None + ) # XLM, DistilBERT and RoBERTa don't use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -300,10 +326,10 @@ def evaluate(args, model, tokenizer, prefix=""): nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() + out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps if args.output_mode == "classification": @@ -330,29 +356,36 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): processor = processors[task]() output_mode = output_modes[task] # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task))) + cached_features_file = os.path.join( + args.data_dir, + "cached_{}_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + str(task), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", args.data_dir) label_list = processor.get_labels() - if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta', 'xlmroberta']: + if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]: # HACK(label indices are swapped in RoBERTa pretrained model) label_list[1], label_list[2] = label_list[2], label_list[1] - examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = convert_examples_to_features(examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, + examples = ( + processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) + ) + features = convert_examples_to_features( + examples, + tokenizer, + label_list=label_list, + max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) @@ -369,7 +402,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif output_mode == "regression": all_labels = torch.tensor([f.label for f in features], dtype=torch.float) - + dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) return dataset @@ -378,90 +411,149 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -473,16 +565,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -502,17 +602,23 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -521,14 +627,12 @@ def main(): logger.info("Training/evaluation parameters %s", args) - # Training if args.do_train: train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -538,36 +642,39 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index b659e229bf..60b99f29d4 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -42,37 +42,55 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, - BertConfig, BertForMaskedLM, BertTokenizer, - GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, - OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, - RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, - DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer, - CamembertConfig, CamembertForMaskedLM, CamembertTokenizer) +from transformers import ( + WEIGHTS_NAME, + AdamW, + get_linear_schedule_with_warmup, + BertConfig, + BertForMaskedLM, + BertTokenizer, + GPT2Config, + GPT2LMHeadModel, + GPT2Tokenizer, + OpenAIGPTConfig, + OpenAIGPTLMHeadModel, + OpenAIGPTTokenizer, + RobertaConfig, + RobertaForMaskedLM, + RobertaTokenizer, + DistilBertConfig, + DistilBertForMaskedLM, + DistilBertTokenizer, + CamembertConfig, + CamembertForMaskedLM, + CamembertTokenizer, +) logger = logging.getLogger(__name__) MODEL_CLASSES = { - 'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), - 'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), - 'bert': (BertConfig, BertForMaskedLM, BertTokenizer), - 'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer), - 'camembert': (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer) + "gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer), + "openai-gpt": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), + "bert": (BertConfig, BertForMaskedLM, BertTokenizer), + "roberta": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer), + "distilbert": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer), + "camembert": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer), } class TextDataset(Dataset): - def __init__(self, tokenizer, args, file_path='train', block_size=512): + def __init__(self, tokenizer, args, file_path="train", block_size=512): assert os.path.isfile(file_path) directory, filename = os.path.split(file_path) - cached_features_file = os.path.join(directory, args.model_name_or_path + '_cached_lm_' + str(block_size) + '_' + filename) + cached_features_file = os.path.join( + directory, args.model_name_or_path + "_cached_lm_" + str(block_size) + "_" + filename + ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) - with open(cached_features_file, 'rb') as handle: + with open(cached_features_file, "rb") as handle: self.examples = pickle.load(handle) else: logger.info("Creating features from dataset file at %s", directory) @@ -83,14 +101,14 @@ class TextDataset(Dataset): tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)) - for i in range(0, len(tokenized_text)-block_size+1, block_size): # Truncate in block of block_size - self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size])) + for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size + self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])) # Note that we are loosing the last truncated example here for the sake of simplicity (no padding) # If your dataset is small, first you should loook for a bigger one :-) and second you # can change this behavior by adding (model specific) padding. logger.info("Saving features into cached file %s", cached_features_file) - with open(cached_features_file, 'wb') as handle: + with open(cached_features_file, "wb") as handle: pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL) def __len__(self): @@ -101,7 +119,12 @@ class TextDataset(Dataset): def load_and_cache_examples(args, tokenizer, evaluate=False): - dataset = TextDataset(tokenizer, args, file_path=args.eval_data_file if evaluate else args.train_data_file, block_size=args.block_size) + dataset = TextDataset( + tokenizer, + args, + file_path=args.eval_data_file if evaluate else args.train_data_file, + block_size=args.block_size, + ) return dataset @@ -120,7 +143,7 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False): return # Check if we should delete older checkpoint(s) - glob_checkpoints = glob.glob(os.path.join(args.output_dir, '{}-*'.format(checkpoint_prefix))) + glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix))) if len(glob_checkpoints) <= args.save_total_limit: return @@ -129,7 +152,7 @@ def _rotate_checkpoints(args, checkpoint_prefix, use_mtime=False): if use_mtime: ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) else: - regex_match = re.match('.*{}-([0-9]+)'.format(checkpoint_prefix), path) + regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path) if regex_match and regex_match.groups(): ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) @@ -147,7 +170,9 @@ def mask_tokens(inputs, tokenizer, args): labels = inputs.clone() # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) probability_matrix = torch.full(labels.shape, args.mlm_probability) - special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()] + special_tokens_mask = [ + tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -100 # We only compute loss on masked tokens @@ -181,19 +206,26 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): + if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( + os.path.join(args.model_name_or_path, "scheduler.pt") + ): # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt'))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt'))) + optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: @@ -208,17 +240,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -228,7 +264,7 @@ def train(args, train_dataset, model, tokenizer): # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path - global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0]) + global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) @@ -239,16 +275,18 @@ def train(args, train_dataset, model, tokenizer): tr_loss, logging_loss = 0.0, 0.0 - model_to_resize = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() - train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + train_iterator = trange( + epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] + ) set_seed(args) # Added here for reproducibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): - + # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -285,31 +323,35 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - checkpoint_prefix = 'checkpoint' + checkpoint_prefix = "checkpoint" # Save model checkpoint - output_dir = os.path.join(args.output_dir, '{}-{}'.format(checkpoint_prefix, global_step)) + output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) - torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt')) - torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt')) + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -365,9 +407,7 @@ def evaluate(args, model, tokenizer, prefix=""): eval_loss = eval_loss / nb_eval_steps perplexity = torch.exp(torch.tensor(eval_loss)) - result = { - "perplexity": perplexity - } + result = {"perplexity": perplexity} output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt") with open(output_eval_file, "w") as writer: @@ -383,107 +423,167 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--train_data_file", default=None, type=str, required=True, - help="The input training data file (a text file).") - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)." + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--eval_data_file", default=None, type=str, - help="An optional input evaluation data file to evaluate the perplexity on (a text file).") + parser.add_argument( + "--eval_data_file", + default=None, + type=str, + help="An optional input evaluation data file to evaluate the perplexity on (a text file).", + ) - parser.add_argument("--model_type", default="bert", type=str, - help="The model architecture to be fine-tuned.") - parser.add_argument("--model_name_or_path", default="bert-base-cased", type=str, - help="The model checkpoint for weights initialization.") + parser.add_argument("--model_type", default="bert", type=str, help="The model architecture to be fine-tuned.") + parser.add_argument( + "--model_name_or_path", + default="bert-base-cased", + type=str, + help="The model checkpoint for weights initialization.", + ) - parser.add_argument("--mlm", action='store_true', - help="Train with masked-language modeling loss instead of language modeling.") - parser.add_argument("--mlm_probability", type=float, default=0.15, - help="Ratio of tokens to mask for masked language modeling loss") + parser.add_argument( + "--mlm", action="store_true", help="Train with masked-language modeling loss instead of language modeling." + ) + parser.add_argument( + "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss" + ) - parser.add_argument("--config_name", default="", type=str, - help="Optional pretrained config name or path if not the same as model_name_or_path") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Optional pretrained tokenizer name or path if not the same as model_name_or_path") - parser.add_argument("--cache_dir", default="", type=str, - help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)") - parser.add_argument("--block_size", default=-1, type=int, - help="Optional input sequence length after tokenization." - "The training dataset will be truncated in block of this size for training." - "Default to the model max input length for single sentence inputs (take into account special tokens).") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Run evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--config_name", + default="", + type=str, + help="Optional pretrained config name or path if not the same as model_name_or_path", + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Optional pretrained tokenizer name or path if not the same as model_name_or_path", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)", + ) + parser.add_argument( + "--block_size", + default=-1, + type=int, + help="Optional input sequence length after tokenization." + "The training dataset will be truncated in block of this size for training." + "Default to the model max input length for single sentence inputs (take into account special tokens).", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=1.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") + parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument('--save_total_limit', type=int, default=None, - help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default') - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--save_total_limit", + type=int, + default=None, + help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default", + ) + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if args.model_type in ["bert", "roberta", "distilbert", "camembert"] and not args.mlm: - raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm " - "flag (masked language modeling).") + raise ValueError( + "BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm " + "flag (masked language modeling)." + ) if args.eval_data_file is None and args.do_eval: - raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " - "or remove the --do_eval argument.") + raise ValueError( + "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " + "or remove the --do_eval argument." + ) - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -495,16 +595,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -514,18 +622,26 @@ def main(): torch.distributed.barrier() # Barrier to make sure only the first process in distributed training download model & vocab config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.block_size <= 0: - args.block_size = tokenizer.max_len_single_sentence # Our input block size will be the max possible for the model + args.block_size = ( + tokenizer.max_len_single_sentence + ) # Our input block size will be the max possible for the model args.block_size = min(args.block_size, tokenizer.max_len_single_sentence) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) model.to(args.device) if args.local_rank == 0: @@ -546,7 +662,6 @@ def main(): global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -556,35 +671,38 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 9d1ca7f300..bfa62cfb7f 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -26,8 +26,7 @@ import random import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -37,34 +36,38 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForMultipleChoice, BertTokenizer, - XLNetConfig, XLNetForMultipleChoice, - XLNetTokenizer, RobertaConfig, - RobertaForMultipleChoice, RobertaTokenizer) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForMultipleChoice, + BertTokenizer, + XLNetConfig, + XLNetForMultipleChoice, + XLNetTokenizer, + RobertaConfig, + RobertaForMultipleChoice, + RobertaTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup -from utils_multiple_choice import (convert_examples_to_features, processors) +from utils_multiple_choice import convert_examples_to_features, processors logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ()) +ALL_MODELS = sum( + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), () +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForMultipleChoice, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer), - 'roberta': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer) + "bert": (BertConfig, BertForMultipleChoice, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer), + "roberta": (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer), } + def select_field(features, field): - return [ - [ - choice[field] - for choice in feature.choices_features - ] - for feature in features - ] + return [[choice[field] for choice in feature.choices_features] for feature in features] def simple_accuracy(preds, labels): @@ -95,13 +98,18 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -115,17 +123,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -141,15 +153,19 @@ def train(args, train_dataset, model, tokenizer): for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids - 'labels': batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": batch[2] + if args.model_type in ["bert", "xlnet"] + else None, # XLM don't use segment_ids + "labels": batch[3], + } outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -171,10 +187,12 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) if results["eval_acc"] > best_dev_acc: best_dev_acc = results["eval_acc"] best_dev_loss = results["eval_loss"] @@ -182,22 +200,33 @@ def train(args, train_dataset, model, tokenizer): if args.do_test: results_test = evaluate(args, model, tokenizer, test=True) for key, value in results_test.items(): - tb_writer.add_scalar('test_{}'.format(key), value, global_step) - logger.info("test acc: %s, loss: %s, global steps: %s", str(results_test['eval_acc']), str(results_test['eval_loss']), str(global_step)) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) - logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss)/args.logging_steps), str(global_step)) + tb_writer.add_scalar("test_{}".format(key), value, global_step) + logger.info( + "test acc: %s, loss: %s, global steps: %s", + str(results_test["eval_acc"]), + str(results_test["eval_loss"]), + str(global_step), + ) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) + logger.info( + "Average loss: %s at global step: %s", + str((tr_loss - logging_loss) / args.logging_steps), + str(global_step), + ) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_vocabulary(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -246,10 +275,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids - 'labels': batch[3]} + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": batch[2] + if args.model_type in ["bert", "xlnet"] + else None, # XLM don't use segment_ids + "labels": batch[3], + } outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -257,10 +290,10 @@ def evaluate(args, model, tokenizer, prefix="", test=False): nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() + out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = np.argmax(preds, axis=1) @@ -273,8 +306,14 @@ def evaluate(args, model, tokenizer, prefix="", test=False): with open(output_eval_file, "w") as writer: logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test))) writer.write("model =%s\n" % str(args.model_name_or_path)) - writer.write("total batch size=%d\n" % (args.per_gpu_train_batch_size * args.gradient_accumulation_steps * - (torch.distributed.get_world_size() if args.local_rank != -1 else 1))) + writer.write( + "total batch size=%d\n" + % ( + args.per_gpu_train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1) + ) + ) writer.write("train num epochs=%d\n" % args.num_train_epochs) writer.write("fp16 =%s\n" % args.fp16) writer.write("max seq length =%d\n" % args.max_seq_length) @@ -291,17 +330,21 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): processor = processors[task]() # Load data features from cache or dataset file if evaluate: - cached_mode = 'dev' + cached_mode = "dev" elif test: - cached_mode = 'test' + cached_mode = "test" else: - cached_mode = 'train' + cached_mode = "train" assert (evaluate == True and test == True) == False - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format( - cached_mode, - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task))) + cached_features_file = os.path.join( + args.data_dir, + "cached_{}_{}_{}_{}".format( + cached_mode, + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + str(task), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) @@ -320,8 +363,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): label_list, args.max_seq_length, tokenizer, - pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet - pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0 + pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet + pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) @@ -331,9 +374,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Convert to Tensors and build dataset - all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long) - all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long) - all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long) + all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long) + all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long) + all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long) all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long) dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) @@ -344,91 +387,150 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--task_name", default=None, type=str, required=True, - help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train selected in the list: " + ", ".join(processors.keys()), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set') - parser.add_argument("--evaluate_during_training", action='store_true', - help="Run evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument("--do_test", action="store_true", help="Whether to run test on the test set") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -440,16 +542,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -468,17 +578,23 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -494,7 +610,6 @@ def main(): global_step, tr_loss, best_steps = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -504,19 +619,20 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: @@ -524,17 +640,19 @@ def main(): args.output_dir = args.model_name_or_path checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) if args.do_test and args.local_rank in [-1, 0]: @@ -546,13 +664,13 @@ def main(): # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix, test=True) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) if best_steps: logger.info("best steps of eval acc is the following checkpoints: %s", best_steps) diff --git a/examples/run_ner.py b/examples/run_ner.py index 0fdaacf2aa..48ac61b4fe 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -43,9 +43,12 @@ from transformers import XLMRobertaConfig, XLMRobertaForTokenClassification, XLM logger = logging.getLogger(__name__) ALL_MODELS = sum( - (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig, - CamembertConfig, XLMRobertaConfig)), - ()) + ( + tuple(conf.pretrained_config_archive_map.keys()) + for conf in (BertConfig, RobertaConfig, DistilBertConfig, CamembertConfig, XLMRobertaConfig) + ), + (), +) MODEL_CLASSES = { "bert": (BertConfig, BertForTokenClassification, BertTokenizer), @@ -82,18 +85,24 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], - "weight_decay": args.weight_decay}, - {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0} + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): + if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( + os.path.join(args.model_name_or_path, "scheduler.pt") + ): # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt'))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt'))) + optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: @@ -108,18 +117,21 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * ( - torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -129,7 +141,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path - global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0]) + global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) @@ -140,7 +152,9 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + train_iterator = trange( + epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] + ) set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) @@ -153,11 +167,11 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {"input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3]} + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} if args.model_type != "distilbert": - inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert", "xlnet"] else None + ) # XLM and RoBERTa don"t use segment_ids outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) @@ -187,7 +201,9 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev") for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) @@ -200,15 +216,17 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id): output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) - torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt')) - torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt')) + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -249,11 +267,11 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix="" batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {"input_ids": batch[0], - "attention_mask": batch[1], - "labels": batch[3]} + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} if args.model_type != "distilbert": - inputs["token_type_ids"] = batch[2] if args.model_type in ["bert", "xlnet"] else None # XLM and RoBERTa don"t use segment_ids + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert", "xlnet"] else None + ) # XLM and RoBERTa don"t use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -287,7 +305,7 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix="" "loss": eval_loss, "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), - "f1": f1_score(out_label_list, preds_list) + "f1": f1_score(out_label_list, preds_list), } logger.info("***** Eval results %s *****", prefix) @@ -302,29 +320,36 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode): torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode, - list(filter(None, args.model_name_or_path.split("/"))).pop(), - str(args.max_seq_length))) + cached_features_file = os.path.join( + args.data_dir, + "cached_{}_{}_{}".format( + mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length) + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", args.data_dir) examples = read_examples_from_file(args.data_dir, mode) - features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer, - cls_token_at_end=bool(args.model_type in ["xlnet"]), - # xlnet has a cls token at the end - cls_token=tokenizer.cls_token, - cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0, - sep_token=tokenizer.sep_token, - sep_token_extra=bool(args.model_type in ["roberta"]), - # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 - pad_on_left=bool(args.model_type in ["xlnet"]), - # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, - pad_token_label_id=pad_token_label_id - ) + features = convert_examples_to_features( + examples, + labels, + args.max_seq_length, + tokenizer, + cls_token_at_end=bool(args.model_type in ["xlnet"]), + # xlnet has a cls token at the end + cls_token=tokenizer.cls_token, + cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0, + sep_token=tokenizer.sep_token, + sep_token_extra=bool(args.model_type in ["roberta"]), + # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 + pad_on_left=bool(args.model_type in ["xlnet"]), + # pad on the left for xlnet + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token_label_id=pad_token_label_id, + ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) @@ -346,95 +371,151 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--labels", default="", type=str, - help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.") - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--do_train", action="store_true", - help="Whether to run training.") - parser.add_argument("--do_eval", action="store_true", - help="Whether to run eval on the dev set.") - parser.add_argument("--do_predict", action="store_true", - help="Whether to run predictions on the test set.") - parser.add_argument("--evaluate_during_training", action="store_true", - help="Whether to run evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action="store_true", - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--labels", + default="", + type=str, + help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.", + ) + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") + parser.add_argument( + "--evaluate_during_training", + action="store_true", + help="Whether to run evaluation during training at each logging step.", + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--gradient_accumulation_steps", type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument("--logging_steps", type=int, default=50, - help="Log every X updates steps.") - parser.add_argument("--save_steps", type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action="store_true", - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action="store_true", - help="Avoid using CUDA when available") - parser.add_argument("--overwrite_output_dir", action="store_true", - help="Overwrite the content of the output directory") - parser.add_argument("--overwrite_cache", action="store_true", - help="Overwrite the cached training and evaluation sets") - parser.add_argument("--seed", type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument("--fp16", action="store_true", - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument("--fp16_opt_level", type=str, default="O1", - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir( - args.output_dir) and args.do_train and not args.overwrite_output_dir: + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( - args.output_dir)) + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -451,11 +532,19 @@ def main(): args.device = device # Setup logging - logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -472,16 +561,22 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool(".ckpt" in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -505,7 +600,9 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) @@ -518,7 +615,9 @@ def main(): tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: @@ -565,4 +664,3 @@ def main(): if __name__ == "__main__": main() - diff --git a/examples/run_squad.py b/examples/run_squad.py index 18a5a1c23f..1580a31e85 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -17,7 +17,11 @@ from __future__ import absolute_import, division, print_function from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult -from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate +from transformers.data.metrics.squad_metrics import ( + compute_predictions_logits, + compute_predictions_log_probs, + squad_evaluate, +) import argparse import logging @@ -27,8 +31,7 @@ import glob import timeit import numpy as np import torch -from torch.utils.data import ( - DataLoader, RandomSampler, SequentialSampler, TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -38,32 +41,47 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForQuestionAnswering, BertTokenizer, - RobertaForQuestionAnswering, RobertaTokenizer, RobertaConfig, - XLMConfig, XLMForQuestionAnswering, - XLMTokenizer, XLNetConfig, - XLNetForQuestionAnswering, - XLNetTokenizer, - DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer, - AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer, - XLMConfig, XLMForQuestionAnswering, XLMTokenizer, - ) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForQuestionAnswering, + BertTokenizer, + RobertaForQuestionAnswering, + RobertaTokenizer, + RobertaConfig, + XLMConfig, + XLMForQuestionAnswering, + XLMTokenizer, + XLNetConfig, + XLNetForQuestionAnswering, + XLNetTokenizer, + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, + AlbertConfig, + AlbertForQuestionAnswering, + AlbertTokenizer, + XLMConfig, + XLMForQuestionAnswering, + XLMTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ - for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)), ()) +ALL_MODELS = sum( + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)), + (), +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), - 'roberta': (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer), - 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), + "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer), + "roberta": (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer), + "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), + "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), + "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), + "albert": (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), } @@ -85,49 +103,44 @@ def train(args, train_dataset, model, tokenizer): tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) - train_sampler = RandomSampler( - train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps - args.num_train_epochs = args.max_steps // ( - len(train_dataloader) // args.gradient_accumulation_steps) + 1 + args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: - t_total = len( - train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any( - nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any( - nd in n for nd in no_decay)], 'weight_decay': 0.0} + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] - optimizer = AdamW(optimizer_grouped_parameters, - lr=args.learning_rate, eps=args.adam_epsilon) + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( - optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): + if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( + os.path.join(args.model_name_or_path, "scheduler.pt") + ): # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load( - os.path.join(args.model_name_or_path, 'optimizer.pt'))) - scheduler.load_state_dict(torch.load( - os.path.join(args.model_name_or_path, 'scheduler.pt'))) + optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - model, optimizer = amp.initialize( - model, optimizer, opt_level=args.fp16_opt_level) + model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: @@ -135,20 +148,22 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) - logger.info(" Instantaneous batch size per GPU = %d", - args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) - logger.info(" Gradient Accumulation steps = %d", - args.gradient_accumulation_steps) + logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) + logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 1 @@ -157,29 +172,25 @@ def train(args, train_dataset, model, tokenizer): # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path - global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0]) - epochs_trained = global_step // (len(train_dataloader) // - args.gradient_accumulation_steps) - steps_trained_in_current_epoch = global_step % ( - len(train_dataloader) // args.gradient_accumulation_steps) + global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) + epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) + steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) - logger.info( - " Continuing training from checkpoint, will skip to saved global_step") + logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) - logger.info(" Will skip the first %d steps in the first epoch", - steps_trained_in_current_epoch) + logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(epochs_trained, int( - args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + train_iterator = trange( + epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] + ) # Added here for reproductibility (even between python 2 and 3) set_seed(args) for _ in train_iterator: - epoch_iterator = tqdm(train_dataloader, desc="Iteration", - disable=args.local_rank not in [-1, 0]) + epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training @@ -191,18 +202,17 @@ def train(args, train_dataset, model, tokenizer): batch = tuple(t.to(args.device) for t in batch) inputs = { - 'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2], - 'start_positions': batch[3], - 'end_positions': batch[4], + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": None if args.model_type in ["xlm", "roberta", "distilbert"] else batch[2], + "start_positions": batch[3], + "end_positions": batch[4], } - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[5], - 'p_mask': batch[6]}) + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: - inputs.update({'is_impossible': batch[7]}) + inputs.update({"is_impossible": batch[7]}) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] @@ -221,11 +231,9 @@ def train(args, train_dataset, model, tokenizer): tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: - torch.nn.utils.clip_grad_norm_( - amp.master_params(optimizer), args.max_grad_norm) + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: - torch.nn.utils.clip_grad_norm_( - model.parameters(), args.max_grad_norm) + torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule @@ -238,36 +246,27 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank == -1 and args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar( - 'eval_{}'.format(key), value, global_step) - tb_writer.add_scalar( - 'lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar( - 'loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - output_dir = os.path.join( - args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training - model_to_save = model.module if hasattr( - model, 'module') else model + model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - torch.save(args, os.path.join( - output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) - torch.save(optimizer.state_dict(), os.path.join( - output_dir, 'optimizer.pt')) - torch.save(scheduler.state_dict(), os.path.join( - output_dir, 'scheduler.pt')) - logger.info( - "Saving optimizer and scheduler states to %s", output_dir) + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) + logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() @@ -283,8 +282,7 @@ def train(args, train_dataset, model, tokenizer): def evaluate(args, model, tokenizer, prefix=""): - dataset, examples, features = load_and_cache_examples( - args, tokenizer, evaluate=True, output_examples=True) + dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) @@ -293,8 +291,7 @@ def evaluate(args, model, tokenizer, prefix=""): # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) - eval_dataloader = DataLoader( - dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) + eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): @@ -314,15 +311,15 @@ def evaluate(args, model, tokenizer, prefix=""): with torch.no_grad(): inputs = { - 'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2], + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": None if args.model_type in ["xlm", "roberta", "distilbert"] else batch[2], } example_indices = batch[3] # XLNet and XLM use more arguments for their predictions - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[4], 'p_mask': batch[5]}) + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) outputs = model(**inputs) @@ -342,53 +339,68 @@ def evaluate(args, model, tokenizer, prefix=""): cls_logits = output[4] result = SquadResult( - unique_id, start_logits, end_logits, + unique_id, + start_logits, + end_logits, start_top_index=start_top_index, end_top_index=end_top_index, - cls_logits=cls_logits + cls_logits=cls_logits, ) else: start_logits, end_logits = output - result = SquadResult( - unique_id, start_logits, end_logits - ) + result = SquadResult(unique_id, start_logits, end_logits) all_results.append(result) evalTime = timeit.default_timer() - start_time - logger.info(" Evaluation done in total %f secs (%f sec per example)", - evalTime, evalTime / len(dataset)) + logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset)) # Compute predictions - output_prediction_file = os.path.join( - args.output_dir, "predictions_{}.json".format(prefix)) - output_nbest_file = os.path.join( - args.output_dir, "nbest_predictions_{}.json".format(prefix)) + output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) + output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) if args.version_2_with_negative: - output_null_log_odds_file = os.path.join( - args.output_dir, "null_odds_{}.json".format(prefix)) + output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None # XLNet and XLM use a more complex post-processing procedure - if args.model_type in ['xlnet', 'xlm']: - start_n_top = model.config.start_n_top if hasattr( - model, "config") else model.module.config.start_n_top - end_n_top = model.config.end_n_top if hasattr( - model, "config") else model.module.config.end_n_top + if args.model_type in ["xlnet", "xlm"]: + start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top + end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top - predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size, - args.max_answer_length, output_prediction_file, - output_nbest_file, output_null_log_odds_file, - start_n_top, end_n_top, - args.version_2_with_negative, tokenizer, args.verbose_logging) + predictions = compute_predictions_log_probs( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + start_n_top, + end_n_top, + args.version_2_with_negative, + tokenizer, + args.verbose_logging, + ) else: - predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size, - args.max_answer_length, args.do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold, tokenizer) + predictions = compute_predictions_logits( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + args.do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + args.verbose_logging, + args.version_2_with_negative, + args.null_score_diff_threshold, + tokenizer, + ) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) @@ -402,16 +414,18 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal # Load data features from cache or dataset file input_dir = args.data_dir if args.data_dir else "." - cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length)) + cached_features_file = os.path.join( + input_dir, + "cached_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + ), ) # Init features and dataset from cache if it exists if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: - logger.info("Loading features from cached file %s", - cached_features_file) + logger.info("Loading features from cached file %s", cached_features_file) features_and_dataset = torch.load(cached_features_file) features, dataset = features_and_dataset["features"], features_and_dataset["dataset"] else: @@ -421,16 +435,13 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal try: import tensorflow_datasets as tfds except ImportError: - raise ImportError( - "If not data_dir is specified, tensorflow_datasets needs to be installed.") + raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.") if args.version_2_with_negative: - logger.warn( - "tensorflow_datasets does not handle version 2 of SQuAD.") + logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.") tfds_examples = tfds.load("squad") - examples = SquadV1Processor().get_examples_from_dataset( - tfds_examples, evaluate=evaluate) + examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) else: processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() if evaluate: @@ -445,15 +456,13 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, - return_dataset='pt', + return_dataset="pt", threads=args.threads, ) if args.local_rank in [-1, 0]: - logger.info("Saving features into cached file %s", - cached_features_file) - torch.save({"features": features, "dataset": dataset}, - cached_features_file) + logger.info("Saving features into cached file %s", cached_features_file) + torch.save({"features": features, "dataset": dataset}, cached_features_file) if args.local_rank == 0 and not evaluate: # Make sure only the first process in distributed training process the dataset, and the others will use the cache @@ -468,140 +477,232 @@ def main(): parser = argparse.ArgumentParser() # Required parameters - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model checkpoints and predictions will be written.") + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints and predictions will be written.", + ) # Other parameters - parser.add_argument("--data_dir", default=None, type=str, - help="The input data dir. Should contain the .json files for the task." + - "If no data dir or train/predict files are specified, will run with tensorflow_datasets.") - parser.add_argument("--train_file", default=None, type=str, - help="The input training file. If a data dir is specified, will look for the file there" + - "If no data dir or train/predict files are specified, will run with tensorflow_datasets.") - parser.add_argument("--predict_file", default=None, type=str, - help="The input evaluation file. If a data dir is specified, will look for the file there" + - "If no data dir or train/predict files are specified, will run with tensorflow_datasets.") - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") + parser.add_argument( + "--data_dir", + default=None, + type=str, + help="The input data dir. Should contain the .json files for the task." + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", + ) + parser.add_argument( + "--train_file", + default=None, + type=str, + help="The input training file. If a data dir is specified, will look for the file there" + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", + ) + parser.add_argument( + "--predict_file", + default=None, + type=str, + help="The input evaluation file. If a data dir is specified, will look for the file there" + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", + ) + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) - parser.add_argument('--version_2_with_negative', action='store_true', - help='If true, the SQuAD examples contain some that do not have an answer.') - parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, - help="If null_score - best_non_null is greater than the threshold predict null.") + parser.add_argument( + "--version_2_with_negative", + action="store_true", + help="If true, the SQuAD examples contain some that do not have an answer.", + ) + parser.add_argument( + "--null_score_diff_threshold", + type=float, + default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.", + ) - parser.add_argument("--max_seq_length", default=384, type=int, - help="The maximum total input sequence length after WordPiece tokenization. Sequences " - "longer than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--doc_stride", default=128, type=int, - help="When splitting up a long document into chunks, how much stride to take between chunks.") - parser.add_argument("--max_query_length", default=64, type=int, - help="The maximum number of tokens for the question. Questions longer than this will " - "be truncated to this length.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--max_seq_length", + default=384, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.", + ) + parser.add_argument( + "--doc_stride", + default=128, + type=int, + help="When splitting up a long document into chunks, how much stride to take between chunks.", + ) + parser.add_argument( + "--max_query_length", + default=64, + type=int, + help="The maximum number of tokens for the question. Questions longer than this will " + "be truncated to this length.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - parser.add_argument("--n_best_size", default=20, type=int, - help="The total number of n-best predictions to generate in the nbest_predictions.json output file.") - parser.add_argument("--max_answer_length", default=30, type=int, - help="The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another.") - parser.add_argument("--verbose_logging", action='store_true', - help="If true, all of the warnings related to data processing will be printed. " - "A number of warnings are expected for a normal SQuAD evaluation.") + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + parser.add_argument( + "--n_best_size", + default=20, + type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json output file.", + ) + parser.add_argument( + "--max_answer_length", + default=30, + type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.", + ) + parser.add_argument( + "--verbose_logging", + action="store_true", + help="If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.", + ) - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument("--local_rank", type=int, default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") - parser.add_argument('--threads', type=int, default=1, help='multiple threads for converting example to features') + parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): raise ValueError( - "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") - ptvsd.enable_attach( - address=(args.server_ip, args.server_port), redirect_output=True) + ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: - device = torch.device( - "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -613,16 +714,21 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool( - '.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: # Make sure only the first process in distributed training will download model & vocab @@ -638,18 +744,16 @@ def main(): if args.fp16: try: import apex - apex.amp.register_half_function(torch, 'einsum') + + apex.amp.register_half_function(torch, "einsum") except ImportError: - raise ImportError( - "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") # Training if args.do_train: - train_dataset = load_and_cache_examples( - args, tokenizer, evaluate=False, output_examples=False) + train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) - logger.info(" global_step = %s, average loss = %s", - global_step, tr_loss) + logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Save the trained model and the tokenizer if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): @@ -661,18 +765,16 @@ def main(): # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` # Take care of distributed/parallel training - model_to_save = model.module if hasattr(model, 'module') else model + model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned - model = model_class.from_pretrained( - args.output_dir, force_download=True) - tokenizer = tokenizer_class.from_pretrained( - args.output_dir, do_lower_case=args.do_lower_case) + model = model_class.from_pretrained(args.output_dir, force_download=True) + tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory @@ -682,7 +784,10 @@ def main(): logger.info("Loading checkpoints saved during training for evaluation") checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) + for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs else: logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path) @@ -692,17 +797,14 @@ def main(): for checkpoint in checkpoints: # Reload the model - global_step = checkpoint.split( - '-')[-1] if len(checkpoints) > 1 else "" - model = model_class.from_pretrained( - checkpoint, force_download=True) + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + model = model_class.from_pretrained(checkpoint, force_download=True) model.to(args.device) # Evaluate result = evaluate(args, model, tokenizer, prefix=global_step) - result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) - for k, v in result.items()) + result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items()) results.update(result) logger.info("Results: {}".format(results)) diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py index 54282277d2..74a6db34ad 100644 --- a/examples/run_tf_glue.py +++ b/examples/run_tf_glue.py @@ -1,7 +1,14 @@ import os import tensorflow as tf import tensorflow_datasets -from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors +from transformers import ( + BertTokenizer, + TFBertForSequenceClassification, + BertConfig, + glue_convert_examples_to_features, + BertForSequenceClassification, + glue_processors, +) # script parameters BATCH_SIZE = 32 @@ -16,7 +23,7 @@ if TASK == "sst-2": TFDS_TASK = "sst2" elif TASK == "sts-b": TFDS_TASK = "stsb" -else: +else: TFDS_TASK = TASK num_labels = len(glue_processors[TASK]().get_labels()) @@ -27,29 +34,29 @@ tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP}) # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression) config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels) -tokenizer = BertTokenizer.from_pretrained('bert-base-cased') -model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config) +tokenizer = BertTokenizer.from_pretrained("bert-base-cased") +model = TFBertForSequenceClassification.from_pretrained("bert-base-cased", config=config) # Load dataset via TensorFlow Datasets -data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True) -train_examples = info.splits['train'].num_examples +data, info = tensorflow_datasets.load(f"glue/{TFDS_TASK}", with_info=True) +train_examples = info.splits["train"].num_examples # MNLI expects either validation_matched or validation_mismatched -valid_examples = info.splits['validation'].num_examples +valid_examples = info.splits["validation"].num_examples # Prepare dataset for GLUE as a tf.data.Dataset instance -train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK) +train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, 128, TASK) # MNLI expects either validation_matched or validation_mismatched -valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK) +valid_dataset = glue_convert_examples_to_features(data["validation"], tokenizer, 128, TASK) train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1) valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE) -# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule +# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08) if USE_AMP: # loss scaling is currently required when using mixed precision - opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') + opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic") if num_labels == 1: @@ -57,37 +64,42 @@ if num_labels == 1: else: loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) -metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') +metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") model.compile(optimizer=opt, loss=loss, metrics=[metric]) # Train and evaluate using tf.keras.Model.fit() -train_steps = train_examples//BATCH_SIZE -valid_steps = valid_examples//EVAL_BATCH_SIZE +train_steps = train_examples // BATCH_SIZE +valid_steps = valid_examples // EVAL_BATCH_SIZE -history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps, - validation_data=valid_dataset, validation_steps=valid_steps) +history = model.fit( + train_dataset, + epochs=EPOCHS, + steps_per_epoch=train_steps, + validation_data=valid_dataset, + validation_steps=valid_steps, +) # Save TF2 model -os.makedirs('./save/', exist_ok=True) -model.save_pretrained('./save/') +os.makedirs("./save/", exist_ok=True) +model.save_pretrained("./save/") if TASK == "mrpc": # Load the TensorFlow model in PyTorch for inspection - # This is to demo the interoperability between the two frameworks, you don't have to + # This is to demo the interoperability between the two frameworks, you don't have to # do this in real life (you can run the inference on the TF model). - pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True) + pytorch_model = BertForSequenceClassification.from_pretrained("./save/", from_tf=True) # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task - sentence_0 = 'This research was consistent with his findings.' - sentence_1 = 'His findings were compatible with this research.' - sentence_2 = 'His findings were not compatible with this research.' - inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') - inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') + sentence_0 = "This research was consistent with his findings." + sentence_1 = "His findings were compatible with this research." + sentence_2 = "His findings were not compatible with this research." + inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors="pt") + inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors="pt") del inputs_1["special_tokens_mask"] del inputs_2["special_tokens_mask"] pred_1 = pytorch_model(**inputs_1)[0].argmax().item() pred_2 = pytorch_model(**inputs_2)[0].argmax().item() - print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0') - print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0') + print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0") + print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0") diff --git a/examples/run_tf_ner.py b/examples/run_tf_ner.py index eb284f4c2a..77850d1ab5 100644 --- a/examples/run_tf_ner.py +++ b/examples/run_tf_ner.py @@ -21,189 +21,156 @@ from absl import app ALL_MODELS = sum( - (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), - ()) + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), () +) MODEL_CLASSES = { "bert": (BertConfig, TFBertForTokenClassification, BertTokenizer), "roberta": (RobertaConfig, TFRobertaForTokenClassification, RobertaTokenizer), - "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer) + "distilbert": (DistilBertConfig, TFDistilBertForTokenClassification, DistilBertTokenizer), } flags.DEFINE_string( - "data_dir", None, - "The input data dir. Should contain the .conll files (or other data files) " - "for the task.") + "data_dir", None, "The input data dir. Should contain the .conll files (or other data files) " "for the task." +) + +flags.DEFINE_string("model_type", None, "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) flags.DEFINE_string( - "model_type", None, - "Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) + "model_name_or_path", + None, + "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), +) + +flags.DEFINE_string("output_dir", None, "The output directory where the model checkpoints will be written.") flags.DEFINE_string( - "model_name_or_path", None, - "Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) + "labels", "", "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used." +) -flags.DEFINE_string( - "output_dir", None, - "The output directory where the model checkpoints will be written.") +flags.DEFINE_string("config_name", "", "Pretrained config name or path if not the same as model_name") -flags.DEFINE_string( - "labels", "", - "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.") +flags.DEFINE_string("tokenizer_name", "", "Pretrained tokenizer name or path if not the same as model_name") -flags.DEFINE_string( - "config_name", "", - "Pretrained config name or path if not the same as model_name") - -flags.DEFINE_string( - "tokenizer_name", "", - "Pretrained tokenizer name or path if not the same as model_name") - -flags.DEFINE_string( - "cache_dir", "", - "Where do you want to store the pre-trained models downloaded from s3") +flags.DEFINE_string("cache_dir", "", "Where do you want to store the pre-trained models downloaded from s3") flags.DEFINE_integer( - "max_seq_length", 128, + "max_seq_length", + 128, "The maximum total input sentence length after tokenization. " "Sequences longer than this will be truncated, sequences shorter " - "will be padded.") + "will be padded.", +) flags.DEFINE_string( - "tpu", None, + "tpu", + None, "The Cloud TPU to use for training. This should be either the name " "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " - "url.") + "url.", +) -flags.DEFINE_integer( - "num_tpu_cores", 8, - "Total number of TPU cores to use.") +flags.DEFINE_integer("num_tpu_cores", 8, "Total number of TPU cores to use.") + +flags.DEFINE_boolean("do_train", False, "Whether to run training.") + +flags.DEFINE_boolean("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_boolean("do_predict", False, "Whether to run predictions on the test set.") flags.DEFINE_boolean( - "do_train", False, - "Whether to run training.") + "evaluate_during_training", False, "Whether to run evaluation during training at each logging step." +) -flags.DEFINE_boolean( - "do_eval", False, - "Whether to run eval on the dev set.") +flags.DEFINE_boolean("do_lower_case", False, "Set this flag if you are using an uncased model.") -flags.DEFINE_boolean( - "do_predict", False, - "Whether to run predictions on the test set.") +flags.DEFINE_integer("per_device_train_batch_size", 8, "Batch size per GPU/CPU/TPU for training.") -flags.DEFINE_boolean( - "evaluate_during_training", False, - "Whether to run evaluation during training at each logging step.") - -flags.DEFINE_boolean( - "do_lower_case", False, - "Set this flag if you are using an uncased model.") +flags.DEFINE_integer("per_device_eval_batch_size", 8, "Batch size per GPU/CPU/TPU for evaluation.") flags.DEFINE_integer( - "per_device_train_batch_size", 8, - "Batch size per GPU/CPU/TPU for training.") + "gradient_accumulation_steps", 1, "Number of updates steps to accumulate before performing a backward/update pass." +) + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("weight_decay", 0.0, "Weight decay if we apply some.") + +flags.DEFINE_float("adam_epsilon", 1e-8, "Epsilon for Adam optimizer.") + +flags.DEFINE_float("max_grad_norm", 1.0, "Max gradient norm.") + +flags.DEFINE_integer("num_train_epochs", 3, "Total number of training epochs to perform.") flags.DEFINE_integer( - "per_device_eval_batch_size", 8, - "Batch size per GPU/CPU/TPU for evaluation.") + "max_steps", -1, "If > 0: set total number of training steps to perform. Override num_train_epochs." +) -flags.DEFINE_integer( - "gradient_accumulation_steps", 1, - "Number of updates steps to accumulate before performing a backward/update pass.") +flags.DEFINE_integer("warmup_steps", 0, "Linear warmup over warmup_steps.") -flags.DEFINE_float( - "learning_rate", 5e-5, - "The initial learning rate for Adam.") +flags.DEFINE_integer("logging_steps", 50, "Log every X updates steps.") -flags.DEFINE_float( - "weight_decay", 0.0, - "Weight decay if we apply some.") - -flags.DEFINE_float( - "adam_epsilon", 1e-8, - "Epsilon for Adam optimizer.") - -flags.DEFINE_float( - "max_grad_norm", 1.0, - "Max gradient norm.") - -flags.DEFINE_integer( - "num_train_epochs", 3, - "Total number of training epochs to perform.") - -flags.DEFINE_integer( - "max_steps", -1, - "If > 0: set total number of training steps to perform. Override num_train_epochs.") - -flags.DEFINE_integer( - "warmup_steps", 0, - "Linear warmup over warmup_steps.") - -flags.DEFINE_integer( - "logging_steps", 50, - "Log every X updates steps.") - -flags.DEFINE_integer( - "save_steps", 50, - "Save checkpoint every X updates steps.") +flags.DEFINE_integer("save_steps", 50, "Save checkpoint every X updates steps.") flags.DEFINE_boolean( - "eval_all_checkpoints", False, - "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") + "eval_all_checkpoints", + False, + "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", +) -flags.DEFINE_boolean( - "no_cuda", False, - "Avoid using CUDA when available") +flags.DEFINE_boolean("no_cuda", False, "Avoid using CUDA when available") -flags.DEFINE_boolean( - "overwrite_output_dir", False, - "Overwrite the content of the output directory") +flags.DEFINE_boolean("overwrite_output_dir", False, "Overwrite the content of the output directory") -flags.DEFINE_boolean( - "overwrite_cache", False, - "Overwrite the cached training and evaluation sets") +flags.DEFINE_boolean("overwrite_cache", False, "Overwrite the cached training and evaluation sets") -flags.DEFINE_integer( - "seed", 42, - "random seed for initialization") +flags.DEFINE_integer("seed", 42, "random seed for initialization") -flags.DEFINE_boolean( - "fp16", False, - "Whether to use 16-bit (mixed) precision instead of 32-bit") +flags.DEFINE_boolean("fp16", False, "Whether to use 16-bit (mixed) precision instead of 32-bit") flags.DEFINE_string( - "gpus", "0", + "gpus", + "0", "Comma separated list of gpus devices. If only one, switch to single " - "gpu strategy, if None takes all the gpus available.") + "gpu strategy, if None takes all the gpus available.", +) -def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id): - if args['max_steps'] > 0: - num_train_steps = args['max_steps'] * args['gradient_accumulation_steps'] - args['num_train_epochs'] = 1 +def train( + args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id +): + if args["max_steps"] > 0: + num_train_steps = args["max_steps"] * args["gradient_accumulation_steps"] + args["num_train_epochs"] = 1 else: - num_train_steps = math.ceil(num_train_examples / train_batch_size) // args['gradient_accumulation_steps'] * args['num_train_epochs'] + num_train_steps = ( + math.ceil(num_train_examples / train_batch_size) + // args["gradient_accumulation_steps"] + * args["num_train_epochs"] + ) writer = tf.summary.create_file_writer("/tmp/mylogs") with strategy.scope(): loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) - optimizer = create_optimizer(args['learning_rate'], num_train_steps, args['warmup_steps']) + optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"]) - if args['fp16']: - optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic') + if args["fp16"]: + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") - loss_metric = tf.keras.metrics.Mean(name='loss', dtype=tf.float32) + loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32) gradient_accumulator = GradientAccumulator() - + logging.info("***** Running training *****") logging.info(" Num examples = %d", num_train_examples) - logging.info(" Num Epochs = %d", args['num_train_epochs']) - logging.info(" Instantaneous batch size per device = %d", args['per_device_train_batch_size']) - logging.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - train_batch_size * args['gradient_accumulation_steps']) - logging.info(" Gradient Accumulation steps = %d", args['gradient_accumulation_steps']) + logging.info(" Num Epochs = %d", args["num_train_epochs"]) + logging.info(" Instantaneous batch size per device = %d", args["per_device_train_batch_size"]) + logging.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + train_batch_size * args["gradient_accumulation_steps"], + ) + logging.info(" Gradient Accumulation steps = %d", args["gradient_accumulation_steps"]) logging.info(" Total training steps = %d", num_train_steps) model.summary() @@ -214,26 +181,28 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables): if gradient is not None: - scaled_gradient = gradient / (args['n_device'] * args['gradient_accumulation_steps']) + scaled_gradient = gradient / (args["n_device"] * args["gradient_accumulation_steps"]) grads_and_vars.append((scaled_gradient, variable)) else: grads_and_vars.append((gradient, variable)) - optimizer.apply_gradients(grads_and_vars, args['max_grad_norm']) + optimizer.apply_gradients(grads_and_vars, args["max_grad_norm"]) gradient_accumulator.reset() @tf.function def train_step(train_features, train_labels): def step_fn(train_features, train_labels): - inputs = {'attention_mask': train_features['input_mask'], 'training': True} + inputs = {"attention_mask": train_features["input_mask"], "training": True} - if args['model_type'] != "distilbert": - inputs["token_type_ids"] = train_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None + if args["model_type"] != "distilbert": + inputs["token_type_ids"] = ( + train_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None + ) with tf.GradientTape() as tape: - logits = model(train_features['input_ids'], **inputs)[0] + logits = model(train_features["input_ids"], **inputs)[0] logits = tf.reshape(logits, (-1, len(labels) + 1)) - active_loss = tf.reshape(train_features['input_mask'], (-1,)) + active_loss = tf.reshape(train_features["input_mask"], (-1,)) active_logits = tf.boolean_mask(logits, active_loss) train_labels = tf.reshape(train_labels, (-1,)) active_labels = tf.boolean_mask(train_labels, active_loss) @@ -251,34 +220,40 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l return mean_loss current_time = datetime.datetime.now() - train_iterator = master_bar(range(args['num_train_epochs'])) + train_iterator = master_bar(range(args["num_train_epochs"])) global_step = 0 logging_loss = 0.0 for epoch in train_iterator: - epoch_iterator = progress_bar(train_dataset, total=num_train_steps, parent=train_iterator, display=args['n_device'] > 1) + epoch_iterator = progress_bar( + train_dataset, total=num_train_steps, parent=train_iterator, display=args["n_device"] > 1 + ) step = 1 with strategy.scope(): for train_features, train_labels in epoch_iterator: loss = train_step(train_features, train_labels) - if step % args['gradient_accumulation_steps'] == 0: + if step % args["gradient_accumulation_steps"] == 0: strategy.experimental_run_v2(apply_gradients) loss_metric(loss) global_step += 1 - if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0: + if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0: # Log metrics - if args['n_device'] == 1 and args['evaluate_during_training']: # Only evaluate when single GPU otherwise metrics may not average well - y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev") + if ( + args["n_device"] == 1 and args["evaluate_during_training"] + ): # Only evaluate when single GPU otherwise metrics may not average well + y_true, y_pred, eval_loss = evaluate( + args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev" + ) report = metrics.classification_report(y_true, y_pred, digits=4) - + logging.info("Eval at step " + str(global_step) + "\n" + report) logging.info("eval_loss: " + str(eval_loss)) - + precision = metrics.precision_score(y_true, y_pred) recall = metrics.recall_score(y_true, y_pred) f1 = metrics.f1_score(y_true, y_pred) @@ -288,33 +263,35 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l tf.summary.scalar("precision", precision, global_step) tf.summary.scalar("recall", recall, global_step) tf.summary.scalar("f1", f1, global_step) - + lr = optimizer.learning_rate learning_rate = lr(step) with writer.as_default(): tf.summary.scalar("lr", learning_rate, global_step) - tf.summary.scalar("loss", (loss_metric.result() - logging_loss) / args['logging_steps'], global_step) - + tf.summary.scalar( + "loss", (loss_metric.result() - logging_loss) / args["logging_steps"], global_step + ) + logging_loss = loss_metric.result() with writer.as_default(): tf.summary.scalar("loss", loss_metric.result(), step=step) - if args['save_steps'] > 0 and global_step % args['save_steps'] == 0: + if args["save_steps"] > 0 and global_step % args["save_steps"] == 0: # Save model checkpoint - output_dir = os.path.join(args['output_dir'], "checkpoint-{}".format(global_step)) + output_dir = os.path.join(args["output_dir"], "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - + model.save_pretrained(output_dir) logging.info("Saving model checkpoint to %s", output_dir) - - train_iterator.child.comment = f'loss : {loss_metric.result()}' + + train_iterator.child.comment = f"loss : {loss_metric.result()}" step += 1 - train_iterator.write(f'loss epoch {epoch + 1}: {loss_metric.result()}') + train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}") loss_metric.reset_states() @@ -322,13 +299,15 @@ def train(args, strategy, train_dataset, tokenizer, model, num_train_examples, l def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode): - eval_batch_size = args['per_device_eval_batch_size'] * args['n_device'] - eval_dataset, size = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode) + eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"] + eval_dataset, size = load_and_cache_examples( + args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode=mode + ) eval_dataset = strategy.experimental_distribute_dataset(eval_dataset) preds = None num_eval_steps = math.ceil(size / eval_batch_size) master = master_bar(range(1)) - eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args['n_device'] > 1) + eval_iterator = progress_bar(eval_dataset, total=num_eval_steps, parent=master, display=args["n_device"] > 1) loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) loss = 0.0 @@ -337,15 +316,17 @@ def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode) logging.info(" Batch size = %d", eval_batch_size) for eval_features, eval_labels in eval_iterator: - inputs = {'attention_mask': eval_features['input_mask'], 'training': False} + inputs = {"attention_mask": eval_features["input_mask"], "training": False} - if args['model_type'] != "distilbert": - inputs["token_type_ids"] = eval_features['segment_ids'] if args['model_type'] in ["bert", "xlnet"] else None + if args["model_type"] != "distilbert": + inputs["token_type_ids"] = ( + eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None + ) with strategy.scope(): - logits = model(eval_features['input_ids'], **inputs)[0] + logits = model(eval_features["input_ids"], **inputs)[0] tmp_logits = tf.reshape(logits, (-1, len(labels) + 1)) - active_loss = tf.reshape(eval_features['input_mask'], (-1,)) + active_loss = tf.reshape(eval_features["input_mask"], (-1,)) active_logits = tf.boolean_mask(tmp_logits, active_loss) tmp_eval_labels = tf.reshape(eval_labels, (-1,)) active_labels = tf.boolean_mask(tmp_eval_labels, active_loss) @@ -384,11 +365,11 @@ def load_cache(cached_file, max_seq_length): def _decode_record(record): example = tf.io.parse_single_example(record, name_to_features) features = {} - features['input_ids'] = example['input_ids'] - features['input_mask'] = example['input_mask'] - features['segment_ids'] = example['segment_ids'] + features["input_ids"] = example["input_ids"] + features["input_mask"] = example["input_mask"] + features["segment_ids"] = example["segment_ids"] - return features, example['label_ids'] + return features, example["label_ids"] d = tf.data.TFRecordDataset(cached_file) d = d.map(_decode_record, num_parallel_calls=4) @@ -422,39 +403,46 @@ def save_cache(features, cached_features_file): def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_size, mode): - drop_remainder = True if args['tpu'] or mode == 'train' else False + drop_remainder = True if args["tpu"] or mode == "train" else False # Load data features from cache or dataset file - cached_features_file = os.path.join(args['data_dir'], "cached_{}_{}_{}.tf_record".format(mode, - list(filter(None, args['model_name_or_path'].split("/"))).pop(), - str(args['max_seq_length']))) - if os.path.exists(cached_features_file) and not args['overwrite_cache']: + cached_features_file = os.path.join( + args["data_dir"], + "cached_{}_{}_{}.tf_record".format( + mode, list(filter(None, args["model_name_or_path"].split("/"))).pop(), str(args["max_seq_length"]) + ), + ) + if os.path.exists(cached_features_file) and not args["overwrite_cache"]: logging.info("Loading features from cached file %s", cached_features_file) - dataset, size = load_cache(cached_features_file, args['max_seq_length']) + dataset, size = load_cache(cached_features_file, args["max_seq_length"]) else: - logging.info("Creating features from dataset file at %s", args['data_dir']) - examples = read_examples_from_file(args['data_dir'], mode) - features = convert_examples_to_features(examples, labels, args['max_seq_length'], tokenizer, - cls_token_at_end=bool(args['model_type'] in ["xlnet"]), - # xlnet has a cls token at the end - cls_token=tokenizer.cls_token, - cls_token_segment_id=2 if args['model_type'] in ["xlnet"] else 0, - sep_token=tokenizer.sep_token, - sep_token_extra=bool(args['model_type'] in ["roberta"]), - # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 - pad_on_left=bool(args['model_type'] in ["xlnet"]), - # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args['model_type'] in ["xlnet"] else 0, - pad_token_label_id=pad_token_label_id - ) + logging.info("Creating features from dataset file at %s", args["data_dir"]) + examples = read_examples_from_file(args["data_dir"], mode) + features = convert_examples_to_features( + examples, + labels, + args["max_seq_length"], + tokenizer, + cls_token_at_end=bool(args["model_type"] in ["xlnet"]), + # xlnet has a cls token at the end + cls_token=tokenizer.cls_token, + cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0, + sep_token=tokenizer.sep_token, + sep_token_extra=bool(args["model_type"] in ["roberta"]), + # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 + pad_on_left=bool(args["model_type"] in ["xlnet"]), + # pad on the left for xlnet + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0, + pad_token_label_id=pad_token_label_id, + ) logging.info("Saving features into cached file %s", cached_features_file) save_cache(features, cached_features_file) - dataset, size = load_cache(cached_features_file, args['max_seq_length']) + dataset, size = load_cache(cached_features_file, args["max_seq_length"]) - if mode == 'train': + if mode == "train": dataset = dataset.repeat() - dataset = dataset.shuffle(buffer_size=8192, seed=args['seed']) + dataset = dataset.shuffle(buffer_size=8192, seed=args["seed"]) dataset = dataset.batch(batch_size, drop_remainder) dataset = dataset.prefetch(buffer_size=batch_size) @@ -466,98 +454,134 @@ def main(_): logging.set_verbosity(logging.INFO) args = flags.FLAGS.flag_values_dict() - if os.path.exists(args['output_dir']) and os.listdir( - args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']: + if ( + os.path.exists(args["output_dir"]) + and os.listdir(args["output_dir"]) + and args["do_train"] + and not args["overwrite_output_dir"] + ): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( - args['output_dir'])) + args["output_dir"] + ) + ) - if args['fp16']: + if args["fp16"]: tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True}) - if args['tpu']: - resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args['tpu']) + if args["tpu"]: + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args["tpu"]) tf.config.experimental_connect_to_cluster(resolver) tf.tpu.experimental.initialize_tpu_system(resolver) strategy = tf.distribute.experimental.TPUStrategy(resolver) - args['n_device'] = args['num_tpu_cores'] - elif len(args['gpus'].split(',')) > 1: - args['n_device'] = len([f"/gpu:{gpu}" for gpu in args['gpus'].split(',')]) - strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args['gpus'].split(',')]) - elif args['no_cuda']: - args['n_device'] = 1 + args["n_device"] = args["num_tpu_cores"] + elif len(args["gpus"].split(",")) > 1: + args["n_device"] = len([f"/gpu:{gpu}" for gpu in args["gpus"].split(",")]) + strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")]) + elif args["no_cuda"]: + args["n_device"] = 1 strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") else: - args['n_device'] = len(args['gpus'].split(',')) - strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args['gpus'].split(',')[0]) + args["n_device"] = len(args["gpus"].split(",")) + strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0]) - logging.warning("n_device: %s, distributed training: %s, 16-bits training: %s", - args['n_device'], bool(args['n_device'] > 1), args['fp16']) + logging.warning( + "n_device: %s, distributed training: %s, 16-bits training: %s", + args["n_device"], + bool(args["n_device"] > 1), + args["fp16"], + ) - labels = get_labels(args['labels']) + labels = get_labels(args["labels"]) num_labels = len(labels) + 1 pad_token_label_id = 0 - config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']] - config = config_class.from_pretrained(args['config_name'] if args['config_name'] else args['model_name_or_path'], - num_labels=num_labels, - cache_dir=args['cache_dir'] if args['cache_dir'] else None) + config_class, model_class, tokenizer_class = MODEL_CLASSES[args["model_type"]] + config = config_class.from_pretrained( + args["config_name"] if args["config_name"] else args["model_name_or_path"], + num_labels=num_labels, + cache_dir=args["cache_dir"] if args["cache_dir"] else None, + ) logging.info("Training/evaluation parameters %s", args) # Training - if args['do_train']: - tokenizer = tokenizer_class.from_pretrained(args['tokenizer_name'] if args['tokenizer_name'] else args['model_name_or_path'], - do_lower_case=args['do_lower_case'], - cache_dir=args['cache_dir'] if args['cache_dir'] else None) + if args["do_train"]: + tokenizer = tokenizer_class.from_pretrained( + args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"], + do_lower_case=args["do_lower_case"], + cache_dir=args["cache_dir"] if args["cache_dir"] else None, + ) with strategy.scope(): - model = model_class.from_pretrained(args['model_name_or_path'], - from_pt=bool(".bin" in args['model_name_or_path']), - config=config, - cache_dir=args['cache_dir'] if args['cache_dir'] else None) + model = model_class.from_pretrained( + args["model_name_or_path"], + from_pt=bool(".bin" in args["model_name_or_path"]), + config=config, + cache_dir=args["cache_dir"] if args["cache_dir"] else None, + ) model.layers[-1].activation = tf.keras.activations.softmax - train_batch_size = args['per_device_train_batch_size'] * args['n_device'] - train_dataset, num_train_examples = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train") + train_batch_size = args["per_device_train_batch_size"] * args["n_device"] + train_dataset, num_train_examples = load_and_cache_examples( + args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train" + ) train_dataset = strategy.experimental_distribute_dataset(train_dataset) - train(args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id) + train( + args, + strategy, + train_dataset, + tokenizer, + model, + num_train_examples, + labels, + train_batch_size, + pad_token_label_id, + ) - if not os.path.exists(args['output_dir']): - os.makedirs(args['output_dir']) + if not os.path.exists(args["output_dir"]): + os.makedirs(args["output_dir"]) - logging.info("Saving model to %s", args['output_dir']) + logging.info("Saving model to %s", args["output_dir"]) - model.save_pretrained(args['output_dir']) - tokenizer.save_pretrained(args['output_dir']) + model.save_pretrained(args["output_dir"]) + tokenizer.save_pretrained(args["output_dir"]) # Evaluation - if args['do_eval']: - tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case']) + if args["do_eval"]: + tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"]) checkpoints = [] results = [] - if args['eval_all_checkpoints']: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args['output_dir'] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int(''.join(filter(str.isdigit, f)) or -1))) - + if args["eval_all_checkpoints"]: + checkpoints = list( + os.path.dirname(c) + for c in sorted( + glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), + key=lambda f: int("".join(filter(str.isdigit, f)) or -1), + ) + ) + logging.info("Evaluate the following checkpoints: %s", checkpoints) if len(checkpoints) == 0: - checkpoints.append(args['output_dir']) - + checkpoints.append(args["output_dir"]) + for checkpoint in checkpoints: global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final" with strategy.scope(): model = model_class.from_pretrained(checkpoint) - y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev") + y_true, y_pred, eval_loss = evaluate( + args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev" + ) report = metrics.classification_report(y_true, y_pred, digits=4) if global_step: results.append({global_step + "_report": report, global_step + "_loss": eval_loss}) - output_eval_file = os.path.join(args['output_dir'], "eval_results.txt") - + output_eval_file = os.path.join(args["output_dir"], "eval_results.txt") + with tf.io.gfile.GFile(output_eval_file, "w") as writer: for res in results: for key, val in res.items(): @@ -572,26 +596,28 @@ def main(_): writer.write(report) writer.write("\n") - if args['do_predict']: - tokenizer = tokenizer_class.from_pretrained(args['output_dir'], do_lower_case=args['do_lower_case']) - model = model_class.from_pretrained(args['output_dir']) - eval_batch_size = args['per_device_eval_batch_size'] * args['n_device'] - predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test") + if args["do_predict"]: + tokenizer = tokenizer_class.from_pretrained(args["output_dir"], do_lower_case=args["do_lower_case"]) + model = model_class.from_pretrained(args["output_dir"]) + eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"] + predict_dataset, _ = load_and_cache_examples( + args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test" + ) y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test") - output_test_results_file = os.path.join(args['output_dir'], "test_results.txt") - output_test_predictions_file = os.path.join(args['output_dir'], "test_predictions.txt") + output_test_results_file = os.path.join(args["output_dir"], "test_results.txt") + output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt") report = metrics.classification_report(y_true, y_pred, digits=4) with tf.io.gfile.GFile(output_test_results_file, "w") as writer: report = metrics.classification_report(y_true, y_pred, digits=4) - + logging.info("\n" + report) - + writer.write(report) writer.write("\n\nloss = " + str(pred_loss)) with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer: - with tf.io.gfile.GFile(os.path.join(args['data_dir'], "test.txt"), "r") as f: + with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f: example_id = 0 for line in f: diff --git a/examples/run_xnli.py b/examples/run_xnli.py index 74bf295b69..9faba294dd 100644 --- a/examples/run_xnli.py +++ b/examples/run_xnli.py @@ -26,8 +26,7 @@ import random import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -37,10 +36,18 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, - BertConfig, BertForSequenceClassification, BertTokenizer, - XLMConfig, XLMForSequenceClassification, XLMTokenizer, - DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForSequenceClassification, + BertTokenizer, + XLMConfig, + XLMForSequenceClassification, + XLMTokenizer, + DistilBertConfig, + DistilBertForSequenceClassification, + DistilBertTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup @@ -52,12 +59,14 @@ from transformers import glue_convert_examples_to_features as convert_examples_t logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), ()) +ALL_MODELS = sum( + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, DistilBertConfig, XLMConfig)), () +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), - 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer) + "bert": (BertConfig, BertForSequenceClassification, BertTokenizer), + "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer), + "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), } @@ -85,19 +94,26 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) # Check if saved optimizer or scheduler states exist - if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): + if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( + os.path.join(args.model_name_or_path, "scheduler.pt") + ): # Load in optimizer and scheduler states - optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt'))) - scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt'))) + optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) + scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: @@ -112,17 +128,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -132,7 +152,7 @@ def train(args, train_dataset, model, tokenizer): # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): # set global_step to gobal_step of last saved checkpoint from model path - global_step = int(args.model_name_or_path.split('-')[-1].split('/')[0]) + global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0]) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) @@ -143,7 +163,9 @@ def train(args, train_dataset, model, tokenizer): tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() - train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + train_iterator = trange( + epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] + ) set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) @@ -155,16 +177,16 @@ def train(args, train_dataset, model, tokenizer): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None # XLM and DistilBERT don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert"] else None + ) # XLM and DistilBERT don't use segment_ids outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -188,28 +210,32 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) - torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt')) - torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt')) + torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) + torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -258,11 +284,11 @@ def evaluate(args, model, tokenizer, prefix=""): batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'labels': batch[3]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = batch[2] if args.model_type in ['bert'] else None # XLM and DistilBERT don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = ( + batch[2] if args.model_type in ["bert"] else None + ) # XLM and DistilBERT don't use segment_ids outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] @@ -270,16 +296,16 @@ def evaluate(args, model, tokenizer, prefix=""): nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() + out_label_ids = inputs["labels"].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps if args.output_mode == "classification": preds = np.argmax(preds, axis=1) else: - raise ValueError('No other `output_mode` for XNLI.') + raise ValueError("No other `output_mode` for XNLI.") result = compute_metrics(eval_task, preds, out_label_ids) results.update(result) @@ -300,27 +326,34 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): processor = processors[task](language=args.language, train_language=args.train_language) output_mode = output_modes[task] # Load data features from cache or dataset file - cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}_{}'.format( - 'test' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length), - str(task), - str(args.train_language if (not evaluate and args.train_language is not None) else args.language))) + cached_features_file = os.path.join( + args.data_dir, + "cached_{}_{}_{}_{}_{}".format( + "test" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + str(task), + str(args.train_language if (not evaluate and args.train_language is not None) else args.language), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", args.data_dir) label_list = processor.get_labels() - examples = processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = convert_examples_to_features(examples, - tokenizer, - label_list=label_list, - max_length=args.max_seq_length, - output_mode=output_mode, - pad_on_left=False, - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=0, + examples = ( + processor.get_test_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) + ) + features = convert_examples_to_features( + examples, + tokenizer, + label_list=label_list, + max_length=args.max_seq_length, + output_mode=output_mode, + pad_on_left=False, + pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], + pad_token_segment_id=0, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) @@ -336,7 +369,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): if output_mode == "classification": all_labels = torch.tensor([f.label for f in features], dtype=torch.long) else: - raise ValueError('No other `output_mode` for XNLI.') + raise ValueError("No other `output_mode` for XNLI.") dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) return dataset @@ -346,92 +379,152 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--data_dir", default=None, type=str, required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--language", default=None, type=str, required=True, - help="Evaluation language. Also train language if `train_language` is set to None.") - parser.add_argument("--train_language", default=None, type=str, - help="Train language if is different of the evaluation language.") - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model predictions and checkpoints will be written.") + parser.add_argument( + "--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--language", + default=None, + type=str, + required=True, + help="Evaluation language. Also train language if `train_language` is set to None.", + ) + parser.add_argument( + "--train_language", default=None, type=str, help="Train language if is different of the evaluation language." + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") - parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the test set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument("--local_rank", type=int, default=-1, - help="For distributed training: local_rank") - parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -443,22 +536,30 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) # Prepare XNLI task - args.task_name = 'xnli' + args.task_name = "xnli" if args.task_name not in processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = processors[args.task_name](language=args.language, train_language=args.train_language) @@ -472,17 +573,23 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - num_labels=num_labels, - finetuning_task=args.task_name, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + num_labels=num_labels, + finetuning_task=args.task_name, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -491,14 +598,12 @@ def main(): logger.info("Training/evaluation parameters %s", args) - # Training if args.do_train: train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -508,36 +613,39 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) - # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" - prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" - + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else "" + model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) - result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) + result = dict((k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results diff --git a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py index 33b17bfb6f..d32e6fc06c 100644 --- a/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py +++ b/examples/summarization/convert_bertabs_original_pytorch_checkpoint.py @@ -34,12 +34,30 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -SAMPLE_TEXT = 'Hello world! cécé herlolip' +SAMPLE_TEXT = "Hello world! cécé herlolip" BertAbsConfig = namedtuple( "BertAbsConfig", - ["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"], + [ + "temp_dir", + "large", + "use_bert_emb", + "finetune_bert", + "encoder", + "share_emb", + "max_pos", + "enc_layers", + "enc_hidden_size", + "enc_heads", + "enc_ff_size", + "enc_dropout", + "dec_layers", + "dec_hidden_size", + "dec_heads", + "dec_ff_size", + "dec_dropout", + ], ) @@ -119,7 +137,9 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path): output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0] output_original_generator = original.generator(output_original_model) - output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0] + output_converted_model = new_model( + encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask + )[0] output_converted_generator = new_model.generator(output_converted_model) maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item() @@ -136,28 +156,21 @@ def convert_bertabs_checkpoints(path_to_checkpoints, dump_path): # The model has been saved with torch.save(model) and this is bound to the exact # directory structure. We save the state_dict instead. logging.info("saving the model's state dictionary") - torch.save(new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin") + torch.save( + new_model.state_dict(), "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin" + ) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--bertabs_checkpoint_path", - default=None, - type=str, - required=True, - help="Path the official PyTorch dump.", + "--bertabs_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump.", ) parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model.", + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model.", ) args = parser.parse_args() convert_bertabs_checkpoints( - args.bertabs_checkpoint_path, - args.pytorch_dump_folder_path, + args.bertabs_checkpoint_path, args.pytorch_dump_folder_path, ) diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py index 5bf1599ad2..d4d8c6648d 100644 --- a/examples/summarization/modeling_bertabs.py +++ b/examples/summarization/modeling_bertabs.py @@ -56,40 +56,22 @@ class BertAbs(BertAbsPreTrainedModel): load_bert_pretrained_extractive = True if bert_extractive_checkpoint else False if load_bert_pretrained_extractive: self.bert.model.load_state_dict( - dict( - [ - (n[11:], p) - for n, p in bert_extractive_checkpoint.items() - if n.startswith("bert.model") - ] - ), + dict([(n[11:], p) for n, p in bert_extractive_checkpoint.items() if n.startswith("bert.model")]), strict=True, ) self.vocab_size = self.bert.model.config.vocab_size if args.max_pos > 512: - my_pos_embeddings = nn.Embedding( - args.max_pos, self.bert.model.config.hidden_size - ) - my_pos_embeddings.weight.data[ - :512 - ] = self.bert.model.embeddings.position_embeddings.weight.data - my_pos_embeddings.weight.data[ - 512: - ] = self.bert.model.embeddings.position_embeddings.weight.data[-1][ + my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size) + my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data + my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][ None, : - ].repeat( - args.max_pos - 512, 1 - ) + ].repeat(args.max_pos - 512, 1) self.bert.model.embeddings.position_embeddings = my_pos_embeddings - tgt_embeddings = nn.Embedding( - self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0 - ) + tgt_embeddings = nn.Embedding(self.vocab_size, self.bert.model.config.hidden_size, padding_idx=0) - tgt_embeddings.weight = copy.deepcopy( - self.bert.model.embeddings.word_embeddings.weight - ) + tgt_embeddings.weight = copy.deepcopy(self.bert.model.embeddings.word_embeddings.weight) self.decoder = TransformerDecoder( self.args.dec_layers, @@ -102,9 +84,7 @@ class BertAbs(BertAbsPreTrainedModel): ) gen_func = nn.LogSoftmax(dim=-1) - self.generator = nn.Sequential( - nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func - ) + self.generator = nn.Sequential(nn.Linear(args.dec_hidden_size, args.vocab_size), gen_func) self.generator[0].weight = self.decoder.embeddings.weight load_from_checkpoints = False if checkpoint is None else True @@ -127,25 +107,14 @@ class BertAbs(BertAbsPreTrainedModel): p.data.zero_() def forward( - self, - encoder_input_ids, - decoder_input_ids, - token_type_ids, - encoder_attention_mask, - decoder_attention_mask, + self, encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask, ): encoder_output = self.bert( - input_ids=encoder_input_ids, - token_type_ids=token_type_ids, - attention_mask=encoder_attention_mask, + input_ids=encoder_input_ids, token_type_ids=token_type_ids, attention_mask=encoder_attention_mask, ) encoder_hidden_states = encoder_output[0] - dec_state = self.decoder.init_decoder_state( - encoder_input_ids, encoder_hidden_states - ) - decoder_outputs, _ = self.decoder( - decoder_input_ids[:, :-1], encoder_hidden_states, dec_state - ) + dec_state = self.decoder.init_decoder_state(encoder_input_ids, encoder_hidden_states) + decoder_outputs, _ = self.decoder(decoder_input_ids[:, :-1], encoder_hidden_states, dec_state) return decoder_outputs @@ -162,10 +131,7 @@ class Bert(nn.Module): self.eval() with torch.no_grad(): encoder_outputs, _ = self.model( - input_ids, - token_type_ids=token_type_ids, - attention_mask=attention_mask, - **kwargs + input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, **kwargs ) return encoder_outputs @@ -196,10 +162,7 @@ class TransformerDecoder(nn.Module): # Build TransformerDecoder. self.transformer_layers = nn.ModuleList( - [ - TransformerDecoderLayer(d_model, heads, d_ff, dropout) - for _ in range(num_layers) - ] + [TransformerDecoderLayer(d_model, heads, d_ff, dropout) for _ in range(num_layers)] ) self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) @@ -236,20 +199,14 @@ class TransformerDecoder(nn.Module): # Decoder padding mask tgt_words = tgt tgt_batch, tgt_len = tgt_words.size() - tgt_pad_mask = ( - tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len) - ) + tgt_pad_mask = tgt_words.data.eq(padding_idx).unsqueeze(1).expand(tgt_batch, tgt_len, tgt_len) # Encoder padding mask if memory_mask is not None: src_len = memory_mask.size(-1) src_pad_mask = memory_mask.expand(src_batch, tgt_len, src_len) else: - src_pad_mask = ( - src_words.data.eq(padding_idx) - .unsqueeze(1) - .expand(src_batch, tgt_len, src_len) - ) + src_pad_mask = src_words.data.eq(padding_idx).unsqueeze(1).expand(src_batch, tgt_len, src_len) # Pass through the embeddings emb = self.embeddings(input_ids) @@ -271,9 +228,7 @@ class TransformerDecoder(nn.Module): src_pad_mask, tgt_pad_mask, previous_input=prev_layer_input, - layer_cache=state.cache["layer_{}".format(i)] - if state.cache is not None - else None, + layer_cache=state.cache["layer_{}".format(i)] if state.cache is not None else None, step=step, ) if state.cache is None: @@ -303,9 +258,7 @@ class PositionalEncoding(nn.Module): def __init__(self, dropout, dim, max_len=5000): pe = torch.zeros(max_len, dim) position = torch.arange(0, max_len).unsqueeze(1) - div_term = torch.exp( - (torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim)) - ) + div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * -(math.log(10000.0) / dim))) pe[:, 0::2] = torch.sin(position.float() * div_term) pe[:, 1::2] = torch.cos(position.float() * div_term) pe = pe.unsqueeze(0) @@ -356,14 +309,7 @@ class TransformerDecoderLayer(nn.Module): self.register_buffer("mask", mask) def forward( - self, - inputs, - memory_bank, - src_pad_mask, - tgt_pad_mask, - previous_input=None, - layer_cache=None, - step=None, + self, inputs, memory_bank, src_pad_mask, tgt_pad_mask, previous_input=None, layer_cache=None, step=None, ): """ Args: @@ -380,34 +326,20 @@ class TransformerDecoderLayer(nn.Module): * all_input `[batch_size x current_step x model_dim]` """ - dec_mask = torch.gt( - tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0 - ) + dec_mask = torch.gt(tgt_pad_mask + self.mask[:, : tgt_pad_mask.size(1), : tgt_pad_mask.size(1)], 0) input_norm = self.layer_norm_1(inputs) all_input = input_norm if previous_input is not None: all_input = torch.cat((previous_input, input_norm), dim=1) dec_mask = None - query = self.self_attn( - all_input, - all_input, - input_norm, - mask=dec_mask, - layer_cache=layer_cache, - type="self", - ) + query = self.self_attn(all_input, all_input, input_norm, mask=dec_mask, layer_cache=layer_cache, type="self",) query = self.drop(query) + inputs query_norm = self.layer_norm_2(query) mid = self.context_attn( - memory_bank, - memory_bank, - query_norm, - mask=src_pad_mask, - layer_cache=layer_cache, - type="context", + memory_bank, memory_bank, query_norm, mask=src_pad_mask, layer_cache=layer_cache, type="context", ) output = self.feed_forward(self.drop(mid) + query) @@ -492,14 +424,7 @@ class MultiHeadedAttention(nn.Module): self.final_linear = nn.Linear(model_dim, model_dim) def forward( - self, - key, - value, - query, - mask=None, - layer_cache=None, - type=None, - predefined_graph_1=None, + self, key, value, query, mask=None, layer_cache=None, type=None, predefined_graph_1=None, ): """ Compute the context vector and the attention vectors. @@ -531,11 +456,7 @@ class MultiHeadedAttention(nn.Module): def unshape(x): """ compute context """ - return ( - x.transpose(1, 2) - .contiguous() - .view(batch_size, -1, head_count * dim_per_head) - ) + return x.transpose(1, 2).contiguous().view(batch_size, -1, head_count * dim_per_head) # 1) Project key, value, and query. if layer_cache is not None: @@ -554,9 +475,7 @@ class MultiHeadedAttention(nn.Module): if layer_cache["self_keys"] is not None: key = torch.cat((layer_cache["self_keys"].to(device), key), dim=2) if layer_cache["self_values"] is not None: - value = torch.cat( - (layer_cache["self_values"].to(device), value), dim=2 - ) + value = torch.cat((layer_cache["self_values"].to(device), value), dim=2) layer_cache["self_keys"] = key layer_cache["self_values"] = value elif type == "context": @@ -637,13 +556,9 @@ class DecoderState(object): sizes = e.size() br = sizes[1] if len(sizes) == 3: - sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[ - :, :, idx - ] + sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2])[:, :, idx] else: - sent_states = e.view( - sizes[0], beam_size, br // beam_size, sizes[2], sizes[3] - )[:, :, idx] + sent_states = e.view(sizes[0], beam_size, br // beam_size, sizes[2], sizes[3])[:, :, idx] sent_states.data.copy_(sent_states.data.index_select(1, positions)) @@ -716,11 +631,7 @@ class TransformerDecoderState(DecoderState): def gelu(x): - return ( - 0.5 - * x - * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - ) + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) class PositionwiseFeedForward(nn.Module): @@ -758,9 +669,7 @@ class PositionwiseFeedForward(nn.Module): def build_predictor(args, tokenizer, symbols, model, logger=None): # we should be able to refactor the global scorer a lot scorer = GNMTGlobalScorer(args.alpha, length_penalty="wu") - translator = Translator( - args, model, tokenizer, symbols, global_scorer=scorer, logger=logger - ) + translator = Translator(args, model, tokenizer, symbols, global_scorer=scorer, logger=logger) return translator @@ -891,9 +800,7 @@ class Translator(object): Shouldn't need the original dataset. """ with torch.no_grad(): - return self._fast_translate_batch( - batch, self.max_length, min_length=self.min_length - ) + return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length) # Where the beam search lives # I have no idea why it is being called from the method above @@ -912,26 +819,18 @@ class Translator(object): mask_src = batch.mask_src src_features = self.model.bert(src, segs, mask_src) - dec_states = self.model.decoder.init_decoder_state( - src, src_features, with_cache=True - ) + dec_states = self.model.decoder.init_decoder_state(src, src_features, with_cache=True) device = src_features.device # Tile states and memory beam_size times. dec_states.map_batch_fn(lambda state, dim: tile(state, beam_size, dim=dim)) src_features = tile(src_features, beam_size, dim=0) batch_offset = torch.arange(batch_size, dtype=torch.long, device=device) - beam_offset = torch.arange( - 0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device - ) - alive_seq = torch.full( - [batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device - ) + beam_offset = torch.arange(0, batch_size * beam_size, step=beam_size, dtype=torch.long, device=device) + alive_seq = torch.full([batch_size * beam_size, 1], self.start_token, dtype=torch.long, device=device) # Give full probability to the first beam on the first step. - topk_log_probs = torch.tensor( - [0.0] + [float("-inf")] * (beam_size - 1), device=device - ).repeat(batch_size) + topk_log_probs = torch.tensor([0.0] + [float("-inf")] * (beam_size - 1), device=device).repeat(batch_size) # Structure that holds finished hypotheses. hypotheses = [[] for _ in range(batch_size)] # noqa: F812 @@ -948,9 +847,7 @@ class Translator(object): # Decoder forward. decoder_input = decoder_input.transpose(0, 1) - dec_out, dec_states = self.model.decoder( - decoder_input, src_features, dec_states, step=step - ) + dec_out, dec_states = self.model.decoder(decoder_input, src_features, dec_states, step=step) # Generator forward. log_probs = self.generator.forward(dec_out.transpose(0, 1).squeeze(0)) @@ -978,10 +875,7 @@ class Translator(object): words = " ".join(words).replace(" ##", "").split() if len(words) <= 3: continue - trigrams = [ - (words[i - 1], words[i], words[i + 1]) - for i in range(1, len(words) - 1) - ] + trigrams = [(words[i - 1], words[i], words[i + 1]) for i in range(1, len(words) - 1)] trigram = tuple(trigrams[-1]) if trigram in trigrams[:-1]: fail = True @@ -999,15 +893,11 @@ class Translator(object): topk_ids = topk_ids.fmod(vocab_size) # Map beam_index to batch_index in the flat representation. - batch_index = topk_beam_index + beam_offset[ - : topk_beam_index.size(0) - ].unsqueeze(1) + batch_index = topk_beam_index + beam_offset[: topk_beam_index.size(0)].unsqueeze(1) select_indices = batch_index.view(-1) # Append last prediction. - alive_seq = torch.cat( - [alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1 - ) + alive_seq = torch.cat([alive_seq.index_select(0, select_indices), topk_ids.view(-1, 1)], -1) is_finished = topk_ids.eq(self.end_token) if step + 1 == max_length: @@ -1040,15 +930,11 @@ class Translator(object): topk_log_probs = topk_log_probs.index_select(0, non_finished) batch_index = batch_index.index_select(0, non_finished) batch_offset = batch_offset.index_select(0, non_finished) - alive_seq = predictions.index_select(0, non_finished).view( - -1, alive_seq.size(-1) - ) + alive_seq = predictions.index_select(0, non_finished).view(-1, alive_seq.size(-1)) # Reorder states. select_indices = batch_index.view(-1) src_features = src_features.index_select(0, select_indices) - dec_states.map_batch_fn( - lambda state, dim: state.index_select(dim, select_indices) - ) + dec_states.map_batch_fn(lambda state, dim: state.index_select(dim, select_indices)) return results @@ -1089,14 +975,7 @@ def tile(x, count, dim=0): out_size = list(x.size()) out_size[0] *= count batch = x.size(0) - x = ( - x.view(batch, -1) - .transpose(0, 1) - .repeat(count, 1) - .transpose(0, 1) - .contiguous() - .view(*out_size) - ) + x = x.view(batch, -1).transpose(0, 1).repeat(count, 1).transpose(0, 1).contiguous().view(*out_size) if dim != 0: x = x.permute(perm).contiguous() return x @@ -1107,6 +986,7 @@ def tile(x, count, dim=0): # a finetuning script. # + class BertSumOptimizer(object): """ Specific optimizer for BertSum. @@ -1126,16 +1006,10 @@ class BertSumOptimizer(object): self.optimizers = { "encoder": torch.optim.Adam( - model.encoder.parameters(), - lr=lr["encoder"], - betas=(beta_1, beta_2), - eps=eps, + model.encoder.parameters(), lr=lr["encoder"], betas=(beta_1, beta_2), eps=eps, ), "decoder": torch.optim.Adam( - model.decoder.parameters(), - lr=lr["decoder"], - betas=(beta_1, beta_2), - eps=eps, + model.decoder.parameters(), lr=lr["decoder"], betas=(beta_1, beta_2), eps=eps, ), } @@ -1143,9 +1017,7 @@ class BertSumOptimizer(object): self.current_learning_rates = {} def _update_rate(self, stack): - return self.lr[stack] * min( - self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5) - ) + return self.lr[stack] * min(self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-1.5)) def zero_grad(self): self.optimizer_decoder.zero_grad() diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 3c339d0c30..36210d999d 100644 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -25,9 +25,7 @@ logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout, level=logging.INFO) -Batch = namedtuple( - "Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"] -) +Batch = namedtuple("Batch", ["document_names", "batch_size", "src", "segs", "mask_src", "tgt_str"]) def evaluate(args): @@ -48,13 +46,14 @@ def evaluate(args): import rouge import nltk - nltk.download('punkt') + + nltk.download("punkt") rouge_evaluator = rouge.Rouge( - metrics=['rouge-n', 'rouge-l'], + metrics=["rouge-n", "rouge-l"], max_n=2, limit_length=True, length_limit=args.beam_size, - length_limit_type='words', + length_limit_type="words", apply_avg=True, apply_best=False, alpha=0.5, # Default F1_score @@ -161,15 +160,15 @@ Recall >> {:.3f} F1 >> {:.3f} Precision >> {:.3f} Recall >> {:.3f}""".format( - scores['rouge-1']['f'], - scores['rouge-1']['p'], - scores['rouge-1']['r'], - scores['rouge-2']['f'], - scores['rouge-2']['p'], - scores['rouge-2']['r'], - scores['rouge-l']['f'], - scores['rouge-l']['p'], - scores['rouge-l']['r'], + scores["rouge-1"]["f"], + scores["rouge-1"]["p"], + scores["rouge-1"]["r"], + scores["rouge-2"]["f"], + scores["rouge-2"]["p"], + scores["rouge-2"]["r"], + scores["rouge-l"]["f"], + scores["rouge-l"]["p"], + scores["rouge-l"]["r"], ) @@ -187,9 +186,7 @@ def build_data_iterator(args, tokenizer): dataset = load_and_cache_examples(args, tokenizer) sampler = SequentialSampler(dataset) collate_fn = lambda data: collate(data, tokenizer, block_size=512, device=args.device) - iterator = DataLoader( - dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn, - ) + iterator = DataLoader(dataset, sampler=sampler, batch_size=args.batch_size, collate_fn=collate_fn,) return iterator @@ -210,14 +207,9 @@ def collate(data, tokenizer, block_size, device): names = [name for name, _, _ in data] summaries = [" ".join(summary_list) for _, _, summary_list in data] - encoded_text = [ - encode_for_summarization(story, summary, tokenizer) for _, story, summary in data - ] + encoded_text = [encode_for_summarization(story, summary, tokenizer) for _, story, summary in data] encoded_stories = torch.tensor( - [ - fit_to_block_size(story, block_size, tokenizer.pad_token_id) - for story, _ in encoded_text - ] + [fit_to_block_size(story, block_size, tokenizer.pad_token_id) for story, _ in encoded_text] ) encoder_token_type_ids = compute_token_type_ids(encoded_stories, tokenizer.cls_token_id) encoder_mask = build_mask(encoded_stories, tokenizer.pad_token_id) @@ -272,38 +264,23 @@ def main(): ) # EVALUATION options parser.add_argument( - "--no_cuda", - default=False, - type=bool, - help="Whether to force the execution on CPU.", + "--no_cuda", default=False, type=bool, help="Whether to force the execution on CPU.", ) parser.add_argument( "--batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.", ) # BEAM SEARCH arguments parser.add_argument( - "--min_length", - default=50, - type=int, - help="Minimum number of tokens for the summaries.", + "--min_length", default=50, type=int, help="Minimum number of tokens for the summaries.", ) parser.add_argument( - "--max_length", - default=200, - type=int, - help="Maixmum number of tokens for the summaries.", + "--max_length", default=200, type=int, help="Maixmum number of tokens for the summaries.", ) parser.add_argument( - "--beam_size", - default=5, - type=int, - help="The number of beams to start with for each example.", + "--beam_size", default=5, type=int, help="The number of beams to start with for each example.", ) parser.add_argument( - "--alpha", - default=0.95, - type=float, - help="The value of alpha for the length penalty in the beam search.", + "--alpha", default=0.95, type=float, help="The value of alpha for the length penalty in the beam search.", ) parser.add_argument( "--block_trigram", diff --git a/examples/summarization/utils_summarization.py b/examples/summarization/utils_summarization.py index 1d8c436ac9..96470f47a2 100644 --- a/examples/summarization/utils_summarization.py +++ b/examples/summarization/utils_summarization.py @@ -68,9 +68,7 @@ def process_story(raw_story): Raises: IndexError: If the stoy is empty or contains no highlights. """ - nonempty_lines = list( - filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]) - ) + nonempty_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])) # for some unknown reason some lines miss a period, add it nonempty_lines = [_add_missing_period(line) for line in nonempty_lines] @@ -135,13 +133,9 @@ def encode_for_summarization(story_lines, summary_lines, tokenizer): sentences. """ story_lines_token_ids = [tokenizer.encode(line) for line in story_lines] - story_token_ids = [ - token for sentence in story_lines_token_ids for token in sentence - ] + story_token_ids = [token for sentence in story_lines_token_ids for token in sentence] summary_lines_token_ids = [tokenizer.encode(line) for line in summary_lines] - summary_token_ids = [ - token for sentence in summary_lines_token_ids for token in sentence - ] + summary_token_ids = [token for sentence in summary_lines_token_ids for token in sentence] return story_token_ids, summary_token_ids diff --git a/examples/summarization/utils_summarization_test.py b/examples/summarization/utils_summarization_test.py index 8bfbf6ab23..253eae388d 100644 --- a/examples/summarization/utils_summarization_test.py +++ b/examples/summarization/utils_summarization_test.py @@ -33,25 +33,19 @@ class SummarizationDataProcessingTest(unittest.TestCase): """ Pad the sequence with 0 if the sequence is smaller than the block size.""" sequence = [1, 2, 3, 4] expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0] - self.assertEqual( - fit_to_block_size(sequence, self.block_size, 0), expected_output - ) + self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output) def test_fit_to_block_sequence_fit_exactly(self): """ Do nothing if the sequence is the right size. """ sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - self.assertEqual( - fit_to_block_size(sequence, self.block_size, 0), expected_output - ) + self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output) def test_fit_to_block_sequence_too_big(self): """ Truncate the sequence if it is too long. """ sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - self.assertEqual( - fit_to_block_size(sequence, self.block_size, 0), expected_output - ) + self.assertEqual(fit_to_block_size(sequence, self.block_size, 0), expected_output) def test_process_story_no_highlights(self): """ Processing a story with no highlights returns an empty list for the summary. @@ -95,9 +89,7 @@ class SummarizationDataProcessingTest(unittest.TestCase): def test_build_mask(self): sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23]) expected = torch.tensor([1, 1, 1, 1, 0, 0, 0]) - np.testing.assert_array_equal( - build_mask(sequence, 23).numpy(), expected.numpy() - ) + np.testing.assert_array_equal(build_mask(sequence, 23).numpy(), expected.numpy()) def test_build_mask_with_padding_equal_to_one(self): sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1]) @@ -106,12 +98,8 @@ class SummarizationDataProcessingTest(unittest.TestCase): def test_compute_token_type_ids(self): separator = 101 - batch = torch.tensor( - [[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]] - ) - expected = torch.tensor( - [[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]] - ) + batch = torch.tensor([[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]) + expected = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1]]) result = compute_token_type_ids(batch, separator) np.testing.assert_array_equal(result, expected) diff --git a/examples/test_examples.py b/examples/test_examples.py index 632d2f728e..1293559c26 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -35,34 +35,36 @@ logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger() + def get_setup_file(): parser = argparse.ArgumentParser() - parser.add_argument('-f') + parser.add_argument("-f") args = parser.parse_args() return args.f -class ExamplesTests(unittest.TestCase): +class ExamplesTests(unittest.TestCase): def test_run_glue(self): stream_handler = logging.StreamHandler(sys.stdout) logger.addHandler(stream_handler) - testargs = ["run_glue.py", - "--data_dir=./examples/tests_samples/MRPC/", - "--task_name=mrpc", - "--do_train", - "--do_eval", - "--output_dir=./examples/tests_samples/temp_dir", - "--per_gpu_train_batch_size=2", - "--per_gpu_eval_batch_size=1", - "--learning_rate=1e-4", - "--max_steps=10", - "--warmup_steps=2", - "--overwrite_output_dir", - "--seed=42"] - model_type, model_name = ("--model_type=bert", - "--model_name_or_path=bert-base-uncased") - with patch.object(sys, 'argv', testargs + [model_type, model_name]): + testargs = [ + "run_glue.py", + "--data_dir=./examples/tests_samples/MRPC/", + "--task_name=mrpc", + "--do_train", + "--do_eval", + "--output_dir=./examples/tests_samples/temp_dir", + "--per_gpu_train_batch_size=2", + "--per_gpu_eval_batch_size=1", + "--learning_rate=1e-4", + "--max_steps=10", + "--warmup_steps=2", + "--overwrite_output_dir", + "--seed=42", + ] + model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased") + with patch.object(sys, "argv", testargs + [model_type, model_name]): result = run_glue.main() for value in result.values(): self.assertGreaterEqual(value, 0.75) @@ -71,40 +73,38 @@ class ExamplesTests(unittest.TestCase): stream_handler = logging.StreamHandler(sys.stdout) logger.addHandler(stream_handler) - testargs = ["run_squad.py", - "--data_dir=./examples/tests_samples/SQUAD", - "--model_name=bert-base-uncased", - "--output_dir=./examples/tests_samples/temp_dir", - "--max_steps=10", - "--warmup_steps=2", - "--do_train", - "--do_eval", - "--version_2_with_negative", - "--learning_rate=2e-4", - "--per_gpu_train_batch_size=2", - "--per_gpu_eval_batch_size=1", - "--overwrite_output_dir", - "--seed=42"] - model_type, model_name = ("--model_type=bert", - "--model_name_or_path=bert-base-uncased") - with patch.object(sys, 'argv', testargs + [model_type, model_name]): + testargs = [ + "run_squad.py", + "--data_dir=./examples/tests_samples/SQUAD", + "--model_name=bert-base-uncased", + "--output_dir=./examples/tests_samples/temp_dir", + "--max_steps=10", + "--warmup_steps=2", + "--do_train", + "--do_eval", + "--version_2_with_negative", + "--learning_rate=2e-4", + "--per_gpu_train_batch_size=2", + "--per_gpu_eval_batch_size=1", + "--overwrite_output_dir", + "--seed=42", + ] + model_type, model_name = ("--model_type=bert", "--model_name_or_path=bert-base-uncased") + with patch.object(sys, "argv", testargs + [model_type, model_name]): result = run_squad.main() - self.assertGreaterEqual(result['f1'], 30) - self.assertGreaterEqual(result['exact'], 30) + self.assertGreaterEqual(result["f1"], 30) + self.assertGreaterEqual(result["exact"], 30) def test_generation(self): stream_handler = logging.StreamHandler(sys.stdout) logger.addHandler(stream_handler) - testargs = ["run_generation.py", - "--prompt=Hello", - "--length=10", - "--seed=42"] - model_type, model_name = ("--model_type=openai-gpt", - "--model_name_or_path=openai-gpt") - with patch.object(sys, 'argv', testargs + [model_type, model_name]): + testargs = ["run_generation.py", "--prompt=Hello", "--length=10", "--seed=42"] + model_type, model_name = ("--model_type=openai-gpt", "--model_name_or_path=openai-gpt") + with patch.object(sys, "argv", testargs + [model_type, model_name]): result = run_generation.main() self.assertGreaterEqual(len(result), 10) + if __name__ == "__main__": unittest.main() diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py index a131a63924..492eb23e35 100644 --- a/examples/utils_multiple_choice.py +++ b/examples/utils_multiple_choice.py @@ -55,19 +55,10 @@ class InputExample(object): class InputFeatures(object): - def __init__(self, - example_id, - choices_features, - label - - ): + def __init__(self, example_id, choices_features, label): self.example_id = example_id self.choices_features = [ - { - 'input_ids': input_ids, - 'input_mask': input_mask, - 'segment_ids': segment_ids - } + {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids} for input_ids, input_mask, segment_ids in choices_features ] self.label = label @@ -99,29 +90,29 @@ class RaceProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" logger.info("LOOKING AT {} train".format(data_dir)) - high = os.path.join(data_dir, 'train/high') - middle = os.path.join(data_dir, 'train/middle') + high = os.path.join(data_dir, "train/high") + middle = os.path.join(data_dir, "train/middle") high = self._read_txt(high) middle = self._read_txt(middle) - return self._create_examples(high + middle, 'train') + return self._create_examples(high + middle, "train") def get_dev_examples(self, data_dir): """See base class.""" logger.info("LOOKING AT {} dev".format(data_dir)) - high = os.path.join(data_dir, 'dev/high') - middle = os.path.join(data_dir, 'dev/middle') + high = os.path.join(data_dir, "dev/high") + middle = os.path.join(data_dir, "dev/middle") high = self._read_txt(high) middle = self._read_txt(middle) - return self._create_examples(high + middle, 'dev') + return self._create_examples(high + middle, "dev") def get_test_examples(self, data_dir): """See base class.""" logger.info("LOOKING AT {} test".format(data_dir)) - high = os.path.join(data_dir, 'test/high') - middle = os.path.join(data_dir, 'test/middle') + high = os.path.join(data_dir, "test/high") + middle = os.path.join(data_dir, "test/middle") high = self._read_txt(high) middle = self._read_txt(middle) - return self._create_examples(high + middle, 'test') + return self._create_examples(high + middle, "test") def get_labels(self): """See base class.""" @@ -131,13 +122,12 @@ class RaceProcessor(DataProcessor): lines = [] files = glob.glob(input_dir + "/*txt") for file in tqdm.tqdm(files, desc="read files"): - with open(file, 'r', encoding='utf-8') as fin: + with open(file, "r", encoding="utf-8") as fin: data_raw = json.load(fin) data_raw["race_id"] = file lines.append(data_raw) return lines - def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] @@ -145,19 +135,22 @@ class RaceProcessor(DataProcessor): race_id = "%s-%s" % (set_type, data_raw["race_id"]) article = data_raw["article"] for i in range(len(data_raw["answers"])): - truth = str(ord(data_raw['answers'][i]) - ord('A')) - question = data_raw['questions'][i] - options = data_raw['options'][i] + truth = str(ord(data_raw["answers"][i]) - ord("A")) + question = data_raw["questions"][i] + options = data_raw["options"][i] examples.append( InputExample( example_id=race_id, question=question, - contexts=[article, article, article, article], # this is not efficient but convenient + contexts=[article, article, article, article], # this is not efficient but convenient endings=[options[0], options[1], options[2], options[3]], - label=truth)) + label=truth, + ) + ) return examples + class SwagProcessor(DataProcessor): """Processor for the SWAG data set.""" @@ -179,27 +172,25 @@ class SwagProcessor(DataProcessor): "setting!" ) return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test") + def get_labels(self): """See base class.""" return ["0", "1", "2", "3"] def _read_csv(self, input_file): - with open(input_file, 'r', encoding='utf-8') as f: + with open(input_file, "r", encoding="utf-8") as f: reader = csv.reader(f) lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) + line = list(unicode(cell, "utf-8") for cell in line) lines.append(line) return lines - def _create_examples(self, lines: List[List[str]], type: str): """Creates examples for the training and dev sets.""" - if type == "train" and lines[0][-1] != 'label': - raise ValueError( - "For training, the input file must contain a label column." - ) + if type == "train" and lines[0][-1] != "label": + raise ValueError("For training, the input file must contain a label column.") examples = [ InputExample( @@ -207,10 +198,11 @@ class SwagProcessor(DataProcessor): question=line[5], # in the swag dataset, the # common beginning of each # choice is stored in "sent2". - contexts = [line[4], line[4], line[4], line[4]], - endings = [line[7], line[8], line[9], line[10]], - label=line[11] - ) for line in lines[1:] # we skip the line with the column names + contexts=[line[4], line[4], line[4], line[4]], + endings=[line[7], line[8], line[9], line[10]], + label=line[11], + ) + for line in lines[1:] # we skip the line with the column names ] return examples @@ -238,15 +230,14 @@ class ArcProcessor(DataProcessor): return ["0", "1", "2", "3"] def _read_json(self, input_file): - with open(input_file, 'r', encoding='utf-8') as fin: + with open(input_file, "r", encoding="utf-8") as fin: lines = fin.readlines() return lines - def _create_examples(self, lines, type): """Creates examples for the training and dev sets.""" - #There are two types of labels. They should be normalized + # There are two types of labels. They should be normalized def normalize(truth): if truth in "ABCD": return ord(truth) - ord("A") @@ -283,12 +274,18 @@ class ArcProcessor(DataProcessor): if len(options) == 4: examples.append( InputExample( - example_id = id, + example_id=id, question=question, - contexts=[options[0]["para"].replace("_", ""), options[1]["para"].replace("_", ""), - options[2]["para"].replace("_", ""), options[3]["para"].replace("_", "")], + contexts=[ + options[0]["para"].replace("_", ""), + options[1]["para"].replace("_", ""), + options[2]["para"].replace("_", ""), + options[3]["para"].replace("_", ""), + ], endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]], - label=truth)) + label=truth, + ) + ) if type == "train": assert len(examples) > 1 @@ -316,7 +313,7 @@ def convert_examples_to_features( Loads a data file into a list of `InputFeatures` """ - label_map = {label : i for i, label in enumerate(label_list)} + label_map = {label: i for i, label in enumerate(label_list)} features = [] for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"): @@ -331,16 +328,13 @@ def convert_examples_to_features( else: text_b = example.question + " " + ending - inputs = tokenizer.encode_plus( - text_a, - text_b, - add_special_tokens=True, - max_length=max_length, - ) - if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0: - logger.info('Attention! you are cropping tokens (swag task is ok). ' - 'If you are training ARC and RACE and you are poping question + options,' - 'you need to try to use a bigger max seq length!') + inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length,) + if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0: + logger.info( + "Attention! you are cropping tokens (swag task is ok). " + "If you are training ARC and RACE and you are poping question + options," + "you need to try to use a bigger max seq length!" + ) input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] @@ -364,7 +358,6 @@ def convert_examples_to_features( assert len(token_type_ids) == max_length choices_features.append((input_ids, attention_mask, token_type_ids)) - label = label_map[example.label] if ex_index < 2: @@ -372,33 +365,17 @@ def convert_examples_to_features( logger.info("race_id: {}".format(example.example_id)) for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features): logger.info("choice: {}".format(choice_idx)) - logger.info("input_ids: {}".format(' '.join(map(str, input_ids)))) - logger.info("attention_mask: {}".format(' '.join(map(str, attention_mask)))) - logger.info("token_type_ids: {}".format(' '.join(map(str, token_type_ids)))) + logger.info("input_ids: {}".format(" ".join(map(str, input_ids)))) + logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask)))) + logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids)))) logger.info("label: {}".format(label)) - features.append( - InputFeatures( - example_id=example.example_id, - choices_features=choices_features, - label=label, - ) - ) + features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,)) return features +processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor} -processors = { - "race": RaceProcessor, - "swag": SwagProcessor, - "arc": ArcProcessor -} - - -MULTIPLE_CHOICE_TASKS_NUM_LABELS = { - "race", 4, - "swag", 4, - "arc", 4 -} +MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4} diff --git a/examples/utils_ner.py b/examples/utils_ner.py index 45ddeafbd5..d37583469c 100644 --- a/examples/utils_ner.py +++ b/examples/utils_ner.py @@ -61,9 +61,7 @@ def read_examples_from_file(data_dir, mode): for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": if words: - examples.append(InputExample(guid="{}-{}".format(mode, guid_index), - words=words, - labels=labels)) + examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels)) guid_index += 1 words = [] labels = [] @@ -76,27 +74,27 @@ def read_examples_from_file(data_dir, mode): # Examples could have no label for mode = "test" labels.append("O") if words: - examples.append(InputExample(guid="%s-%d".format(mode, guid_index), - words=words, - labels=labels)) + examples.append(InputExample(guid="%s-%d".format(mode, guid_index), words=words, labels=labels)) return examples -def convert_examples_to_features(examples, - label_list, - max_seq_length, - tokenizer, - cls_token_at_end=False, - cls_token="[CLS]", - cls_token_segment_id=1, - sep_token="[SEP]", - sep_token_extra=False, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - pad_token_label_id=-100, - sequence_a_segment_id=0, - mask_padding_with_zero=True): +def convert_examples_to_features( + examples, + label_list, + max_seq_length, + tokenizer, + cls_token_at_end=False, + cls_token="[CLS]", + cls_token_segment_id=1, + sep_token="[SEP]", + sep_token_extra=False, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + pad_token_label_id=-100, + sequence_a_segment_id=0, + mask_padding_with_zero=True, +): """ Loads a data file into a list of `InputBatch`s `cls_token_at_end` define the location of the CLS token: - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] @@ -122,8 +120,8 @@ def convert_examples_to_features(examples, # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. special_tokens_count = 3 if sep_token_extra else 2 if len(tokens) > max_seq_length - special_tokens_count: - tokens = tokens[:(max_seq_length - special_tokens_count)] - label_ids = label_ids[:(max_seq_length - special_tokens_count)] + tokens = tokens[: (max_seq_length - special_tokens_count)] + label_ids = label_ids[: (max_seq_length - special_tokens_count)] # The convention in BERT is: # (a) For sequence pairs: @@ -174,10 +172,10 @@ def convert_examples_to_features(examples, segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids label_ids = ([pad_token_label_id] * padding_length) + label_ids else: - input_ids += ([pad_token] * padding_length) - input_mask += ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids += ([pad_token_segment_id] * padding_length) - label_ids += ([pad_token_label_id] * padding_length) + input_ids += [pad_token] * padding_length + input_mask += [0 if mask_padding_with_zero else 1] * padding_length + segment_ids += [pad_token_segment_id] * padding_length + label_ids += [pad_token_label_id] * padding_length assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length @@ -194,10 +192,8 @@ def convert_examples_to_features(examples, logger.info("label_ids: %s", " ".join([str(x) for x in label_ids])) features.append( - InputFeatures(input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_ids=label_ids)) + InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids) + ) return features @@ -209,4 +205,4 @@ def get_labels(path): labels = ["O"] + labels return labels else: - return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] + return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"] diff --git a/hubconf.py b/hubconf.py index 3fa354ed5a..1d100271ae 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,9 +1,15 @@ from transformers import ( - AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering + AutoTokenizer, + AutoConfig, + AutoModel, + AutoModelWithLMHead, + AutoModelForSequenceClassification, + AutoModelForQuestionAnswering, ) from transformers.file_utils import add_start_docstrings -dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses'] +dependencies = ["torch", "tqdm", "boto3", "requests", "regex", "sentencepiece", "sacremoses"] + @add_start_docstrings(AutoConfig.__doc__) def config(*args, **kwargs): @@ -57,6 +63,7 @@ def model(*args, **kwargs): return AutoModel.from_pretrained(*args, **kwargs) + @add_start_docstrings(AutoModelWithLMHead.__doc__) def modelWithLMHead(*args, **kwargs): r""" diff --git a/setup.py b/setup.py index fe2e1526bf..59dbfef12b 100644 --- a/setup.py +++ b/setup.py @@ -38,11 +38,11 @@ from setuptools import find_packages, setup extras = { - 'serving': ['pydantic', 'uvicorn', 'fastapi'], - 'serving-tf': ['pydantic', 'uvicorn', 'fastapi', 'tensorflow'], - 'serving-torch': ['pydantic', 'uvicorn', 'fastapi', 'torch'] + "serving": ["pydantic", "uvicorn", "fastapi"], + "serving-tf": ["pydantic", "uvicorn", "fastapi", "tensorflow"], + "serving-torch": ["pydantic", "uvicorn", "fastapi", "torch"], } -extras['all'] = [package for package in extras.values()] +extras["all"] = [package for package in extras.values()] setup( name="transformers", @@ -50,30 +50,29 @@ setup( author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors", author_email="thomas@huggingface.co", description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch", - long_description=open("README.md", "r", encoding='utf-8').read(), + long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", - keywords='NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU', - license='Apache', + keywords="NLP deep learning transformer pytorch tensorflow BERT GPT GPT-2 google openai CMU", + license="Apache", url="https://github.com/huggingface/transformers", - packages=find_packages(exclude=["*.tests", "*.tests.*", - "tests.*", "tests"]), - install_requires=['numpy', - 'boto3', - 'filelock', - 'requests', - 'tqdm', - 'regex != 2019.12.17', - 'sentencepiece', - 'sacremoses'], - extras_require=extras, - scripts=[ - 'transformers-cli' + packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), + install_requires=[ + "numpy", + "boto3", + "filelock", + "requests", + "tqdm", + "regex != 2019.12.17", + "sentencepiece", + "sacremoses", ], + extras_require=extras, + scripts=["transformers-cli"], # python_requires='>=3.5.0', classifiers=[ - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python :: 3', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Artificial Intelligence", ], ) diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py index 77ce587a54..64e92f2a28 100644 --- a/templates/adding_a_new_example_script/run_xxx.py +++ b/templates/adding_a_new_example_script/run_xxx.py @@ -24,8 +24,7 @@ import glob import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler try: @@ -35,19 +34,32 @@ except: from tqdm import tqdm, trange -from transformers import (WEIGHTS_NAME, BertConfig, - BertForQuestionAnswering, BertTokenizer, - XLMConfig, XLMForQuestionAnswering, - XLMTokenizer, XLNetConfig, - XLNetForQuestionAnswering, - XLNetTokenizer, - DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) +from transformers import ( + WEIGHTS_NAME, + BertConfig, + BertForQuestionAnswering, + BertTokenizer, + XLMConfig, + XLMForQuestionAnswering, + XLMTokenizer, + XLNetConfig, + XLNetForQuestionAnswering, + XLNetTokenizer, + DistilBertConfig, + DistilBertForQuestionAnswering, + DistilBertTokenizer, +) from transformers import AdamW, get_linear_schedule_with_warmup -from utils_squad import (read_squad_examples, convert_examples_to_features, - RawResult, write_predictions, - RawResultExtended, write_predictions_extended) +from utils_squad import ( + read_squad_examples, + convert_examples_to_features, + RawResult, + write_predictions, + RawResultExtended, + write_predictions_extended, +) # The follwing import is the official SQuAD evaluation script (2.0). # You can remove it from the dependencies if you are using this script outside of the library @@ -56,16 +68,18 @@ from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad logger = logging.getLogger(__name__) -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ - for conf in (BertConfig, XLNetConfig, XLMConfig)), ()) +ALL_MODELS = sum( + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), () +) MODEL_CLASSES = { - 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), - 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), - 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), - 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) + "bert": (BertConfig, BertForQuestionAnswering, BertTokenizer), + "xlnet": (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), + "xlm": (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), + "distilbert": (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), } + def set_seed(args): random.seed(args.seed) np.random.seed(args.seed) @@ -73,9 +87,11 @@ def set_seed(args): if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) + def to_list(tensor): return tensor.detach().cpu().tolist() + def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: @@ -92,13 +108,18 @@ def train(args, train_dataset, model, tokenizer): t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] + no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], + "weight_decay": args.weight_decay, + }, + {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) if args.fp16: try: from apex import amp @@ -112,17 +133,21 @@ def train(args, train_dataset, model, tokenizer): # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank, - find_unused_parameters=True) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True + ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) - logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", - args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) @@ -136,20 +161,21 @@ def train(args, train_dataset, model, tokenizer): for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'start_positions': batch[3], - 'end_positions': batch[4]} - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[5], - 'p_mask': batch[6]}) + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "start_positions": batch[3], + "end_positions": batch[4], + } + if args.model_type != "distilbert": + inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2] + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training + loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps @@ -173,22 +199,26 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics - if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well + if ( + args.local_rank == -1 and args.evaluate_during_training + ): # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + tb_writer.add_scalar("eval_{}".format(key), value, global_step) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint - output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) - torch.save(args, os.path.join(output_dir, 'training_args.bin')) + torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: @@ -224,32 +254,31 @@ def evaluate(args, model, tokenizer, prefix=""): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1] - } - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids + inputs = {"input_ids": batch[0], "attention_mask": batch[1]} + if args.model_type != "distilbert": + inputs["token_type_ids"] = None if args.model_type == "xlm" else batch[2] # XLM don't use segment_ids example_indices = batch[3] - if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[4], - 'p_mask': batch[5]}) + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) outputs = model(**inputs) for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - if args.model_type in ['xlnet', 'xlm']: + if args.model_type in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - result = RawResultExtended(unique_id = unique_id, - start_top_log_probs = to_list(outputs[0][i]), - start_top_index = to_list(outputs[1][i]), - end_top_log_probs = to_list(outputs[2][i]), - end_top_index = to_list(outputs[3][i]), - cls_logits = to_list(outputs[4][i])) + result = RawResultExtended( + unique_id=unique_id, + start_top_log_probs=to_list(outputs[0][i]), + start_top_index=to_list(outputs[1][i]), + end_top_log_probs=to_list(outputs[2][i]), + end_top_index=to_list(outputs[3][i]), + cls_logits=to_list(outputs[4][i]), + ) else: - result = RawResult(unique_id = unique_id, - start_logits = to_list(outputs[0][i]), - end_logits = to_list(outputs[1][i])) + result = RawResult( + unique_id=unique_id, start_logits=to_list(outputs[0][i]), end_logits=to_list(outputs[1][i]) + ) all_results.append(result) # Compute predictions @@ -260,23 +289,44 @@ def evaluate(args, model, tokenizer, prefix=""): else: output_null_log_odds_file = None - if args.model_type in ['xlnet', 'xlm']: + if args.model_type in ["xlnet", "xlm"]: # XLNet uses a more complex post-processing procedure - write_predictions_extended(examples, features, all_results, args.n_best_size, - args.max_answer_length, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.predict_file, - model.config.start_n_top, model.config.end_n_top, - args.version_2_with_negative, tokenizer, args.verbose_logging) + write_predictions_extended( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + args.predict_file, + model.config.start_n_top, + model.config.end_n_top, + args.version_2_with_negative, + tokenizer, + args.verbose_logging, + ) else: - write_predictions(examples, features, all_results, args.n_best_size, - args.max_answer_length, args.do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold) + write_predictions( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + args.do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + args.verbose_logging, + args.version_2_with_negative, + args.null_score_diff_threshold, + ) # Evaluate with the official SQuAD script - evaluate_options = EVAL_OPTS(data_file=args.predict_file, - pred_file=output_prediction_file, - na_prob_file=output_null_log_odds_file) + evaluate_options = EVAL_OPTS( + data_file=args.predict_file, pred_file=output_prediction_file, na_prob_file=output_null_log_odds_file + ) results = evaluate_on_squad(evaluate_options) return results @@ -287,24 +337,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal # Load data features from cache or dataset file input_file = args.predict_file if evaluate else args.train_file - cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format( - 'dev' if evaluate else 'train', - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length))) + cached_features_file = os.path.join( + os.path.dirname(input_file), + "cached_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + ), + ) if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", input_file) - examples = read_squad_examples(input_file=input_file, - is_training=not evaluate, - version_2_with_negative=args.version_2_with_negative) - features = convert_examples_to_features(examples=examples, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=not evaluate) + examples = read_squad_examples( + input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative + ) + features = convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) @@ -320,14 +376,21 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) if evaluate: all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_example_index, all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, all_input_mask, all_segment_ids, all_example_index, all_cls_index, all_p_mask + ) else: all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions, - all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, + all_input_mask, + all_segment_ids, + all_start_positions, + all_end_positions, + all_cls_index, + all_p_mask, + ) if output_examples: return dataset, examples, features @@ -338,109 +401,190 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--train_file", default=None, type=str, required=True, - help="SQuAD json for training. E.g., train-v1.1.json") - parser.add_argument("--predict_file", default=None, type=str, required=True, - help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") - parser.add_argument("--model_type", default=None, type=str, required=True, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) - parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) - parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model checkpoints and predictions will be written.") + parser.add_argument( + "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json" + ) + parser.add_argument( + "--predict_file", + default=None, + type=str, + required=True, + help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json", + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints and predictions will be written.", + ) ## Other parameters - parser.add_argument("--config_name", default="", type=str, - help="Pretrained config name or path if not the same as model_name") - parser.add_argument("--tokenizer_name", default="", type=str, - help="Pretrained tokenizer name or path if not the same as model_name") - parser.add_argument("--cache_dir", default="", type=str, - help="Where do you want to store the pre-trained models downloaded from s3") + parser.add_argument( + "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from s3", + ) - parser.add_argument('--version_2_with_negative', action='store_true', - help='If true, the SQuAD examples contain some that do not have an answer.') - parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, - help="If null_score - best_non_null is greater than the threshold predict null.") + parser.add_argument( + "--version_2_with_negative", + action="store_true", + help="If true, the SQuAD examples contain some that do not have an answer.", + ) + parser.add_argument( + "--null_score_diff_threshold", + type=float, + default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.", + ) - parser.add_argument("--max_seq_length", default=384, type=int, - help="The maximum total input sequence length after WordPiece tokenization. Sequences " - "longer than this will be truncated, and sequences shorter than this will be padded.") - parser.add_argument("--doc_stride", default=128, type=int, - help="When splitting up a long document into chunks, how much stride to take between chunks.") - parser.add_argument("--max_query_length", default=64, type=int, - help="The maximum number of tokens for the question. Questions longer than this will " - "be truncated to this length.") - parser.add_argument("--do_train", action='store_true', - help="Whether to run training.") - parser.add_argument("--do_eval", action='store_true', - help="Whether to run eval on the dev set.") - parser.add_argument("--evaluate_during_training", action='store_true', - help="Rul evaluation during training at each logging step.") - parser.add_argument("--do_lower_case", action='store_true', - help="Set this flag if you are using an uncased model.") + parser.add_argument( + "--max_seq_length", + default=384, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.", + ) + parser.add_argument( + "--doc_stride", + default=128, + type=int, + help="When splitting up a long document into chunks, how much stride to take between chunks.", + ) + parser.add_argument( + "--max_query_length", + default=64, + type=int, + help="The maximum number of tokens for the question. Questions longer than this will " + "be truncated to this length.", + ) + parser.add_argument("--do_train", action="store_true", help="Whether to run training.") + parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." + ) - parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for training.") - parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, - help="Batch size per GPU/CPU for evaluation.") - parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") - parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumulate before performing a backward/update pass.") - parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-8, type=float, - help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, - help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=3.0, type=float, - help="Total number of training epochs to perform.") - parser.add_argument("--max_steps", default=-1, type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.") - parser.add_argument("--warmup_steps", default=0, type=int, - help="Linear warmup over warmup_steps.") - parser.add_argument("--n_best_size", default=20, type=int, - help="The total number of n-best predictions to generate in the nbest_predictions.json output file.") - parser.add_argument("--max_answer_length", default=30, type=int, - help="The maximum length of an answer that can be generated. This is needed because the start " - "and end predictions are not conditioned on one another.") - parser.add_argument("--verbose_logging", action='store_true', - help="If true, all of the warnings related to data processing will be printed. " - "A number of warnings are expected for a normal SQuAD evaluation.") + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") + parser.add_argument( + "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." + ) + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") + parser.add_argument( + "--n_best_size", + default=20, + type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json output file.", + ) + parser.add_argument( + "--max_answer_length", + default=30, + type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.", + ) + parser.add_argument( + "--verbose_logging", + action="store_true", + help="If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.", + ) - parser.add_argument('--logging_steps', type=int, default=50, - help="Log every X updates steps.") - parser.add_argument('--save_steps', type=int, default=50, - help="Save checkpoint every X updates steps.") - parser.add_argument("--eval_all_checkpoints", action='store_true', - help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") - parser.add_argument("--no_cuda", action='store_true', - help="Whether not to use CUDA when available") - parser.add_argument('--overwrite_output_dir', action='store_true', - help="Overwrite the content of the output directory") - parser.add_argument('--overwrite_cache', action='store_true', - help="Overwrite the cached training and evaluation sets") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") + parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") + parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") + parser.add_argument( + "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument("--local_rank", type=int, default=-1, - help="local_rank for distributed training on gpus") - parser.add_argument('--fp16', action='store_true', - help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") - parser.add_argument('--fp16_opt_level', type=str, default='O1', - help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." - "See details at https://nvidia.github.io/apex/amp.html") - parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") - parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument( + "--fp16", + action="store_true", + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") + parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd + print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() @@ -452,16 +596,24 @@ def main(): else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) - torch.distributed.init_process_group(backend='nccl') + torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging - logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", - args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.fp16, + ) # Set seed set_seed(args) @@ -472,15 +624,21 @@ def main(): args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, - cache_dir=args.cache_dir if args.cache_dir else None) - tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, - do_lower_case=args.do_lower_case, - cache_dir=args.cache_dir if args.cache_dir else None) - model = model_class.from_pretrained(args.model_name_or_path, - from_tf=bool('.ckpt' in args.model_name_or_path), - config=config, - cache_dir=args.cache_dir if args.cache_dir else None) + config = config_class.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = tokenizer_class.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + model = model_class.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab @@ -495,7 +653,8 @@ def main(): if args.fp16: try: import apex - apex.amp.register_half_function(torch, 'einsum') + + apex.amp.register_half_function(torch, "einsum") except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") @@ -505,7 +664,6 @@ def main(): global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Save the trained model and the tokenizer if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed @@ -515,39 +673,42 @@ def main(): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model - torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) - # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory results = {} if args.do_eval and args.local_rank in [-1, 0]: checkpoints = [args.output_dir] if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + checkpoints = list( + os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) + ) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: # Reload the model - global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) # Evaluate result = evaluate(args, model, tokenizer, prefix=global_step) - result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items()) + result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items()) results.update(result) logger.info("Results: {}".format(results)) diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py index 3f4145e028..bd016bd306 100644 --- a/templates/adding_a_new_example_script/utils_xxx.py +++ b/templates/adding_a_new_example_script/utils_xxx.py @@ -1,4 +1,3 @@ - # coding=utf-8 # Copyright 2018 XXX. All rights reserved. # @@ -37,14 +36,16 @@ class SquadExample(object): For examples without an answer, the start and end position are -1. """ - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): + def __init__( + self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=None, + ): self.qas_id = qas_id self.question_text = question_text self.doc_tokens = doc_tokens @@ -59,8 +60,7 @@ class SquadExample(object): def __repr__(self): s = "" s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) + s += ", question_text: %s" % (self.question_text) s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) @@ -74,22 +74,24 @@ class SquadExample(object): class InputFeatures(object): """A single set of features of data.""" - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - cls_index, - p_mask, - paragraph_len, - start_position=None, - end_position=None, - is_impossible=None): + def __init__( + self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + cls_index, + p_mask, + paragraph_len, + start_position=None, + end_position=None, + is_impossible=None, + ): self.unique_id = unique_id self.example_index = example_index self.doc_span_index = doc_span_index @@ -109,7 +111,7 @@ class InputFeatures(object): def read_squad_examples(input_file, is_training, version_2_with_negative): """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r", encoding='utf-8') as reader: + with open(input_file, "r", encoding="utf-8") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): @@ -146,8 +148,7 @@ def read_squad_examples(input_file, is_training, version_2_with_negative): if version_2_with_negative: is_impossible = qa["is_impossible"] if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") + raise ValueError("For training, each question should have exactly 1 answer.") if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] @@ -161,12 +162,10 @@ def read_squad_examples(input_file, is_training, version_2_with_negative): # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) + actual_text = " ".join(doc_tokens[start_position : (end_position + 1)]) + cleaned_answer_text = " ".join(whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. '%s'", - actual_text, cleaned_answer_text) + logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue else: start_position = -1 @@ -180,18 +179,29 @@ def read_squad_examples(input_file, is_training, version_2_with_negative): orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, - is_impossible=is_impossible) + is_impossible=is_impossible, + ) examples.append(example) return examples -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - cls_token_at_end=False, - cls_token='[CLS]', sep_token='[SEP]', pad_token=0, - sequence_a_segment_id=0, sequence_b_segment_id=1, - cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True): +def convert_examples_to_features( + examples, + tokenizer, + max_seq_length, + doc_stride, + max_query_length, + is_training, + cls_token_at_end=False, + cls_token="[CLS]", + sep_token="[SEP]", + pad_token=0, + sequence_a_segment_id=0, + sequence_b_segment_id=1, + cls_token_segment_id=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, +): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -232,8 +242,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text + ) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 @@ -241,8 +251,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, # We can have documents that are longer than the maximum sequence length. # To deal with this we do a sliding window approach, where we take chunks # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) + _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) # pylint: disable=invalid-name doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): @@ -287,8 +296,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, split_token_index = doc_span.start + i token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) + is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) token_is_max_context[len(tokens)] = is_max_context tokens.append(all_doc_tokens[split_token_index]) segment_ids.append(sequence_b_segment_id) @@ -333,8 +341,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): out_of_span = True if out_of_span: start_position = 0 @@ -355,24 +362,23 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, logger.info("example_index: %s" % (example_index)) logger.info("doc_span_index: %s" % (doc_span_index)) logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join([ - "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - ])) + logger.info( + "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) + ) + logger.info( + "token_is_max_context: %s" + % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) + ) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if is_training and span_is_impossible: logger.info("impossible example") if is_training and not span_is_impossible: - answer_text = " ".join(tokens[start_position:(end_position + 1)]) + answer_text = " ".join(tokens[start_position : (end_position + 1)]) logger.info("start_position: %d" % (start_position)) logger.info("end_position: %d" % (end_position)) - logger.info( - "answer: %s" % (answer_text)) + logger.info("answer: %s" % (answer_text)) features.append( InputFeatures( @@ -390,14 +396,15 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, paragraph_len=paragraph_len, start_position=start_position, end_position=end_position, - is_impossible=span_is_impossible)) + is_impossible=span_is_impossible, + ) + ) unique_id += 1 return features -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): """Returns tokenized answer spans that better match the annotated answer.""" # The SQuAD annotations are character based. We first project them to @@ -426,7 +433,7 @@ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) if text_span == tok_answer_text: return (new_start, new_end) @@ -470,13 +477,23 @@ def _check_is_max_context(doc_spans, cur_span_index, position): return cur_span_index == best_span_index -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) +RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"]) -def write_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): + +def write_predictions( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + verbose_logging, + version_2_with_negative, + null_score_diff_threshold, +): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -490,8 +507,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, unique_id_to_result[result.unique_id] = result _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] + ) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() @@ -544,7 +561,9 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) + end_logit=result.end_logits[end_index], + ) + ) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction( @@ -552,14 +571,14 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, start_index=0, end_index=0, start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) + end_logit=null_end_logit, + ) + ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) seen_predictions = {} nbest = [] @@ -568,10 +587,10 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. @@ -592,31 +611,21 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, final_text = "" seen_predictions[final_text] = True - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) - + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) + # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. - if len(nbest)==1: - nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + if len(nbest) == 1: + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 @@ -645,8 +654,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" @@ -668,29 +676,40 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, # For XLNet (and XLM which uses the same head) -RawResultExtended = collections.namedtuple("RawResultExtended", - ["unique_id", "start_top_log_probs", "start_top_index", - "end_top_log_probs", "end_top_index", "cls_logits"]) +RawResultExtended = collections.namedtuple( + "RawResultExtended", + ["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits"], +) -def write_predictions_extended(all_examples, all_features, all_results, n_best_size, - max_answer_length, output_prediction_file, - output_nbest_file, - output_null_log_odds_file, orig_data_file, - start_n_top, end_n_top, version_2_with_negative, - tokenizer, verbose_logging): +def write_predictions_extended( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + orig_data_file, + start_n_top, + end_n_top, + version_2_with_negative, + tokenizer, + verbose_logging, +): """ XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of null if needed. Requires utils_squad_evaluate.py """ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", - "start_log_prob", "end_log_prob"]) + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] + ) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] + ) logger.info("Writing predictions to: %s", output_prediction_file) # logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -754,12 +773,13 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s start_index=start_index, end_index=end_index, start_log_prob=start_log_prob, - end_log_prob=end_log_prob)) + end_log_prob=end_log_prob, + ) + ) prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_log_prob + x.end_log_prob), - reverse=True) + prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True + ) seen_predictions = {} nbest = [] @@ -770,7 +790,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s # XLNet un-tokenizer # Let's keep it simple for now and see if we need all this later. - # + # # tok_start_to_orig_index = feature.tok_start_to_orig_index # tok_end_to_orig_index = feature.tok_end_to_orig_index # start_orig_pos = tok_start_to_orig_index[pred.start_index] @@ -779,10 +799,10 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # Clean whitespace @@ -790,8 +810,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, - verbose_logging) + final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging) if final_text in seen_predictions: continue @@ -799,17 +818,13 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s seen_predictions[final_text] = True nbest.append( - _NbestPrediction( - text=final_text, - start_log_prob=pred.start_log_prob, - end_log_prob=pred.end_log_prob)) + _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) + ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: - nbest.append( - _NbestPrediction(text="", start_log_prob=-1e6, - end_log_prob=-1e6)) + nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) total_scores = [] best_non_null_entry = None @@ -850,7 +865,7 @@ def write_predictions_extended(all_examples, all_features, all_results, n_best_s with open(output_null_log_odds_file, "w") as writer: writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - with open(orig_data_file, "r", encoding='utf-8') as reader: + with open(orig_data_file, "r", encoding="utf-8") as reader: orig_data = json.load(reader)["data"] qid_to_has_ans = make_qid_to_has_ans(orig_data) @@ -914,8 +929,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 @@ -924,8 +938,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) return orig_text # We then project the characters in `pred_text` back to `orig_text` using @@ -956,7 +969,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): logger.info("Couldn't map end position") return orig_text - output_text = orig_text[orig_start_position:(orig_end_position + 1)] + output_text = orig_text[orig_start_position : (orig_end_position + 1)] return output_text diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py index 12d69799a9..370fbb569f 100644 --- a/templates/adding_a_new_model/configuration_xxx.py +++ b/templates/adding_a_new_model/configuration_xxx.py @@ -27,8 +27,8 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json", - 'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json", + "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json", + "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json", } @@ -63,24 +63,26 @@ class XxxConfig(PretrainedConfig): """ pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=50257, - n_positions=1024, - n_ctx=1024, - n_embd=768, - n_layer=12, - n_head=12, - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - summary_type='cls_index', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - **kwargs): + def __init__( + self, + vocab_size=50257, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs + ): super(XxxConfig, self).__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py index 9d389deaad..99d3761496 100755 --- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py +++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py @@ -24,8 +24,10 @@ import torch from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx import logging + logging.basicConfig(level=logging.INFO) + def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): # Initialise PyTorch model config = XxxConfig.from_json_file(config_file) @@ -43,23 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--config_file", - default = None, - type = str, - required = True, - help = "The config json file corresponding to the pre-trained model. \n" - "This specifies the model architecture.") - parser.add_argument("--pytorch_dump_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.config_file, - args.pytorch_dump_path) + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index 1783620998..a4477704ae 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -44,8 +44,8 @@ logger = logging.getLogger(__name__) # for the pretrained weights provided with the models #################################################### TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-tf_model.h5", - 'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5", + "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-tf_model.h5", + "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5", } #################################################### @@ -69,9 +69,9 @@ TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = { class TFXxxLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFXxxLayer, self).__init__(**kwargs) - self.attention = TFXxxAttention(config, name='attention') - self.intermediate = TFXxxIntermediate(config, name='intermediate') - self.transformer_output = TFXxxOutput(config, name='output') + self.attention = TFXxxAttention(config, name="attention") + self.intermediate = TFXxxIntermediate(config, name="intermediate") + self.transformer_output = TFXxxOutput(config, name="output") def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs @@ -98,7 +98,9 @@ class TFXxxMainLayer(tf.keras.layers.Layer): def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + def call( + self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False + ): # We allow three types of multi-inputs: # - traditional keyword arguments in the call method # - all the arguments provided as a dict in the first positional argument of call @@ -113,11 +115,11 @@ class TFXxxMainLayer(tf.keras.layers.Layer): head_mask = inputs[4] if len(inputs) > 4 else head_mask assert len(inputs) <= 5, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) assert len(inputs) <= 5, "Too many inputs." else: input_ids = inputs @@ -175,6 +177,7 @@ class TFXxxPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XxxConfig pretrained_model_archive_map = TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -263,8 +266,12 @@ XXX_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.", - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.", + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class TFXxxModel(TFXxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -297,17 +304,19 @@ class TFXxxModel(TFXxxPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFXxxModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXxxMainLayer(config, name='transformer') + self.transformer = TFXxxMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs -@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) +@add_start_docstrings( + """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING +) class TFXxxForMaskedLM(TFXxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -333,26 +342,30 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel): prediction_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXxxMainLayer(config, name='transformer') - self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name='mlm') + self.transformer = TFXxxMainLayer(config, name="transformer") + self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) + prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here return outputs # prediction_scores, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class TFXxxForSequenceClassification(TFXxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -378,22 +391,23 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXxxMainLayer(config, name='transformer') + self.transformer = TFXxxMainLayer(config, name="transformer") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) + pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here @@ -401,9 +415,12 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel): return outputs # logits, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """Xxx Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class TFXxxForTokenClassification(TFXxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -429,22 +446,23 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel): scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXxxMainLayer(config, name='transformer') + self.transformer = TFXxxMainLayer(config, name="transformer") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here @@ -452,9 +470,12 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel): return outputs # scores, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class TFXxxForQuestionAnswering(TFXxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -482,14 +503,15 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel): start_scores, end_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXxxMainLayer(config, name='transformer') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='qa_outputs') + self.transformer = TFXxxMainLayer(config, name="transformer") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index 4c325196eb..7270376ec7 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -44,8 +44,8 @@ logger = logging.getLogger(__name__) # for the pretrained weights provided with the models #################################################### XXX_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-pytorch_model.bin", - 'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin", + "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-pytorch_model.bin", + "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin", } #################################################### @@ -60,8 +60,10 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -76,7 +78,7 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): arrays.append(array) for name, array in zip(names, arrays): - name = name.split('/') + name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m", "global_step"] for n in name): @@ -84,18 +86,18 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): continue pointer = model for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + l = re.split(r"_(\d+)", m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': - pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - elif l[0] == 'squad': - pointer = getattr(pointer, 'classifier') + if l[0] == "kernel" or l[0] == "gamma": + pointer = getattr(pointer, "weight") + elif l[0] == "output_bias" or l[0] == "beta": + pointer = getattr(pointer, "bias") + elif l[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif l[0] == "squad": + pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, l[0]) @@ -105,9 +107,9 @@ def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): if len(l) >= 2: num = int(l[1]) pointer = pointer[num] - if m_name[-11:] == '_embeddings': - pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape @@ -147,7 +149,6 @@ class XxxLayer(nn.Module): return outputs - #################################################### # PreTrainedModel is a sub-class of torch.nn.Module # which take care of loading and saving pretrained weights @@ -161,6 +162,7 @@ class XxxPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XxxConfig pretrained_model_archive_map = XXX_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_xxx @@ -246,8 +248,12 @@ XXX_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.", - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.", + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class XxxModel(XxxPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -277,6 +283,7 @@ class XxxModel(XxxPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(XxxModel, self).__init__(config) @@ -300,7 +307,15 @@ class XxxModel(XxxPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -329,7 +344,7 @@ class XxxModel(XxxPreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 # Prepare head mask if needed @@ -342,14 +357,20 @@ class XxxModel(XxxPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers ################################## # Replace this with your model code - embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds) + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask) sequence_output = encoder_outputs[0] outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here @@ -357,8 +378,9 @@ class XxxModel(XxxPreTrainedModel): return outputs # sequence_output, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) +@add_start_docstrings( + """Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING +) class XxxForMaskedLM(XxxPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -389,6 +411,7 @@ class XxxForMaskedLM(XxxPreTrainedModel): loss, prediction_scores = outputs[:2] """ + def __init__(self, config): super(XxxForMaskedLM, self).__init__(config) @@ -400,15 +423,25 @@ class XxxForMaskedLM(XxxPreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + ): - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) @@ -422,9 +455,12 @@ class XxxForMaskedLM(XxxPreTrainedModel): return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class XxxForSequenceClassification(XxxPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -456,6 +492,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(XxxForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -466,15 +503,25 @@ class XxxForSequenceClassification(XxxPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) pooled_output = outputs[1] @@ -496,9 +543,12 @@ class XxxForSequenceClassification(XxxPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """Xxx Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class XxxForTokenClassification(XxxPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -528,6 +578,7 @@ class XxxForTokenClassification(XxxPreTrainedModel): loss, scores = outputs[:2] """ + def __init__(self, config): super(XxxForTokenClassification, self).__init__(config) self.num_labels = config.num_labels @@ -538,15 +589,25 @@ class XxxForTokenClassification(XxxPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] @@ -569,9 +630,12 @@ class XxxForTokenClassification(XxxPreTrainedModel): return outputs # (loss), scores, (hidden_states), (attentions) -@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) + XXX_START_DOCSTRING, + XXX_INPUTS_DOCSTRING, +) class XxxForQuestionAnswering(XxxPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -613,6 +677,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): """ + def __init__(self, config): super(XxxForQuestionAnswering, self).__init__(config) self.num_labels = config.num_labels @@ -622,15 +687,26 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - start_positions=None, end_positions=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py index 6eba932a8e..1e4f64042a 100644 --- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,46 +27,57 @@ from transformers import XxxConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_xxx import (TFXxxModel, TFXxxForMaskedLM, - TFXxxForSequenceClassification, - TFXxxForTokenClassification, - TFXxxForQuestionAnswering, - TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_xxx import ( + TFXxxModel, + TFXxxForMaskedLM, + TFXxxForSequenceClassification, + TFXxxForTokenClassification, + TFXxxForQuestionAnswering, + TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering, - TFXxxForSequenceClassification, - TFXxxForTokenClassification) if is_tf_available() else () + all_model_classes = ( + ( + TFXxxModel, + TFXxxForMaskedLM, + TFXxxForQuestionAnswering, + TFXxxForSequenceClassification, + TFXxxForTokenClassification, + ) + if is_tf_available() + else () + ) class TFXxxModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -120,15 +131,16 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFXxxModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output, pooled_output = model(inputs) inputs = [input_ids, input_mask] @@ -141,78 +153,74 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): "pooled_output": pooled_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) - - def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFXxxForMaskedLM(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - prediction_scores, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (prediction_scores,) = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - - def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFXxxForSequenceClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) - - def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFXxxForTokenClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] + ) - - def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFXxxForQuestionAnswering(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} start_logits, end_logits = model(inputs) result = { "start_logits": start_logits.numpy(), "end_logits": end_logits.numpy(), } - self.parent.assertListEqual( - list(result["start_logits"].shape), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].shape), - [self.batch_size, self.seq_length]) - + self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -244,9 +252,10 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - for model_name in ['xxx-base-uncased']: + for model_name in ["xxx-base-uncased"]: model = TFXxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py index 5e22392d00..2043d79655 100644 --- a/templates/adding_a_new_model/tests/modeling_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py @@ -20,51 +20,60 @@ import unittest from transformers import is_torch_available -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): - from transformers import (XxxConfig, XxxModel, XxxForMaskedLM, - XxxForNextSentencePrediction, XxxForPreTraining, - XxxForQuestionAnswering, XxxForSequenceClassification, - XxxForTokenClassification, XxxForMultipleChoice) + from transformers import ( + XxxConfig, + XxxModel, + XxxForMaskedLM, + XxxForNextSentencePrediction, + XxxForPreTraining, + XxxForQuestionAnswering, + XxxForSequenceClassification, + XxxForTokenClassification, + XxxForMultipleChoice, + ) from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP @require_torch class XxxModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, - XxxForSequenceClassification, - XxxForTokenClassification) if is_torch_available() else () + all_model_classes = ( + (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, XxxForSequenceClassification, XxxForTokenClassification) + if is_torch_available() + else () + ) class XxxModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -118,16 +127,17 @@ class XxxModelTest(CommonTestCases.CommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = XxxModel(config=config) model.to(torch_device) model.eval() @@ -140,83 +150,98 @@ class XxxModelTest(CommonTestCases.CommonModelTester): "pooled_output": pooled_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - - def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = XxxForMaskedLM(config=config) model.to(torch_device) model.eval() - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels) + loss, prediction_scores = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels + ) result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - - def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = XxxForQuestionAnswering(config=config) model.to(torch_device) model.eval() - loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - start_positions=sequence_labels, end_positions=sequence_labels) + loss, start_logits, end_logits = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) result = { "loss": loss, "start_logits": start_logits, "end_logits": end_logits, } - self.parent.assertListEqual( - list(result["start_logits"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].size()), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) - - def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = XxxForSequenceClassification(config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels + ) result = { "loss": loss, "logits": logits, } - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels]) self.check_loss_output(result) - - def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_xxx_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = XxxForTokenClassification(config=config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels + ) result = { "loss": loss, "logits": logits, } self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels] + ) self.check_loss_output(result) - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -252,5 +277,6 @@ class XxxModelTest(CommonTestCases.CommonModelTester): model = XxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/templates/adding_a_new_model/tests/tokenization_xxx_test.py b/templates/adding_a_new_model/tests/tokenization_xxx_test.py index 116083edc8..940de5c769 100644 --- a/templates/adding_a_new_model/tests/tokenization_xxx_test.py +++ b/templates/adding_a_new_model/tests/tokenization_xxx_test.py @@ -18,10 +18,11 @@ import os import unittest from io import open -from transformers.tokenization_bert import (XxxTokenizer, VOCAB_FILES_NAMES) +from transformers.tokenization_bert import XxxTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases + class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = XxxTokenizer @@ -30,28 +31,39 @@ class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester): super(XxxTokenizationTest, self).setUp() vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing", ",", "low", "lowest", + "[UNK]", + "[CLS]", + "[SEP]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_tokenizer(self, **kwargs): return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"UNwant\u00E9d,running" - output_text = u"unwanted, running" + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00E9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py index 7a10a41e5a..c1ea93a6d2 100644 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ b/templates/adding_a_new_model/tokenization_xxx.py @@ -34,17 +34,16 @@ logger = logging.getLogger(__name__) # Mapping from the keyword arguments names of Tokenizer `__init__` # to file names for serializing Tokenizer instances #################################################### -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to pretrained vocabulary URL for all the model shortcut names. #################################################### PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt", - 'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt", + "vocab_file": { + "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt", + "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt", } } @@ -52,8 +51,8 @@ PRETRAINED_VOCAB_FILES_MAP = { # Mapping from model shortcut names to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'xxx-base-uncased': 512, - 'xxx-large-uncased': 512, + "xxx-base-uncased": 512, + "xxx-large-uncased": 512, } #################################################### @@ -62,8 +61,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { # To be used for checkpoint specific configurations. #################################################### PRETRAINED_INIT_CONFIGURATION = { - 'xxx-base-uncased': {'do_lower_case': True}, - 'xxx-large-uncased': {'do_lower_case': True}, + "xxx-base-uncased": {"do_lower_case": True}, + "xxx-large-uncased": {"do_lower_case": True}, } @@ -73,7 +72,7 @@ def load_vocab(vocab_file): with open(vocab_file, "r", encoding="utf-8") as reader: tokens = reader.readlines() for index, token in enumerate(tokens): - token = token.rstrip('\n') + token = token.rstrip("\n") vocab[token] = index return vocab @@ -93,9 +92,17 @@ class XxxTokenizer(PreTrainedTokenizer): pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, do_lower_case=True, - unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", - mask_token="[MASK]", **kwargs): + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): """Constructs a XxxTokenizer. Args: @@ -104,16 +111,22 @@ class XxxTokenizer(PreTrainedTokenizer): Whether to lower case the input Only has an effect when do_basic_tokenize=True """ - super(XxxTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, **kwargs) + super(XxxTokenizer, self).__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) self.vocab = load_vocab(vocab_file) @property @@ -142,7 +155,7 @@ class XxxTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ' '.join(tokens).replace(' ##', '').strip() + out_string = " ".join(tokens).replace(" ##", "").strip() return out_string def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -177,8 +190,10 @@ class XxxTokenizer(PreTrainedTokenizer): if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: @@ -204,15 +219,17 @@ class XxxTokenizer(PreTrainedTokenizer): """Save the tokenizer vocabulary to a directory or file.""" index = 0 if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) else: vocab_file = vocab_path with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!".format(vocab_file)) + logger.warning( + "Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file) + ) index = token_index - writer.write(token + u'\n') + writer.write(token + "\n") index += 1 return (vocab_file,) diff --git a/transformers/__init__.py b/transformers/__init__.py index 017fe476e7..318cd5ce4e 100755 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -6,8 +6,9 @@ __version__ = "2.3.0" # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 try: import absl.logging - absl.logging.set_verbosity('info') - absl.logging.set_stderrthreshold('info') + + absl.logging.set_verbosity("info") + absl.logging.set_stderrthreshold("info") absl.logging._warn_preinit_stderr = False except: pass @@ -17,19 +18,41 @@ import logging logger = logging.getLogger(__name__) # pylint: disable=invalid-name # Files and general utilities -from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, - cached_path, add_start_docstrings, add_end_docstrings, - WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME, - is_tf_available, is_torch_available) +from .file_utils import ( + TRANSFORMERS_CACHE, + PYTORCH_TRANSFORMERS_CACHE, + PYTORCH_PRETRAINED_BERT_CACHE, + cached_path, + add_start_docstrings, + add_end_docstrings, + WEIGHTS_NAME, + TF2_WEIGHTS_NAME, + TF_WEIGHTS_NAME, + CONFIG_NAME, + MODEL_CARD_NAME, + is_tf_available, + is_torch_available, +) -from .data import (is_sklearn_available, - InputExample, InputFeatures, DataProcessor, - SingleSentenceClassificationProcessor, - glue_output_modes, glue_convert_examples_to_features, - glue_processors, glue_tasks_num_labels, - xnli_output_modes, xnli_processors, xnli_tasks_num_labels, - squad_convert_examples_to_features, SquadFeatures, - SquadExample, SquadV1Processor, SquadV2Processor) +from .data import ( + is_sklearn_available, + InputExample, + InputFeatures, + DataProcessor, + SingleSentenceClassificationProcessor, + glue_output_modes, + glue_convert_examples_to_features, + glue_processors, + glue_tasks_num_labels, + xnli_output_modes, + xnli_processors, + xnli_tasks_num_labels, + squad_convert_examples_to_features, + SquadFeatures, + SquadExample, + SquadV1Processor, + SquadV2Processor, +) if is_sklearn_available(): from .data import glue_compute_metrics, xnli_compute_metrics @@ -38,12 +61,12 @@ if is_sklearn_available(): from .modelcard import ModelCard # Tokenizers -from .tokenization_utils import (PreTrainedTokenizer) +from .tokenization_utils import PreTrainedTokenizer from .tokenization_auto import AutoTokenizer from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer from .tokenization_openai import OpenAIGPTTokenizer -from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) +from .tokenization_transfo_xl import TransfoXLTokenizer, TransfoXLCorpus from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_ctrl import CTRLTokenizer from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE @@ -75,143 +98,281 @@ from .configuration_mmbt import MMBTConfig # Modeling if is_torch_available(): - from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) - from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, - AutoModelWithLMHead, AutoModelForTokenClassification, ALL_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_utils import PreTrainedModel, prune_layer, Conv1D + from .modeling_auto import ( + AutoModel, + AutoModelForSequenceClassification, + AutoModelForQuestionAnswering, + AutoModelWithLMHead, + AutoModelForTokenClassification, + ALL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, - BertForMaskedLM, BertForNextSentencePrediction, - BertForSequenceClassification, BertForMultipleChoice, - BertForTokenClassification, BertForQuestionAnswering, - load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, - OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, - load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, - AdaptiveEmbedding, - load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, - GPT2LMHeadModel, GPT2DoubleHeadsModel, - load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_ctrl import (CTRLPreTrainedModel, CTRLModel, - CTRLLMHeadModel, - CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, - XLNetForSequenceClassification, XLNetForTokenClassification, - XLNetForMultipleChoice, XLNetForQuestionAnsweringSimple, - XLNetForQuestionAnswering, load_tf_weights_in_xlnet, - XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_xlm import (XLMPreTrainedModel , XLMModel, - XLMWithLMHeadModel, XLMForSequenceClassification, - XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, - XLM_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, - RobertaForSequenceClassification, RobertaForMultipleChoice, - RobertaForTokenClassification, RobertaForQuestionAnswering, - ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel, - DistilBertForSequenceClassification, DistilBertForQuestionAnswering, - DistilBertForTokenClassification, - DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_camembert import (CamembertForMaskedLM, CamembertModel, - CamembertForSequenceClassification, CamembertForMultipleChoice, - CamembertForTokenClassification, - CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_bert import ( + BertPreTrainedModel, + BertModel, + BertForPreTraining, + BertForMaskedLM, + BertForNextSentencePrediction, + BertForSequenceClassification, + BertForMultipleChoice, + BertForTokenClassification, + BertForQuestionAnswering, + load_tf_weights_in_bert, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_openai import ( + OpenAIGPTPreTrainedModel, + OpenAIGPTModel, + OpenAIGPTLMHeadModel, + OpenAIGPTDoubleHeadsModel, + load_tf_weights_in_openai_gpt, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_transfo_xl import ( + TransfoXLPreTrainedModel, + TransfoXLModel, + TransfoXLLMHeadModel, + AdaptiveEmbedding, + load_tf_weights_in_transfo_xl, + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_gpt2 import ( + GPT2PreTrainedModel, + GPT2Model, + GPT2LMHeadModel, + GPT2DoubleHeadsModel, + load_tf_weights_in_gpt2, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_ctrl import CTRLPreTrainedModel, CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP + from .modeling_xlnet import ( + XLNetPreTrainedModel, + XLNetModel, + XLNetLMHeadModel, + XLNetForSequenceClassification, + XLNetForTokenClassification, + XLNetForMultipleChoice, + XLNetForQuestionAnsweringSimple, + XLNetForQuestionAnswering, + load_tf_weights_in_xlnet, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_xlm import ( + XLMPreTrainedModel, + XLMModel, + XLMWithLMHeadModel, + XLMForSequenceClassification, + XLMForQuestionAnswering, + XLMForQuestionAnsweringSimple, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_roberta import ( + RobertaForMaskedLM, + RobertaModel, + RobertaForSequenceClassification, + RobertaForMultipleChoice, + RobertaForTokenClassification, + RobertaForQuestionAnswering, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_distilbert import ( + DistilBertPreTrainedModel, + DistilBertForMaskedLM, + DistilBertModel, + DistilBertForSequenceClassification, + DistilBertForQuestionAnswering, + DistilBertForTokenClassification, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_camembert import ( + CamembertForMaskedLM, + CamembertModel, + CamembertForSequenceClassification, + CamembertForMultipleChoice, + CamembertForTokenClassification, + CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model - from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel, - load_tf_weights_in_t5, - T5_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_albert import (AlbertPreTrainedModel, AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, - AlbertForQuestionAnswering, - load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) - from .modeling_xlm_roberta import (XLMRobertaForMaskedLM, XLMRobertaModel, XLMRobertaForMultipleChoice, - XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification) + from .modeling_t5 import ( + T5PreTrainedModel, + T5Model, + T5WithLMHeadModel, + load_tf_weights_in_t5, + T5_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_albert import ( + AlbertPreTrainedModel, + AlbertModel, + AlbertForMaskedLM, + AlbertForSequenceClassification, + AlbertForQuestionAnswering, + load_tf_weights_in_albert, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + from .modeling_xlm_roberta import ( + XLMRobertaForMaskedLM, + XLMRobertaModel, + XLMRobertaForMultipleChoice, + XLMRobertaForSequenceClassification, + XLMRobertaForTokenClassification, + ) from .modeling_mmbt import ModalEmbeddings, MMBTModel, MMBTForClassification # Optimization - from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, - get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup) + from .optimization import ( + AdamW, + get_constant_schedule, + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_linear_schedule_with_warmup, + ) # TensorFlow if is_tf_available(): from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list - from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, - TFAutoModelWithLMHead, TFAutoModelForTokenClassification, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_auto import ( + TFAutoModel, + TFAutoModelForSequenceClassification, + TFAutoModelForQuestionAnswering, + TFAutoModelWithLMHead, + TFAutoModelForTokenClassification, + TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings, - TFBertModel, TFBertForPreTraining, - TFBertForMaskedLM, TFBertForNextSentencePrediction, - TFBertForSequenceClassification, TFBertForMultipleChoice, - TFBertForTokenClassification, TFBertForQuestionAnswering, - TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_bert import ( + TFBertPreTrainedModel, + TFBertMainLayer, + TFBertEmbeddings, + TFBertModel, + TFBertForPreTraining, + TFBertForMaskedLM, + TFBertForNextSentencePrediction, + TFBertForSequenceClassification, + TFBertForMultipleChoice, + TFBertForTokenClassification, + TFBertForQuestionAnswering, + TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_gpt2 import (TFGPT2PreTrainedModel, TFGPT2MainLayer, - TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel, - TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_gpt2 import ( + TFGPT2PreTrainedModel, + TFGPT2MainLayer, + TFGPT2Model, + TFGPT2LMHeadModel, + TFGPT2DoubleHeadsModel, + TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_openai import (TFOpenAIGPTPreTrainedModel, TFOpenAIGPTMainLayer, - TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel, - TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_openai import ( + TFOpenAIGPTPreTrainedModel, + TFOpenAIGPTMainLayer, + TFOpenAIGPTModel, + TFOpenAIGPTLMHeadModel, + TFOpenAIGPTDoubleHeadsModel, + TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_transfo_xl import (TFTransfoXLPreTrainedModel, TFTransfoXLMainLayer, - TFTransfoXLModel, TFTransfoXLLMHeadModel, - TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_transfo_xl import ( + TFTransfoXLPreTrainedModel, + TFTransfoXLMainLayer, + TFTransfoXLModel, + TFTransfoXLLMHeadModel, + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer, - TFXLNetModel, TFXLNetLMHeadModel, - TFXLNetForSequenceClassification, - TFXLNetForTokenClassification, - TFXLNetForQuestionAnsweringSimple, - TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_xlnet import ( + TFXLNetPreTrainedModel, + TFXLNetMainLayer, + TFXLNetModel, + TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForTokenClassification, + TFXLNetForQuestionAnsweringSimple, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer, - TFXLMModel, TFXLMWithLMHeadModel, - TFXLMForSequenceClassification, - TFXLMForQuestionAnsweringSimple, - TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_xlm import ( + TFXLMPreTrainedModel, + TFXLMMainLayer, + TFXLMModel, + TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer, - TFRobertaModel, TFRobertaForMaskedLM, - TFRobertaForSequenceClassification, - TFRobertaForTokenClassification, - TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_roberta import ( + TFRobertaPreTrainedModel, + TFRobertaMainLayer, + TFRobertaModel, + TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer, - TFDistilBertModel, TFDistilBertForMaskedLM, - TFDistilBertForSequenceClassification, - TFDistilBertForTokenClassification, - TFDistilBertForQuestionAnswering, - TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_distilbert import ( + TFDistilBertPreTrainedModel, + TFDistilBertMainLayer, + TFDistilBertModel, + TFDistilBertForMaskedLM, + TFDistilBertForSequenceClassification, + TFDistilBertForTokenClassification, + TFDistilBertForQuestionAnswering, + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_ctrl import (TFCTRLPreTrainedModel, TFCTRLModel, - TFCTRLLMHeadModel, - TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_ctrl import ( + TFCTRLPreTrainedModel, + TFCTRLModel, + TFCTRLLMHeadModel, + TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_albert import (TFAlbertPreTrainedModel, TFAlbertModel, TFAlbertForMaskedLM, - TFAlbertForSequenceClassification, - TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_albert import ( + TFAlbertPreTrainedModel, + TFAlbertModel, + TFAlbertForMaskedLM, + TFAlbertForSequenceClassification, + TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) - from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel, - TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_t5 import TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP # Optimization - from .optimization_tf import (WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator) + from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator # TF 2.0 <=> PyTorch conversion utilities -from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name, - load_pytorch_checkpoint_in_tf2_model, - load_pytorch_weights_in_tf2_model, - load_pytorch_model_in_tf2_model, - load_tf2_checkpoint_in_pytorch_model, - load_tf2_weights_in_pytorch_model, - load_tf2_model_in_pytorch_model) +from .modeling_tf_pytorch_utils import ( + convert_tf_weight_name_to_pt_weight_name, + load_pytorch_checkpoint_in_tf2_model, + load_pytorch_weights_in_tf2_model, + load_pytorch_model_in_tf2_model, + load_tf2_checkpoint_in_pytorch_model, + load_tf2_weights_in_pytorch_model, + load_tf2_model_in_pytorch_model, +) # Pipelines -from .pipelines import pipeline, PipelineDataFormat, CsvPipelineDataFormat, JsonPipelineDataFormat, PipedPipelineDataFormat, \ - Pipeline, FeatureExtractionPipeline, QuestionAnsweringPipeline, NerPipeline, TextClassificationPipeline +from .pipelines import ( + pipeline, + PipelineDataFormat, + CsvPipelineDataFormat, + JsonPipelineDataFormat, + PipedPipelineDataFormat, + Pipeline, + FeatureExtractionPipeline, + QuestionAnsweringPipeline, + NerPipeline, + TextClassificationPipeline, +) if not is_tf_available() and not is_torch_available(): - logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found." - "Models won't be available and only tokenizers, configuration" - "and file/data utilities can be used.") + logger.warning( + "Neither PyTorch nor TensorFlow >= 2.0 have been found." + "Models won't be available and only tokenizers, configuration" + "and file/data utilities can be used." + ) diff --git a/transformers/__main__.py b/transformers/__main__.py index dd259b04ee..3cabdd4fff 100644 --- a/transformers/__main__.py +++ b/transformers/__main__.py @@ -1,16 +1,21 @@ # coding: utf8 + def main(): import sys + if len(sys.argv) < 2 or sys.argv[1] not in ["convert", "train", "predict", "serve"]: print( - "First argument to `transformers` command line interface should be one of: \n" - ">> convert serve train predict") + "First argument to `transformers` command line interface should be one of: \n" + ">> convert serve train predict" + ) if sys.argv[1] == "convert": from transformers.commands import convert + convert(sys.argv) elif sys.argv[1] == "train": from transformers.commands import train + train(sys.argv) elif sys.argv[1] == "serve": pass @@ -19,7 +24,6 @@ def main(): # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve []') # commands_parser = parser.add_subparsers(help='transformers-cli command helpers') - # # Register commands # ServeCommand.register_subcommand(commands_parser) @@ -33,5 +37,6 @@ def main(): # service = args.func(args) # service.run() -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/transformers/commands/__init__.py b/transformers/commands/__init__.py index bbdd5655fc..13171f4285 100644 --- a/transformers/commands/__init__.py +++ b/transformers/commands/__init__.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from argparse import ArgumentParser + class BaseTransformersCLICommand(ABC): @staticmethod @abstractmethod diff --git a/transformers/commands/convert.py b/transformers/commands/convert.py index 55dbf53734..e358d8532f 100644 --- a/transformers/commands/convert.py +++ b/transformers/commands/convert.py @@ -11,12 +11,12 @@ def convert_command_factory(args: Namespace): Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint. :return: ServeCommand """ - return ConvertCommand(args.model_type, args.tf_checkpoint, args.pytorch_dump_output, - args.config, args.finetuning_task_name) + return ConvertCommand( + args.model_type, args.tf_checkpoint, args.pytorch_dump_output, args.config, args.finetuning_task_name + ) class ConvertCommand(BaseTransformersCLICommand): - @staticmethod def register_subcommand(parser: ArgumentParser): """ @@ -24,25 +24,39 @@ class ConvertCommand(BaseTransformersCLICommand): :param parser: Root parser to register command-specific arguments :return: """ - train_parser = parser.add_parser('convert', help="CLI tool to run convert model from original " - "author checkpoints to Transformesr PyTorch checkpoints.") - train_parser.add_argument('--model_type', type=str, required=True, - help='Model\'s type.') - train_parser.add_argument('--tf_checkpoint', type=str, required=True, - help='TensorFlow checkpoint path or folder.') - train_parser.add_argument('--pytorch_dump_output', type=str, required=True, - help='Path to the PyTorch savd model output.') - train_parser.add_argument('--config', type=str, default="", - help='Configuration file path or folder.') - train_parser.add_argument('--finetuning_task_name', type=str, default=None, - help='Optional fine-tuning task name if the TF model was a finetuned model.') + train_parser = parser.add_parser( + "convert", + help="CLI tool to run convert model from original " + "author checkpoints to Transformesr PyTorch checkpoints.", + ) + train_parser.add_argument("--model_type", type=str, required=True, help="Model's type.") + train_parser.add_argument( + "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder." + ) + train_parser.add_argument( + "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output." + ) + train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.") + train_parser.add_argument( + "--finetuning_task_name", + type=str, + default=None, + help="Optional fine-tuning task name if the TF model was a finetuned model.", + ) train_parser.set_defaults(func=convert_command_factory) - def __init__(self, model_type: str, tf_checkpoint: str, pytorch_dump_output: str, - config: str, finetuning_task_name: str, *args): - self._logger = getLogger('transformers-cli/converting') + def __init__( + self, + model_type: str, + tf_checkpoint: str, + pytorch_dump_output: str, + config: str, + finetuning_task_name: str, + *args + ): + self._logger = getLogger("transformers-cli/converting") - self._logger.info('Loading model {}'.format(model_type)) + self._logger.info("Loading model {}".format(model_type)) self._model_type = model_type self._tf_checkpoint = tf_checkpoint self._pytorch_dump_output = pytorch_dump_output @@ -52,63 +66,80 @@ class ConvertCommand(BaseTransformersCLICommand): def run(self): if self._model_type == "bert": try: - from transformers.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch + from transformers.convert_bert_original_tf_checkpoint_to_pytorch import ( + convert_tf_checkpoint_to_pytorch, + ) except ImportError: - msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ - "In that case, it requires TensorFlow to be installed. Please see " \ + msg = ( + "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." + ) raise ImportError(msg) convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "gpt": - from transformers.convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch - convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, - self._config, - self._pytorch_dump_output) + from transformers.convert_openai_original_tf_checkpoint_to_pytorch import ( + convert_openai_checkpoint_to_pytorch, + ) + + convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "transfo_xl": try: - from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch + from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import ( + convert_transfo_xl_checkpoint_to_pytorch, + ) except ImportError: - msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ - "In that case, it requires TensorFlow to be installed. Please see " \ + msg = ( + "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." + ) raise ImportError(msg) - if 'ckpt' in self._tf_checkpoint.lower(): + if "ckpt" in self._tf_checkpoint.lower(): TF_CHECKPOINT = self._tf_checkpoint TF_DATASET_FILE = "" else: TF_DATASET_FILE = self._tf_checkpoint TF_CHECKPOINT = "" - convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, - self._config, - self._pytorch_dump_output, - TF_DATASET_FILE) + convert_transfo_xl_checkpoint_to_pytorch( + TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE + ) elif self._model_type == "gpt2": try: - from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch + from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import ( + convert_gpt2_checkpoint_to_pytorch, + ) except ImportError: - msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ - "In that case, it requires TensorFlow to be installed. Please see " \ + msg = ( + "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." + ) raise ImportError(msg) convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) elif self._model_type == "xlnet": try: - from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch + from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import ( + convert_xlnet_checkpoint_to_pytorch, + ) except ImportError: - msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ - "In that case, it requires TensorFlow to be installed. Please see " \ + msg = ( + "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " "https://www.tensorflow.org/install/ for installation instructions." + ) raise ImportError(msg) - convert_xlnet_checkpoint_to_pytorch(self._tf_checkpoint, - self._config, - self._pytorch_dump_output, - self._finetuning_task_name) + convert_xlnet_checkpoint_to_pytorch( + self._tf_checkpoint, self._config, self._pytorch_dump_output, self._finetuning_task_name + ) elif self._model_type == "xlm": - from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch + from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import ( + convert_xlm_checkpoint_to_pytorch, + ) convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output) else: diff --git a/transformers/commands/download.py b/transformers/commands/download.py index 0938f135d2..acfb3eeb92 100644 --- a/transformers/commands/download.py +++ b/transformers/commands/download.py @@ -8,13 +8,16 @@ def download_command_factory(args): class DownloadCommand(BaseTransformersCLICommand): - @staticmethod def register_subcommand(parser: ArgumentParser): - download_parser = parser.add_parser('download') - download_parser.add_argument('--cache-dir', type=str, default=None, help='Path to location to store the models') - download_parser.add_argument('--force', action='store_true', help='Force the model to be download even if already in cache-dir') - download_parser.add_argument('model', type=str, help='Name of the model to download') + download_parser = parser.add_parser("download") + download_parser.add_argument( + "--cache-dir", type=str, default=None, help="Path to location to store the models" + ) + download_parser.add_argument( + "--force", action="store_true", help="Force the model to be download even if already in cache-dir" + ) + download_parser.add_argument("model", type=str, help="Name of the model to download") download_parser.set_defaults(func=download_command_factory) def __init__(self, model: str, cache: str, force: bool): @@ -26,4 +29,4 @@ class DownloadCommand(BaseTransformersCLICommand): from transformers import AutoModel, AutoTokenizer AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) - AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) \ No newline at end of file + AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) diff --git a/transformers/commands/run.py b/transformers/commands/run.py index df03cee9d7..6172263064 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -10,52 +10,72 @@ logger = logging.getLogger(__name__) # pylint: disable=invalid-name def try_infer_format_from_ext(path: str): if not path: - return 'pipe' + return "pipe" for ext in PipelineDataFormat.SUPPORTED_FORMATS: if path.endswith(ext): return ext raise Exception( - 'Unable to determine file format from file extension {}. ' - 'Please provide the format through --format {}'.format(path, PipelineDataFormat.SUPPORTED_FORMATS) + "Unable to determine file format from file extension {}. " + "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS) ) def run_command_factory(args): - nlp = pipeline(task=args.task, - model=args.model if args.model else None, - config=args.config, - tokenizer=args.tokenizer, - device=args.device) - format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format - reader = PipelineDataFormat.from_str(format=format, - output_path=args.output, - input_path=args.input, - column=args.column if args.column else nlp.default_input_names, - overwrite=args.overwrite) + nlp = pipeline( + task=args.task, + model=args.model if args.model else None, + config=args.config, + tokenizer=args.tokenizer, + device=args.device, + ) + format = try_infer_format_from_ext(args.input) if args.format == "infer" else args.format + reader = PipelineDataFormat.from_str( + format=format, + output_path=args.output, + input_path=args.input, + column=args.column if args.column else nlp.default_input_names, + overwrite=args.overwrite, + ) return RunCommand(nlp, reader) class RunCommand(BaseTransformersCLICommand): - def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): self._nlp = nlp self._reader = reader @staticmethod def register_subcommand(parser: ArgumentParser): - run_parser = parser.add_parser('run', help="Run a pipeline through the CLI") - run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run') - run_parser.add_argument('--input', type=str, help='Path to the file to use for inference') - run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.') - run_parser.add_argument('--model', type=str, help='Name or path to the model to instantiate.') - run_parser.add_argument('--config', type=str, help='Name or path to the model\'s config to instantiate.') - run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)') - run_parser.add_argument('--column', type=str, help='Name of the column to use as input. (For multi columns input as QA use column1,columns2)') - run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from') - run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') - run_parser.add_argument('--overwrite', action='store_true', help='Allow overwriting the output file.') + run_parser = parser.add_parser("run", help="Run a pipeline through the CLI") + run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run") + run_parser.add_argument("--input", type=str, help="Path to the file to use for inference") + run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.") + run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.") + run_parser.add_argument("--config", type=str, help="Name or path to the model's config to instantiate.") + run_parser.add_argument( + "--tokenizer", type=str, help="Name of the tokenizer to use. (default: same as the model name)" + ) + run_parser.add_argument( + "--column", + type=str, + help="Name of the column to use as input. (For multi columns input as QA use column1,columns2)", + ) + run_parser.add_argument( + "--format", + type=str, + default="infer", + choices=PipelineDataFormat.SUPPORTED_FORMATS, + help="Input format to read from", + ) + run_parser.add_argument( + "--device", + type=int, + default=-1, + help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", + ) + run_parser.add_argument("--overwrite", action="store_true", help="Allow overwriting the output file.") run_parser.set_defaults(func=run_command_factory) def run(self): @@ -71,9 +91,6 @@ class RunCommand(BaseTransformersCLICommand): # Saving data if self._nlp.binary_output: binary_path = self._reader.save_binary(outputs) - logger.warning('Current pipeline requires output to be in binary format, saving at {}'.format(binary_path)) + logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path)) else: self._reader.save(outputs) - - - diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py index 4f41f797d1..f7729c0bf0 100644 --- a/transformers/commands/serving.py +++ b/transformers/commands/serving.py @@ -7,6 +7,7 @@ try: from uvicorn import run from fastapi import FastAPI, HTTPException, Body from pydantic import BaseModel + _serve_dependancies_installed = True except (ImportError, AttributeError): BaseModel = object @@ -17,18 +18,21 @@ from transformers import Pipeline from transformers.commands import BaseTransformersCLICommand from transformers.pipelines import SUPPORTED_TASKS, pipeline -logger = logging.getLogger('transformers-cli/serving') +logger = logging.getLogger("transformers-cli/serving") + def serve_command_factory(args: Namespace): """ Factory function used to instantiate serving server from provided command line arguments. :return: ServeCommand """ - nlp = pipeline(task=args.task, - model=args.model if args.model else None, - config=args.config, - tokenizer=args.tokenizer, - device=args.device) + nlp = pipeline( + task=args.task, + model=args.model if args.model else None, + config=args.config, + tokenizer=args.tokenizer, + device=args.device, + ) return ServeCommand(nlp, args.host, args.port) @@ -36,6 +40,7 @@ class ServeModelInfoResult(BaseModel): """ Expose model information """ + infos: dict @@ -43,6 +48,7 @@ class ServeTokenizeResult(BaseModel): """ Tokenize result model """ + tokens: List[str] tokens_ids: Optional[List[int]] @@ -51,6 +57,7 @@ class ServeDeTokenizeResult(BaseModel): """ DeTokenize result model """ + text: str @@ -58,11 +65,11 @@ class ServeForwardResult(BaseModel): """ Forward result model """ + output: Any class ServeCommand(BaseTransformersCLICommand): - @staticmethod def register_subcommand(parser: ArgumentParser): """ @@ -70,14 +77,23 @@ class ServeCommand(BaseTransformersCLICommand): :param parser: Root parser to register command-specific arguments :return: """ - serve_parser = parser.add_parser('serve', help='CLI tool to run inference requests through REST and GraphQL endpoints.') - serve_parser.add_argument('--task', type=str, choices=SUPPORTED_TASKS.keys(), help='The task to run the pipeline on') - serve_parser.add_argument('--host', type=str, default='localhost', help='Interface the server will listen on.') - serve_parser.add_argument('--port', type=int, default=8888, help='Port the serving will listen to.') - serve_parser.add_argument('--model', type=str, help='Model\'s name or path to stored model.') - serve_parser.add_argument('--config', type=str, help='Model\'s config name or path to stored model.') - serve_parser.add_argument('--tokenizer', type=str, help='Tokenizer name to use.') - serve_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') + serve_parser = parser.add_parser( + "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints." + ) + serve_parser.add_argument( + "--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on" + ) + serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.") + serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.") + serve_parser.add_argument("--model", type=str, help="Model's name or path to stored model.") + serve_parser.add_argument("--config", type=str, help="Model's config name or path to stored model.") + serve_parser.add_argument("--tokenizer", type=str, help="Tokenizer name to use.") + serve_parser.add_argument( + "--device", + type=int, + default=-1, + help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)", + ) serve_parser.set_defaults(func=serve_command_factory) def __init__(self, pipeline: Pipeline, host: str, port: int): @@ -87,18 +103,22 @@ class ServeCommand(BaseTransformersCLICommand): self._host = host self._port = port if not _serve_dependancies_installed: - raise ImportError("Using serve command requires FastAPI and unicorn. " - "Please install transformers with [serving]: pip install transformers[serving]." - "Or install FastAPI and unicorn separatly.") + raise ImportError( + "Using serve command requires FastAPI and unicorn. " + "Please install transformers with [serving]: pip install transformers[serving]." + "Or install FastAPI and unicorn separatly." + ) else: - logger.info('Serving model over {}:{}'.format(host, port)) + logger.info("Serving model over {}:{}".format(host, port)) self._app = FastAPI() # Register routes - self._app.add_api_route('/', self.model_info, response_model=ServeModelInfoResult, methods=['GET']) - self._app.add_api_route('/tokenize', self.tokenize, response_model=ServeTokenizeResult, methods=['POST']) - self._app.add_api_route('/detokenize', self.detokenize, response_model=ServeDeTokenizeResult, methods=['POST']) - self._app.add_api_route('/forward', self.forward, response_model=ServeForwardResult, methods=['POST']) + self._app.add_api_route("/", self.model_info, response_model=ServeModelInfoResult, methods=["GET"]) + self._app.add_api_route("/tokenize", self.tokenize, response_model=ServeTokenizeResult, methods=["POST"]) + self._app.add_api_route( + "/detokenize", self.detokenize, response_model=ServeDeTokenizeResult, methods=["POST"] + ) + self._app.add_api_route("/forward", self.forward, response_model=ServeForwardResult, methods=["POST"]) def run(self): run(self._app, host=self._host, port=self._port) @@ -122,11 +142,14 @@ class ServeCommand(BaseTransformersCLICommand): return ServeTokenizeResult(tokens=tokens_txt) except Exception as e: - raise HTTPException(status_code=500, detail={"model": '', "error": str(e)}) + raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) - def detokenize(self, tokens_ids: List[int] = Body(None, embed=True), - skip_special_tokens: bool = Body(False, embed=True), - cleanup_tokenization_spaces: bool = Body(True, embed=True)): + def detokenize( + self, + tokens_ids: List[int] = Body(None, embed=True), + skip_special_tokens: bool = Body(False, embed=True), + cleanup_tokenization_spaces: bool = Body(True, embed=True), + ): """ Detokenize the provided tokens ids to readable text: - **tokens_ids**: List of tokens ids @@ -135,9 +158,9 @@ class ServeCommand(BaseTransformersCLICommand): """ try: decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces) - return ServeDeTokenizeResult(model='', text=decoded_str) + return ServeDeTokenizeResult(model="", text=decoded_str) except Exception as e: - raise HTTPException(status_code=500, detail={"model": '', "error": str(e)}) + raise HTTPException(status_code=500, detail={"model": "", "error": str(e)}) def forward(self, inputs: Union[str, dict, List[str], List[int], List[dict]] = Body(None, embed=True)): """ diff --git a/transformers/commands/train.py b/transformers/commands/train.py index 7b26745881..e51be71c75 100644 --- a/transformers/commands/train.py +++ b/transformers/commands/train.py @@ -3,9 +3,12 @@ from argparse import ArgumentParser, Namespace from logging import getLogger from transformers.commands import BaseTransformersCLICommand -from transformers import (is_tf_available, is_torch_available, - TextClassificationPipeline, - SingleSentenceClassificationProcessor as Processor) +from transformers import ( + is_tf_available, + is_torch_available, + TextClassificationPipeline, + SingleSentenceClassificationProcessor as Processor, +) if not is_tf_available() and not is_torch_available(): raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") @@ -14,6 +17,7 @@ if not is_tf_available() and not is_torch_available(): USE_XLA = False USE_AMP = False + def train_command_factory(args: Namespace): """ Factory function used to instantiate serving server from provided command line arguments. @@ -23,7 +27,6 @@ def train_command_factory(args: Namespace): class TrainCommand(BaseTransformersCLICommand): - @staticmethod def register_subcommand(parser: ArgumentParser): """ @@ -31,47 +34,54 @@ class TrainCommand(BaseTransformersCLICommand): :param parser: Root parser to register command-specific arguments :return: """ - train_parser = parser.add_parser('train', help='CLI tool to train a model on a task.') + train_parser = parser.add_parser("train", help="CLI tool to train a model on a task.") - train_parser.add_argument('--train_data', type=str, required=True, - help="path to train (and optionally evaluation) dataset as a csv with " - "tab separated labels and sentences.") - train_parser.add_argument('--column_label', type=int, default=0, - help='Column of the dataset csv file with example labels.') - train_parser.add_argument('--column_text', type=int, default=1, - help='Column of the dataset csv file with example texts.') - train_parser.add_argument('--column_id', type=int, default=2, - help='Column of the dataset csv file with example ids.') - train_parser.add_argument('--skip_first_row', action='store_true', - help='Skip the first row of the csv file (headers).') + train_parser.add_argument( + "--train_data", + type=str, + required=True, + help="path to train (and optionally evaluation) dataset as a csv with " + "tab separated labels and sentences.", + ) + train_parser.add_argument( + "--column_label", type=int, default=0, help="Column of the dataset csv file with example labels." + ) + train_parser.add_argument( + "--column_text", type=int, default=1, help="Column of the dataset csv file with example texts." + ) + train_parser.add_argument( + "--column_id", type=int, default=2, help="Column of the dataset csv file with example ids." + ) + train_parser.add_argument( + "--skip_first_row", action="store_true", help="Skip the first row of the csv file (headers)." + ) - train_parser.add_argument('--validation_data', type=str, default='', - help='path to validation dataset.') - train_parser.add_argument('--validation_split', type=float, default=0.1, - help="if validation dataset is not provided, fraction of train dataset " - "to use as validation dataset.") + train_parser.add_argument("--validation_data", type=str, default="", help="path to validation dataset.") + train_parser.add_argument( + "--validation_split", + type=float, + default=0.1, + help="if validation dataset is not provided, fraction of train dataset " "to use as validation dataset.", + ) - train_parser.add_argument('--output', type=str, default='./', - help='path to saved the trained model.') + train_parser.add_argument("--output", type=str, default="./", help="path to saved the trained model.") - train_parser.add_argument('--task', type=str, default='text_classification', - help='Task to train the model on.') - train_parser.add_argument('--model', type=str, default='bert-base-uncased', - help='Model\'s name or path to stored model.') - train_parser.add_argument('--train_batch_size', type=int, default=32, - help='Batch size for training.') - train_parser.add_argument('--valid_batch_size', type=int, default=64, - help='Batch size for validation.') - train_parser.add_argument('--learning_rate', type=float, default=3e-5, - help="Learning rate.") - train_parser.add_argument('--adam_epsilon', type=float, default=1e-08, - help="Epsilon for Adam optimizer.") + train_parser.add_argument( + "--task", type=str, default="text_classification", help="Task to train the model on." + ) + train_parser.add_argument( + "--model", type=str, default="bert-base-uncased", help="Model's name or path to stored model." + ) + train_parser.add_argument("--train_batch_size", type=int, default=32, help="Batch size for training.") + train_parser.add_argument("--valid_batch_size", type=int, default=64, help="Batch size for validation.") + train_parser.add_argument("--learning_rate", type=float, default=3e-5, help="Learning rate.") + train_parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon for Adam optimizer.") train_parser.set_defaults(func=train_command_factory) def __init__(self, args: Namespace): - self.logger = getLogger('transformers-cli/training') + self.logger = getLogger("transformers-cli/training") - self.framework = 'tf' if is_tf_available() else 'torch' + self.framework = "tf" if is_tf_available() else "torch" os.makedirs(args.output, exist_ok=True) assert os.path.isdir(args.output) @@ -81,28 +91,32 @@ class TrainCommand(BaseTransformersCLICommand): self.column_text = args.column_text self.column_id = args.column_id - self.logger.info('Loading {} pipeline for {}'.format(args.task, args.model)) - if args.task == 'text_classification': + self.logger.info("Loading {} pipeline for {}".format(args.task, args.model)) + if args.task == "text_classification": self.pipeline = TextClassificationPipeline.from_pretrained(args.model) - elif args.task == 'token_classification': + elif args.task == "token_classification": raise NotImplementedError - elif args.task == 'question_answering': + elif args.task == "question_answering": raise NotImplementedError - self.logger.info('Loading dataset from {}'.format(args.train_data)) - self.train_dataset = Processor.create_from_csv(args.train_data, - column_label=args.column_label, - column_text=args.column_text, - column_id=args.column_id, - skip_first_row=args.skip_first_row) + self.logger.info("Loading dataset from {}".format(args.train_data)) + self.train_dataset = Processor.create_from_csv( + args.train_data, + column_label=args.column_label, + column_text=args.column_text, + column_id=args.column_id, + skip_first_row=args.skip_first_row, + ) self.valid_dataset = None if args.validation_data: - self.logger.info('Loading validation dataset from {}'.format(args.validation_data)) - self.valid_dataset = Processor.create_from_csv(args.validation_data, - column_label=args.column_label, - column_text=args.column_text, - column_id=args.column_id, - skip_first_row=args.skip_first_row) + self.logger.info("Loading validation dataset from {}".format(args.validation_data)) + self.valid_dataset = Processor.create_from_csv( + args.validation_data, + column_label=args.column_label, + column_text=args.column_text, + column_id=args.column_id, + skip_first_row=args.skip_first_row, + ) self.validation_split = args.validation_split self.train_batch_size = args.train_batch_size @@ -111,7 +125,7 @@ class TrainCommand(BaseTransformersCLICommand): self.adam_epsilon = args.adam_epsilon def run(self): - if self.framework == 'tf': + if self.framework == "tf": return self.run_tf() return self.run_torch() @@ -119,13 +133,15 @@ class TrainCommand(BaseTransformersCLICommand): raise NotImplementedError def run_tf(self): - self.pipeline.fit(self.train_dataset, - validation_data=self.valid_dataset, - validation_split=self.validation_split, - learning_rate=self.learning_rate, - adam_epsilon=self.adam_epsilon, - train_batch_size=self.train_batch_size, - valid_batch_size=self.valid_batch_size) + self.pipeline.fit( + self.train_dataset, + validation_data=self.valid_dataset, + validation_split=self.validation_split, + learning_rate=self.learning_rate, + adam_epsilon=self.adam_epsilon, + train_batch_size=self.train_batch_size, + valid_batch_size=self.valid_batch_size, + ) # Save trained pipeline self.pipeline.save_pretrained(self.output) diff --git a/transformers/commands/user.py b/transformers/commands/user.py index 8e0e563422..d29867d7c8 100644 --- a/transformers/commands/user.py +++ b/transformers/commands/user.py @@ -9,28 +9,31 @@ from transformers.hf_api import HfApi, HfFolder, HTTPError class UserCommands(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): - login_parser = parser.add_parser('login') + login_parser = parser.add_parser("login") login_parser.set_defaults(func=lambda args: LoginCommand(args)) - whoami_parser = parser.add_parser('whoami') + whoami_parser = parser.add_parser("whoami") whoami_parser.set_defaults(func=lambda args: WhoamiCommand(args)) - logout_parser = parser.add_parser('logout') + logout_parser = parser.add_parser("logout") logout_parser.set_defaults(func=lambda args: LogoutCommand(args)) - list_parser = parser.add_parser('ls') + list_parser = parser.add_parser("ls") list_parser.set_defaults(func=lambda args: ListObjsCommand(args)) # upload - upload_parser = parser.add_parser('upload') - upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.') - upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.') + upload_parser = parser.add_parser("upload") + upload_parser.add_argument("path", type=str, help="Local path of the folder or individual file to upload.") + upload_parser.add_argument( + "--filename", type=str, default=None, help="Optional: override individual object filename on S3." + ) upload_parser.set_defaults(func=lambda args: UploadCommand(args)) - class ANSI: """ Helper for en.wikipedia.org/wiki/ANSI_escape_code """ + _bold = u"\u001b[1m" _reset = u"\u001b[0m" + @classmethod def bold(cls, s): return "{}{}{}".format(cls._bold, s, cls._reset) @@ -44,14 +47,16 @@ class BaseUserCommand: class LoginCommand(BaseUserCommand): def run(self): - print(""" + print( + """ _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_| _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _| _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_| - """) + """ + ) username = input("Username: ") password = getpass() try: @@ -101,16 +106,10 @@ class ListObjsCommand(BaseUserCommand): col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)] row_format = ("{{:{}}} " * len(headers)).format(*col_widths) lines = [] - lines.append( - row_format.format(*headers) - ) - lines.append( - row_format.format(*["-" * w for w in col_widths]) - ) + lines.append(row_format.format(*headers)) + lines.append(row_format.format(*["-" * w for w in col_widths])) for row in rows: - lines.append( - row_format.format(*row) - ) + lines.append(row_format.format(*row)) return "\n".join(lines) def run(self): @@ -126,15 +125,8 @@ class ListObjsCommand(BaseUserCommand): if len(objs) == 0: print("No shared file yet") exit() - rows = [ [ - obj.filename, - obj.LastModified, - obj.ETag, - obj.Size - ] for obj in objs ] - print( - self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"]) - ) + rows = [[obj.filename, obj.LastModified, obj.ETag, obj.Size] for obj in objs] + print(self.tabulate(rows, headers=["Filename", "LastModified", "ETag", "Size"])) class UploadCommand(BaseUserCommand): @@ -143,13 +135,7 @@ class UploadCommand(BaseUserCommand): Recursively list all files in a folder. """ entries: List[os.DirEntry] = list(os.scandir(rel_path)) - files = [ - ( - os.path.join(os.getcwd(), f.path), # filepath - f.path # filename - ) - for f in entries if f.is_file() - ] + files = [(os.path.join(os.getcwd(), f.path), f.path) for f in entries if f.is_file()] # filepath # filename for f in entries: if f.is_dir(): files += self.walk_dir(f.path) @@ -173,22 +159,14 @@ class UploadCommand(BaseUserCommand): raise ValueError("Not a valid file or directory: {}".format(local_path)) for filepath, filename in files: - print( - "About to upload file {} to S3 under filename {}".format( - ANSI.bold(filepath), ANSI.bold(filename) - ) - ) + print("About to upload file {} to S3 under filename {}".format(ANSI.bold(filepath), ANSI.bold(filename))) choice = input("Proceed? [Y/n] ").lower() - if not(choice == "" or choice == "y" or choice == "yes"): + if not (choice == "" or choice == "y" or choice == "yes"): print("Abort") exit() - print( - ANSI.bold("Uploading... This might take a while if files are large") - ) + print(ANSI.bold("Uploading... This might take a while if files are large")) for filepath, filename in files: - access_url = self._api.presign_and_upload( - token=token, filename=filename, filepath=filepath - ) + access_url = self._api.presign_and_upload(token=token, filename=filename, filepath=filepath) print("Your file now lives at:") print(access_url) diff --git a/transformers/configuration_albert.py b/transformers/configuration_albert.py index 6a1ef78dd5..dc2b74a29c 100644 --- a/transformers/configuration_albert.py +++ b/transformers/configuration_albert.py @@ -18,16 +18,17 @@ from .configuration_utils import PretrainedConfig ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json", - 'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json", - 'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json", - 'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json", - 'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json", - 'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json", - 'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json", - 'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json", + "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-config.json", + "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-config.json", + "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-config.json", + "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-config.json", + "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-config.json", + "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-config.json", + "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-config.json", + "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-config.json", } + class AlbertConfig(PretrainedConfig): """Configuration for `AlbertModel`. @@ -36,22 +37,25 @@ class AlbertConfig(PretrainedConfig): pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=30000, - embedding_size=128, - hidden_size=4096, - num_hidden_layers=12, - num_hidden_groups=1, - num_attention_heads=64, - intermediate_size=16384, - inner_group_num=1, - hidden_act="gelu_new", - hidden_dropout_prob=0, - attention_probs_dropout_prob=0, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, **kwargs): + def __init__( + self, + vocab_size=30000, + embedding_size=128, + hidden_size=4096, + num_hidden_layers=12, + num_hidden_groups=1, + num_attention_heads=64, + intermediate_size=16384, + inner_group_num=1, + hidden_act="gelu_new", + hidden_dropout_prob=0, + attention_probs_dropout_prob=0, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + **kwargs + ): """Constructs AlbertConfig. Args: diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py index 281256389e..e4311fc285 100644 --- a/transformers/configuration_auto.py +++ b/transformers/configuration_auto.py @@ -35,7 +35,8 @@ from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_ logger = logging.getLogger(__name__) -ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value) +ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict( + (key, value) for pretrained_map in [ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -50,8 +51,9 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value) CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, - ] - for key, value, in pretrained_map.items()) + ] + for key, value, in pretrained_map.items() +) class AutoConfig(object): @@ -79,37 +81,42 @@ class AutoConfig(object): - contains `ctrl` : CTRLConfig (CTRL model) This class cannot be instantiated using `__init__()` (throw an error). """ + def __init__(self): - raise EnvironmentError("AutoConfig is designed to be instantiated " - "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.") + raise EnvironmentError( + "AutoConfig is designed to be instantiated " + "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method." + ) @classmethod def for_model(cls, model_type, *args, **kwargs): - if 'distilbert' in model_type: + if "distilbert" in model_type: return DistilBertConfig(*args, **kwargs) - elif 'roberta' in model_type: + elif "roberta" in model_type: return RobertaConfig(*args, **kwargs) - elif 'bert' in model_type: + elif "bert" in model_type: return BertConfig(*args, **kwargs) - elif 'openai-gpt' in model_type: + elif "openai-gpt" in model_type: return OpenAIGPTConfig(*args, **kwargs) - elif 'gpt2' in model_type: + elif "gpt2" in model_type: return GPT2Config(*args, **kwargs) - elif 'transfo-xl' in model_type: + elif "transfo-xl" in model_type: return TransfoXLConfig(*args, **kwargs) - elif 'xlnet' in model_type: + elif "xlnet" in model_type: return XLNetConfig(*args, **kwargs) - elif 'xlm' in model_type: + elif "xlm" in model_type: return XLMConfig(*args, **kwargs) - elif 'ctrl' in model_type: + elif "ctrl" in model_type: return CTRLConfig(*args, **kwargs) - elif 'albert' in model_type: + elif "albert" in model_type: return AlbertConfig(*args, **kwargs) - elif 'camembert' in model_type: + elif "camembert" in model_type: return CamembertConfig(*args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta', 'ctrl', 'camembert', 'albert'".format(model_type)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta', 'ctrl', 'camembert', 'albert'".format(model_type) + ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): @@ -176,32 +183,36 @@ class AutoConfig(object): assert unused_kwargs == {'foo': False} """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'camembert' in pretrained_model_name_or_path: + elif "camembert" in pretrained_model_name_or_path: return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: + elif "xlm-roberta" in pretrained_model_name_or_path: return XLMRobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format( + pretrained_model_name_or_path + ) + ) diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index 7b495013ff..7c5ee434a4 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -27,27 +27,27 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", - 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", - 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", - 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", - 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", - 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", - 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", + "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", } @@ -82,20 +82,22 @@ class BertConfig(PretrainedConfig): """ pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - **kwargs): + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + **kwargs + ): super(BertConfig, self).__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size diff --git a/transformers/configuration_camembert.py b/transformers/configuration_camembert.py index 3ff64454e5..9aa641aa5f 100644 --- a/transformers/configuration_camembert.py +++ b/transformers/configuration_camembert.py @@ -15,8 +15,7 @@ # limitations under the License. """ CamemBERT configuration """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -25,7 +24,7 @@ from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", + "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", } diff --git a/transformers/configuration_ctrl.py b/transformers/configuration_ctrl.py index f9b9e409e1..2726727d48 100644 --- a/transformers/configuration_ctrl.py +++ b/transformers/configuration_ctrl.py @@ -27,6 +27,7 @@ logger = logging.getLogger(__name__) CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"} + class CTRLConfig(PretrainedConfig): """Configuration class to store the configuration of a `CTRLModel`. @@ -48,6 +49,7 @@ class CTRLConfig(PretrainedConfig): initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. """ + pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__( @@ -64,7 +66,7 @@ class CTRLConfig(PretrainedConfig): attn_pdrop=0.1, layer_norm_epsilon=1e-6, initializer_range=0.02, - summary_type='cls_index', + summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, diff --git a/transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py index d9f7cc6348..120cbfb9f2 100644 --- a/transformers/configuration_distilbert.py +++ b/transformers/configuration_distilbert.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ DistilBERT model configuration """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import sys import json @@ -26,32 +25,34 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", - 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", - 'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", + "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", + "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json", + "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-config.json", + "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-config.json", } class DistilBertConfig(PretrainedConfig): pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=30522, - max_position_embeddings=512, - sinusoidal_pos_embds=False, - n_layers=6, - n_heads=12, - dim=768, - hidden_dim=4*768, - dropout=0.1, - attention_dropout=0.1, - activation='gelu', - initializer_range=0.02, - tie_weights_=True, - qa_dropout=0.1, - seq_classif_dropout=0.2, - **kwargs): + def __init__( + self, + vocab_size=30522, + max_position_embeddings=512, + sinusoidal_pos_embds=False, + n_layers=6, + n_heads=12, + dim=768, + hidden_dim=4 * 768, + dropout=0.1, + attention_dropout=0.1, + activation="gelu", + initializer_range=0.02, + tie_weights_=True, + qa_dropout=0.1, + seq_classif_dropout=0.2, + **kwargs + ): super(DistilBertConfig, self).__init__(**kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py index 4c200c0760..adc8842edc 100644 --- a/transformers/configuration_gpt2.py +++ b/transformers/configuration_gpt2.py @@ -26,11 +26,14 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) -GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", - "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", - "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json", - "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json", - "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",} +GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json", + "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-config.json", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json", +} + class GPT2Config(PretrainedConfig): """Configuration class to store the configuration of a `GPT2Model`. @@ -52,6 +55,7 @@ class GPT2Config(PretrainedConfig): initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. """ + pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__( @@ -67,7 +71,7 @@ class GPT2Config(PretrainedConfig): attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, - summary_type='cls_index', + summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, diff --git a/transformers/configuration_mmbt.py b/transformers/configuration_mmbt.py index 60176c9872..5dad2babef 100644 --- a/transformers/configuration_mmbt.py +++ b/transformers/configuration_mmbt.py @@ -15,8 +15,7 @@ # limitations under the License. """ MMBT configuration """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -31,6 +30,7 @@ class MMBTConfig(object): num_labels: Size of final Linear layer for classification. modal_hidden_size: Embedding dimension of the non-text modality encoder. """ + def __init__(self, config, num_labels=None, modal_hidden_size=2048): self.__dict__ = config.__dict__ self.modal_hidden_size = modal_hidden_size diff --git a/transformers/configuration_openai.py b/transformers/configuration_openai.py index 7776a0bb9f..53929aab5f 100644 --- a/transformers/configuration_openai.py +++ b/transformers/configuration_openai.py @@ -30,6 +30,7 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" } + class OpenAIGPTConfig(PretrainedConfig): """ Configuration class to store the configuration of a `OpenAIGPTModel`. @@ -54,6 +55,7 @@ class OpenAIGPTConfig(PretrainedConfig): initializing all weight matrices. predict_special_tokens: should we predict special tokens (when the model has a LM head) """ + pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__( @@ -71,7 +73,7 @@ class OpenAIGPTConfig(PretrainedConfig): layer_norm_epsilon=1e-5, initializer_range=0.02, predict_special_tokens=True, - summary_type='cls_index', + summary_type="cls_index", summary_use_proj=True, summary_activation=None, summary_proj_to_labels=True, diff --git a/transformers/configuration_roberta.py b/transformers/configuration_roberta.py index 842edac56e..3b8ddd1c46 100644 --- a/transformers/configuration_roberta.py +++ b/transformers/configuration_roberta.py @@ -15,8 +15,7 @@ # limitations under the License. """ RoBERTa configuration """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -25,12 +24,12 @@ from .configuration_bert import BertConfig logger = logging.getLogger(__name__) ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", - 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", - 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", - 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-config.json", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-config.json", } diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index 377a0919d9..4584015e27 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -27,11 +27,11 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", - 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", - 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", - 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", - 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", + "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", + "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", + "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", + "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", + "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", } @@ -65,19 +65,21 @@ class T5Config(PretrainedConfig): """ pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=32128, - n_positions=512, - d_model=512, - d_kv=64, - d_ff=2048, - num_layers=6, - num_heads=8, - relative_attention_num_buckets=32, - dropout_rate=0.1, - layer_norm_epsilon=1e-6, - initializer_factor=1.0, - **kwargs): + def __init__( + self, + vocab_size=32128, + n_positions=512, + d_model=512, + d_kv=64, + d_ff=2048, + num_layers=6, + num_heads=8, + relative_attention_num_buckets=32, + dropout_rate=0.1, + layer_norm_epsilon=1e-6, + initializer_factor=1.0, + **kwargs + ): super(T5Config, self).__init__(**kwargs) self.vocab_size = vocab_size self.n_positions = n_positions diff --git a/transformers/configuration_transfo_xl.py b/transformers/configuration_transfo_xl.py index 52f0f45a50..a2a7c5c02d 100644 --- a/transformers/configuration_transfo_xl.py +++ b/transformers/configuration_transfo_xl.py @@ -27,9 +27,10 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", } + class TransfoXLConfig(PretrainedConfig): """Configuration class to store the configuration of a `TransfoXLModel`. @@ -65,38 +66,41 @@ class TransfoXLConfig(PretrainedConfig): proj_init_std: parameters initialized by N(0, init_std) init_std: parameters initialized by N(0, init_std) """ + pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=267735, - cutoffs=[20000, 40000, 200000], - d_model=1024, - d_embed=1024, - n_head=16, - d_head=64, - d_inner=4096, - div_val=4, - pre_lnorm=False, - n_layer=18, - tgt_len=128, - ext_len=0, - mem_len=1600, - clamp_len=1000, - same_length=True, - proj_share_all_but_first=True, - attn_type=0, - sample_softmax=-1, - adaptive=True, - tie_weight=True, - dropout=0.1, - dropatt=0.0, - untie_r=True, - init="normal", - init_range=0.01, - proj_init_std=0.01, - init_std=0.02, - layer_norm_epsilon=1e-5, - **kwargs): + def __init__( + self, + vocab_size=267735, + cutoffs=[20000, 40000, 200000], + d_model=1024, + d_embed=1024, + n_head=16, + d_head=64, + d_inner=4096, + div_val=4, + pre_lnorm=False, + n_layer=18, + tgt_len=128, + ext_len=0, + mem_len=1600, + clamp_len=1000, + same_length=True, + proj_share_all_but_first=True, + attn_type=0, + sample_softmax=-1, + adaptive=True, + tie_weight=True, + dropout=0.1, + dropatt=0.0, + untie_r=True, + init="normal", + init_range=0.01, + proj_init_std=0.01, + init_std=0.02, + layer_norm_epsilon=1e-5, + **kwargs + ): """Constructs TransfoXLConfig. """ super(TransfoXLConfig, self).__init__(**kwargs) diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index d2d6ee5d80..f29899175c 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -15,8 +15,7 @@ # limitations under the License. """ Configuration base class and utilities.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import copy import json @@ -28,6 +27,7 @@ from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url logger = logging.getLogger(__name__) + class PretrainedConfig(object): r""" Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. @@ -50,36 +50,36 @@ class PretrainedConfig(object): def __init__(self, **kwargs): # Attributes with defaults - self.output_attentions = kwargs.pop('output_attentions', False) - self.output_hidden_states = kwargs.pop('output_hidden_states', False) - self.output_past = kwargs.pop('output_past', True) # Not used by all models - self.torchscript = kwargs.pop('torchscript', False) # Only used by PyTorch models - self.use_bfloat16 = kwargs.pop('use_bfloat16', False) - self.pruned_heads = kwargs.pop('pruned_heads', {}) + self.output_attentions = kwargs.pop("output_attentions", False) + self.output_hidden_states = kwargs.pop("output_hidden_states", False) + self.output_past = kwargs.pop("output_past", True) # Not used by all models + self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models + self.use_bfloat16 = kwargs.pop("use_bfloat16", False) + self.pruned_heads = kwargs.pop("pruned_heads", {}) # Is decoder is used in encoder-decoder models to differentiate encoder from decoder - self.is_decoder = kwargs.pop('is_decoder', False) + self.is_decoder = kwargs.pop("is_decoder", False) # Parameters for sequence generation - self.max_length = kwargs.pop('max_length', 20) - self.do_sample = kwargs.pop('do_sample', False) - self.num_beams = kwargs.pop('num_beams', 1) - self.temperature = kwargs.pop('temperature', 1.0) - self.top_k = kwargs.pop('top_k', 50) - self.top_p = kwargs.pop('top_p', 1.0) - self.repetition_penalty = kwargs.pop('repetition_penalty', 1.0) - self.bos_token_id = kwargs.pop('bos_token_id', 0) - self.pad_token_id = kwargs.pop('pad_token_id', 0) - self.eos_token_ids = kwargs.pop('eos_token_ids', 0) - self.length_penalty = kwargs.pop('length_penalty', 1.) - self.num_return_sequences = kwargs.pop('num_return_sequences', 1) + self.max_length = kwargs.pop("max_length", 20) + self.do_sample = kwargs.pop("do_sample", False) + self.num_beams = kwargs.pop("num_beams", 1) + self.temperature = kwargs.pop("temperature", 1.0) + self.top_k = kwargs.pop("top_k", 50) + self.top_p = kwargs.pop("top_p", 1.0) + self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) + self.bos_token_id = kwargs.pop("bos_token_id", 0) + self.pad_token_id = kwargs.pop("pad_token_id", 0) + self.eos_token_ids = kwargs.pop("eos_token_ids", 0) + self.length_penalty = kwargs.pop("length_penalty", 1.0) + self.num_return_sequences = kwargs.pop("num_return_sequences", 1) # Fine-tuning task arguments - self.finetuning_task = kwargs.pop('finetuning_task', None) - self.num_labels = kwargs.pop('num_labels', 2) - self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)}) + self.finetuning_task = kwargs.pop("finetuning_task", None) + self.num_labels = kwargs.pop("num_labels", 2) + self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)}) self.id2label = dict((int(key), value) for key, value in self.id2label.items()) - self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys()))) + self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys()))) self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) # Additional attributes without default values @@ -94,7 +94,9 @@ class PretrainedConfig(object): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. """ - assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + assert os.path.isdir( + save_directory + ), "Saving path should be a directory where the model and configuration can be saved" # If we save using the predefined names, we can load using `from_pretrained` output_config_file = os.path.join(save_directory, CONFIG_NAME) @@ -153,11 +155,11 @@ class PretrainedConfig(object): assert unused_kwargs == {'foo': False} """ - cache_dir = kwargs.pop('cache_dir', None) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) - proxies = kwargs.pop('proxies', None) - return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) if pretrained_model_name_or_path in cls.pretrained_config_archive_map: config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] @@ -170,37 +172,48 @@ class PretrainedConfig(object): try: # Load from URL or cache if already cached - resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, - proxies=proxies, resume_download=resume_download) + resolved_config_file = cached_path( + config_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + ) # Load config config = cls.from_json_file(resolved_config_file) except EnvironmentError: if pretrained_model_name_or_path in cls.pretrained_config_archive_map: msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format( - config_file) + config_file + ) else: - msg = "Model name '{}' was not found in model name list ({}). " \ - "We assumed '{}' was a path or url to a configuration file named {} or " \ - "a directory containing such a file but couldn't find any such file at this path or url.".format( + msg = ( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url to a configuration file named {} or " + "a directory containing such a file but couldn't find any such file at this path or url.".format( pretrained_model_name_or_path, - ', '.join(cls.pretrained_config_archive_map.keys()), - config_file, CONFIG_NAME) + ", ".join(cls.pretrained_config_archive_map.keys()), + config_file, + CONFIG_NAME, + ) + ) raise EnvironmentError(msg) except json.JSONDecodeError: - msg = "Couldn't reach server at '{}' to download configuration file or " \ - "configuration file is not a valid JSON file. " \ - "Please check network or file content here: {}.".format(config_file, resolved_config_file) + msg = ( + "Couldn't reach server at '{}' to download configuration file or " + "configuration file is not a valid JSON file. " + "Please check network or file content here: {}.".format(config_file, resolved_config_file) + ) raise EnvironmentError(msg) if resolved_config_file == config_file: logger.info("loading configuration file {}".format(config_file)) else: - logger.info("loading configuration file {} from cache at {}".format( - config_file, resolved_config_file)) + logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file)) - if hasattr(config, 'pruned_heads'): + if hasattr(config, "pruned_heads"): config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) # Update config with kwargs if needed @@ -226,7 +239,7 @@ class PretrainedConfig(object): @classmethod def from_json_file(cls, json_file): """Constructs a `Config` from a json file of parameters.""" - with open(json_file, "r", encoding='utf-8') as reader: + with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() dict_obj = json.loads(text) return cls(**dict_obj) @@ -248,5 +261,5 @@ class PretrainedConfig(object): def to_json_file(self, json_file_path): """ Save this instance to a json file.""" - with open(json_file_path, "w", encoding='utf-8') as writer: + with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string()) diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py index 727f319778..a98024e9e6 100644 --- a/transformers/configuration_xlm.py +++ b/transformers/configuration_xlm.py @@ -25,16 +25,16 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", + "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json", + "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json", + "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json", + "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json", + "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json", + "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json", + "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json", + "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json", + "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json", + "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json", } @@ -78,41 +78,44 @@ class XLMConfig(PretrainedConfig): -1 means no clamping. same_length: bool, whether to use the same attention length for each token. """ + pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=30145, - emb_dim=2048, - n_layers=12, - n_heads=16, - dropout=0.1, - attention_dropout=0.1, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=1, - use_lang_emb=True, - max_position_embeddings=512, - embed_init_std=2048 ** -0.5, - layer_norm_eps=1e-12, - init_std=0.02, - bos_index=0, - eos_index=1, - pad_index=2, - unk_index=3, - mask_index=5, - is_encoder=True, - summary_type='first', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, - start_n_top=5, - end_n_top=5, - mask_token_id=0, - lang_id=0, - **kwargs): + def __init__( + self, + vocab_size=30145, + emb_dim=2048, + n_layers=12, + n_heads=16, + dropout=0.1, + attention_dropout=0.1, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=1, + use_lang_emb=True, + max_position_embeddings=512, + embed_init_std=2048 ** -0.5, + layer_norm_eps=1e-12, + init_std=0.02, + bos_index=0, + eos_index=1, + pad_index=2, + unk_index=3, + mask_index=5, + is_encoder=True, + summary_type="first", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + start_n_top=5, + end_n_top=5, + mask_token_id=0, + lang_id=0, + **kwargs + ): """Constructs XLMConfig. """ super(XLMConfig, self).__init__(**kwargs) diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py index 5b6955f4f8..fcf5c571d8 100644 --- a/transformers/configuration_xlm_roberta.py +++ b/transformers/configuration_xlm_roberta.py @@ -15,8 +15,7 @@ # limitations under the License. """ XLM-RoBERTa configuration """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -25,12 +24,12 @@ from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", - 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", - 'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", - 'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", - 'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", - 'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", + "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", + "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", + "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", + "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", + "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", + "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", } diff --git a/transformers/configuration_xlnet.py b/transformers/configuration_xlnet.py index 017c57cfd5..8768aeac9b 100644 --- a/transformers/configuration_xlnet.py +++ b/transformers/configuration_xlnet.py @@ -26,8 +26,8 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", - 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", + "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", + "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", } @@ -69,32 +69,35 @@ class XLNetConfig(PretrainedConfig): same_length: bool, whether to use the same attention length for each token. finetuning_task: name of the glue task on which the model was fine-tuned if any """ + pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP - def __init__(self, - vocab_size=32000, - d_model=1024, - n_layer=24, - n_head=16, - d_inner=4096, - ff_activation="gelu", - untie_r=True, - attn_type="bi", - initializer_range=0.02, - layer_norm_eps=1e-12, - dropout=0.1, - mem_len=None, - reuse_len=None, - bi_data=False, - clamp_len=-1, - same_length=False, - summary_type='last', - summary_use_proj=True, - summary_activation='tanh', - summary_last_dropout=0.1, - start_n_top=5, - end_n_top=5, - **kwargs): + def __init__( + self, + vocab_size=32000, + d_model=1024, + n_layer=24, + n_head=16, + d_inner=4096, + ff_activation="gelu", + untie_r=True, + attn_type="bi", + initializer_range=0.02, + layer_norm_eps=1e-12, + dropout=0.1, + mem_len=None, + reuse_len=None, + bi_data=False, + clamp_len=-1, + same_length=False, + summary_type="last", + summary_use_proj=True, + summary_activation="tanh", + summary_last_dropout=0.1, + start_n_top=5, + end_n_top=5, + **kwargs + ): """Constructs XLNetConfig. """ super(XLNetConfig, self).__init__(**kwargs) diff --git a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py index b6476b4fb6..733f6fc5ca 100644 --- a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py @@ -24,6 +24,7 @@ import torch from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert import logging + logging.basicConfig(level=logging.INFO) @@ -44,24 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pyt if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--albert_config_file", - default = None, - type = str, - required = True, - help = "The config json file corresponding to the pre-trained ALBERT model. \n" - "This specifies the model architecture.") - parser.add_argument("--pytorch_dump_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--albert_config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained ALBERT model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.albert_config_file, - args.pytorch_dump_path) - \ No newline at end of file + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) diff --git a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py index 75808811ef..9393068b17 100755 --- a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py @@ -24,8 +24,10 @@ import torch from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert import logging + logging.basicConfig(level=logging.INFO) + def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): # Initialise PyTorch model config = BertConfig.from_json_file(bert_config_file) @@ -43,23 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--bert_config_file", - default = None, - type = str, - required = True, - help = "The config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture.") - parser.add_argument("--pytorch_dump_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--bert_config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.bert_config_file, - args.pytorch_dump_path) + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py index 35866caac4..304c634502 100644 --- a/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py +++ b/transformers/convert_bert_pytorch_checkpoint_to_original_tf.py @@ -23,7 +23,7 @@ import tensorflow as tf from transformers import BertModel -def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): +def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): """ :param model:BertModel Pytorch model instance to be converted @@ -41,22 +41,17 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s N BertForQuestionAnswering """ - tensors_to_transpose = ( - "dense.weight", - "attention.self.query", - "attention.self.key", - "attention.self.value" - ) + tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") var_map = ( - ('layer.', 'layer_'), - ('word_embeddings.weight', 'word_embeddings'), - ('position_embeddings.weight', 'position_embeddings'), - ('token_type_embeddings.weight', 'token_type_embeddings'), - ('.', '/'), - ('LayerNorm/weight', 'LayerNorm/gamma'), - ('LayerNorm/bias', 'LayerNorm/beta'), - ('weight', 'kernel') + ("layer.", "layer_"), + ("word_embeddings.weight", "word_embeddings"), + ("position_embeddings.weight", "position_embeddings"), + ("token_type_embeddings.weight", "token_type_embeddings"), + (".", "/"), + ("LayerNorm/weight", "LayerNorm/gamma"), + ("LayerNorm/bias", "LayerNorm/beta"), + ("weight", "kernel"), ) if not os.path.isdir(ckpt_dir): @@ -64,12 +59,12 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s state_dict = model.state_dict() - def to_tf_var_name(name:str): + def to_tf_var_name(name: str): for patt, repl in iter(var_map): name = name.replace(patt, repl) - return 'bert/{}'.format(name) + return "bert/{}".format(name) - def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session): + def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): tf_dtype = tf.dtypes.as_dtype(tensor.dtype) tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) session.run(tf.variables_initializer([tf_var])) @@ -94,37 +89,22 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s def main(raw_args=None): parser = argparse.ArgumentParser() - parser.add_argument("--model_name", - type=str, - required=True, - help="model name e.g. bert-base-uncased") - parser.add_argument("--cache_dir", - type=str, - default=None, - required=False, - help="Directory containing pytorch model") - parser.add_argument("--pytorch_model_path", - type=str, - required=True, - help="/path/to/.bin") - parser.add_argument("--tf_cache_dir", - type=str, - required=True, - help="Directory in which to save tensorflow model") + parser.add_argument("--model_name", type=str, required=True, help="model name e.g. bert-base-uncased") + parser.add_argument( + "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" + ) + parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") + parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") args = parser.parse_args(raw_args) - + model = BertModel.from_pretrained( pretrained_model_name_or_path=args.model_name, state_dict=torch.load(args.pytorch_model_path), - cache_dir=args.cache_dir - ) - - convert_pytorch_checkpoint_to_tf( - model=model, - ckpt_dir=args.tf_cache_dir, - model_name=args.model_name + cache_dir=args.cache_dir, ) + convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) + if __name__ == "__main__": main() diff --git a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py index e2328c08ca..eeafdb81e5 100755 --- a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py @@ -21,12 +21,10 @@ from io import open import torch -from transformers import (CONFIG_NAME, WEIGHTS_NAME, - GPT2Config, - GPT2Model, - load_tf_weights_in_gpt2) +from transformers import CONFIG_NAME, WEIGHTS_NAME, GPT2Config, GPT2Model, load_tf_weights_in_gpt2 import logging + logging.basicConfig(level=logging.INFO) @@ -42,8 +40,8 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) @@ -54,22 +52,18 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--gpt2_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") - parser.add_argument("--gpt2_config_file", - default = "", - type = str, - help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" - "This specifies the model architecture.") + parser.add_argument( + "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--gpt2_config_file", + default="", + type=str, + help="An optional config json file corresponding to the pre-trained OpenAI model. \n" + "This specifies the model architecture.", + ) args = parser.parse_args() - convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, - args.gpt2_config_file, - args.pytorch_dump_folder_path) + convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) diff --git a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py index 13ebecf2fd..c87bb9d594 100755 --- a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py @@ -21,12 +21,10 @@ from io import open import torch -from transformers import (CONFIG_NAME, WEIGHTS_NAME, - OpenAIGPTConfig, - OpenAIGPTModel, - load_tf_weights_in_openai_gpt) +from transformers import CONFIG_NAME, WEIGHTS_NAME, OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt import logging + logging.basicConfig(level=logging.INFO) @@ -42,8 +40,8 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME + pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(model.state_dict(), pytorch_weights_dump_path) print("Save configuration file to {}".format(pytorch_config_dump_path)) @@ -54,22 +52,24 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--openai_checkpoint_folder_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") - parser.add_argument("--openai_config_file", - default = "", - type = str, - help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" - "This specifies the model architecture.") + parser.add_argument( + "--openai_checkpoint_folder_path", + default=None, + type=str, + required=True, + help="Path to the TensorFlow checkpoint path.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--openai_config_file", + default="", + type=str, + help="An optional config json file corresponding to the pre-trained OpenAI model. \n" + "This specifies the model architecture.", + ) args = parser.parse_args() - convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, - args.openai_config_file, - args.pytorch_dump_folder_path) + convert_openai_checkpoint_to_pytorch( + args.openai_checkpoint_folder_path, args.openai_config_file, args.pytorch_dump_folder_path + ) diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index 0edac6fb7d..c7ad66e132 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -24,82 +24,270 @@ import tensorflow as tf from transformers import is_torch_available, cached_path -from transformers import (load_pytorch_checkpoint_in_tf2_model, - BertConfig, TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - GPT2Config, TFGPT2LMHeadModel, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLNetConfig, TFXLNetLMHeadModel, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLMConfig, TFXLMWithLMHeadModel, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, - TransfoXLConfig, TFTransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, - OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, - RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, - DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, - AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP) +from transformers import ( + load_pytorch_checkpoint_in_tf2_model, + BertConfig, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + GPT2Config, + TFGPT2LMHeadModel, + GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLNetConfig, + TFXLNetLMHeadModel, + XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLMConfig, + TFXLMWithLMHeadModel, + XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, + TransfoXLConfig, + TFTransfoXLLMHeadModel, + TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, + OpenAIGPTConfig, + TFOpenAIGPTLMHeadModel, + OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, + RobertaConfig, + TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, + DistilBertConfig, + TFDistilBertForMaskedLM, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + CTRLConfig, + TFCTRLLMHeadModel, + CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, + AlbertConfig, + TFAlbertForMaskedLM, + ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + T5Config, + TFT5WithLMHeadModel, + T5_PRETRAINED_CONFIG_ARCHIVE_MAP, +) if is_torch_available(): import torch import numpy as np - from transformers import (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, - XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, - XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, - TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, - OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, - RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, - DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, - AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers import ( + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + XLNetLMHeadModel, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + XLMWithLMHeadModel, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + TransfoXLLMHeadModel, + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + RobertaForMaskedLM, + RobertaForSequenceClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + DistilBertForMaskedLM, + DistilBertForQuestionAnswering, + DistilBertForSequenceClassification, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + CTRLLMHeadModel, + CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + AlbertForMaskedLM, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + T5WithLMHeadModel, + T5_PRETRAINED_MODEL_ARCHIVE_MAP, + ) else: - (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, - XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, - XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, - TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, - OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, - RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, - DistilBertForMaskedLM, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, - AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = ( - None, None, None, None, - None, None, - None, None, - None, None, - None, None, - None, None, - None, None, None, - None, None, None, None, - None, None, - None, None, - None, None) + ( + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + XLNetLMHeadModel, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + XLMWithLMHeadModel, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + TransfoXLLMHeadModel, + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + RobertaForMaskedLM, + RobertaForSequenceClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + DistilBertForMaskedLM, + DistilBertForSequenceClassification, + DistilBertForQuestionAnswering, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + CTRLLMHeadModel, + CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + AlbertForMaskedLM, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + T5WithLMHeadModel, + T5_PRETRAINED_MODEL_ARCHIVE_MAP, + ) = ( + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ) import logging + logging.basicConfig(level=logging.INFO) MODEL_CLASSES = { - 'bert': (BertConfig, TFBertForPreTraining, BertForPreTraining, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'bert-large-uncased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'bert-large-cased-whole-word-masking-finetuned-squad': (BertConfig, TFBertForQuestionAnswering, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'bert-base-cased-finetuned-mrpc': (BertConfig, TFBertForSequenceClassification, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'gpt2': (GPT2Config, TFGPT2LMHeadModel, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'xlnet': (XLNetConfig, TFXLNetLMHeadModel, XLNetLMHeadModel, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'xlm': (XLMConfig, TFXLMWithLMHeadModel, XLMWithLMHeadModel, XLM_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'transfo-xl': (TransfoXLConfig, TFTransfoXLLMHeadModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'openai-gpt': (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'roberta': (RobertaConfig, TFRobertaForMaskedLM, RobertaForMaskedLM, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP), + "bert": ( + BertConfig, + TFBertForPreTraining, + BertForPreTraining, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "bert-large-uncased-whole-word-masking-finetuned-squad": ( + BertConfig, + TFBertForQuestionAnswering, + BertForQuestionAnswering, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "bert-large-cased-whole-word-masking-finetuned-squad": ( + BertConfig, + TFBertForQuestionAnswering, + BertForQuestionAnswering, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "bert-base-cased-finetuned-mrpc": ( + BertConfig, + TFBertForSequenceClassification, + BertForSequenceClassification, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "gpt2": ( + GPT2Config, + TFGPT2LMHeadModel, + GPT2LMHeadModel, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "xlnet": ( + XLNetConfig, + TFXLNetLMHeadModel, + XLNetLMHeadModel, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "xlm": ( + XLMConfig, + TFXLMWithLMHeadModel, + XLMWithLMHeadModel, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "transfo-xl": ( + TransfoXLConfig, + TFTransfoXLLMHeadModel, + TransfoXLLMHeadModel, + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "openai-gpt": ( + OpenAIGPTConfig, + TFOpenAIGPTLMHeadModel, + OpenAIGPTLMHeadModel, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "roberta": ( + RobertaConfig, + TFRobertaForMaskedLM, + RobertaForMaskedLM, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "roberta-large-mnli": ( + RobertaConfig, + TFRobertaForSequenceClassification, + RobertaForSequenceClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "distilbert": ( + DistilBertConfig, + TFDistilBertForMaskedLM, + DistilBertForMaskedLM, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "distilbert-base-uncased-distilled-squad": ( + DistilBertConfig, + TFDistilBertForQuestionAnswering, + DistilBertForQuestionAnswering, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "distilbert-base-uncased-distilled-squad": ( + DistilBertConfig, + TFDistilBertForQuestionAnswering, + DistilBertForQuestionAnswering, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "ctrl": ( + CTRLConfig, + TFCTRLLMHeadModel, + CTRLLMHeadModel, + CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "albert": ( + AlbertConfig, + TFAlbertForMaskedLM, + AlbertForMaskedLM, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), + "t5": ( + T5Config, + TFT5WithLMHeadModel, + T5WithLMHeadModel, + T5_PRETRAINED_MODEL_ARCHIVE_MAP, + T5_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), } -def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True): + +def convert_pt_checkpoint_to_tf( + model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True +): if model_type not in MODEL_CLASSES: raise ValueError("Unrecognized model type, should be one of {}.".format(list(MODEL_CLASSES.keys()))) @@ -116,17 +304,19 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file # Load weights from tf checkpoint if pytorch_checkpoint_path in aws_model_maps: - pytorch_checkpoint_path = cached_path(aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models) + pytorch_checkpoint_path = cached_path( + aws_model_maps[pytorch_checkpoint_path], force_download=not use_cached_models + ) # Load PyTorch checkpoint in tf2 model: tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path) if compare_with_pt_model: tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network - state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu') - pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None, - config=config, - state_dict=state_dict) + state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu") + pt_model = pt_model_class.from_pretrained( + pretrained_model_name_or_path=None, config=config, state_dict=state_dict + ) with torch.no_grad(): pto = pt_model(**pt_model.dummy_inputs) @@ -139,11 +329,19 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file # Save pytorch-model print("Save TensorFlow model to {}".format(tf_dump_path)) - tf_model.save_weights(tf_dump_path, save_format='h5') + tf_model.save_weights(tf_dump_path, save_format="h5") -def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None, - compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False): +def convert_all_pt_checkpoints_to_tf( + args_model_type, + tf_dump_path, + model_shortcut_names_or_path=None, + config_shortcut_names_or_path=None, + compare_with_pt_model=False, + use_cached_models=False, + remove_cached_files=False, + only_convert_finetuned_models=False, +): assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory" if args_model_type is None: @@ -156,7 +354,9 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc print(" Converting model type {}/{}: {}".format(j, len(model_types), model_type)) print("=" * 100) if model_type not in MODEL_CLASSES: - raise ValueError("Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys()))) + raise ValueError( + "Unrecognized model type {}, should be one of {}.".format(model_type, list(MODEL_CLASSES.keys())) + ) config_class, model_class, pt_model_class, aws_model_maps, aws_config_map = MODEL_CLASSES[model_type] @@ -166,9 +366,10 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc config_shortcut_names_or_path = model_shortcut_names_or_path for i, (model_shortcut_name, config_shortcut_name) in enumerate( - zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1): + zip(model_shortcut_names_or_path, config_shortcut_names_or_path), start=1 + ): print("-" * 100) - if '-squad' in model_shortcut_name or '-mrpc' in model_shortcut_name or '-mnli' in model_shortcut_name: + if "-squad" in model_shortcut_name or "-mrpc" in model_shortcut_name or "-mnli" in model_shortcut_name: if not only_convert_finetuned_models: print(" Skipping finetuned checkpoint {}".format(model_shortcut_name)) continue @@ -176,7 +377,11 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc elif only_convert_finetuned_models: print(" Skipping not finetuned checkpoint {}".format(model_shortcut_name)) continue - print(" Converting checkpoint {}/{}: {} - model_type {}".format(i, len(aws_config_map), model_shortcut_name, model_type)) + print( + " Converting checkpoint {}/{}: {} - model_type {}".format( + i, len(aws_config_map), model_shortcut_name, model_type + ) + ) print("-" * 100) if config_shortcut_name in aws_config_map: @@ -190,13 +395,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc model_file = cached_path(model_shortcut_name, force_download=not use_cached_models) if os.path.isfile(model_shortcut_name): - model_shortcut_name = 'converted_model' + model_shortcut_name = "converted_model" - convert_pt_checkpoint_to_tf(model_type=model_type, - pytorch_checkpoint_path=model_file, - config_file=config_file, - tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'), - compare_with_pt_model=compare_with_pt_model) + convert_pt_checkpoint_to_tf( + model_type=model_type, + pytorch_checkpoint_path=model_file, + config_file=config_file, + tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + "-tf_model.h5"), + compare_with_pt_model=compare_with_pt_model, + ) if remove_cached_files: os.remove(config_file) os.remove(model_file) @@ -205,39 +412,47 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_dump_path", - default = None, - type = str, - required = True, - help = "Path to the output Tensorflow dump file.") - parser.add_argument("--model_type", - default = None, - type = str, - help = "Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format(list(MODEL_CLASSES.keys()))) - parser.add_argument("--pytorch_checkpoint_path", - default = None, - type = str, - help = "Path to the PyTorch checkpoint path or shortcut name to download from AWS. " - "If not given, will download and convert all the checkpoints from AWS.") - parser.add_argument("--config_file", - default = None, - type = str, - help = "The config json file corresponding to the pre-trained model. \n" - "This specifies the model architecture. If not given and " - "--pytorch_checkpoint_path is not given or is a shortcut name" - "use the configuration associated to the shortcut name on the AWS") - parser.add_argument("--compare_with_pt_model", - action='store_true', - help = "Compare Tensorflow and PyTorch model predictions.") - parser.add_argument("--use_cached_models", - action='store_true', - help = "Use cached models if possible instead of updating to latest checkpoint versions.") - parser.add_argument("--remove_cached_files", - action='store_true', - help = "Remove pytorch models after conversion (save memory when converting in batches).") - parser.add_argument("--only_convert_finetuned_models", - action='store_true', - help = "Only convert finetuned models.") + parser.add_argument( + "--tf_dump_path", default=None, type=str, required=True, help="Path to the output Tensorflow dump file." + ) + parser.add_argument( + "--model_type", + default=None, + type=str, + help="Model type selected in the list of {}. If not given, will download and convert all the models from AWS.".format( + list(MODEL_CLASSES.keys()) + ), + ) + parser.add_argument( + "--pytorch_checkpoint_path", + default=None, + type=str, + help="Path to the PyTorch checkpoint path or shortcut name to download from AWS. " + "If not given, will download and convert all the checkpoints from AWS.", + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + help="The config json file corresponding to the pre-trained model. \n" + "This specifies the model architecture. If not given and " + "--pytorch_checkpoint_path is not given or is a shortcut name" + "use the configuration associated to the shortcut name on the AWS", + ) + parser.add_argument( + "--compare_with_pt_model", action="store_true", help="Compare Tensorflow and PyTorch model predictions." + ) + parser.add_argument( + "--use_cached_models", + action="store_true", + help="Use cached models if possible instead of updating to latest checkpoint versions.", + ) + parser.add_argument( + "--remove_cached_files", + action="store_true", + help="Remove pytorch models after conversion (save memory when converting in batches).", + ) + parser.add_argument("--only_convert_finetuned_models", action="store_true", help="Only convert finetuned models.") args = parser.parse_args() # if args.pytorch_checkpoint_path is not None: @@ -248,11 +463,15 @@ if __name__ == "__main__": # compare_with_pt_model=args.compare_with_pt_model, # use_cached_models=args.use_cached_models) # else: - convert_all_pt_checkpoints_to_tf(args.model_type.lower() if args.model_type is not None else None, - args.tf_dump_path, - model_shortcut_names_or_path=[args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None, - config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None, - compare_with_pt_model=args.compare_with_pt_model, - use_cached_models=args.use_cached_models, - remove_cached_files=args.remove_cached_files, - only_convert_finetuned_models=args.only_convert_finetuned_models) + convert_all_pt_checkpoints_to_tf( + args.model_type.lower() if args.model_type is not None else None, + args.tf_dump_path, + model_shortcut_names_or_path=[args.pytorch_checkpoint_path] + if args.pytorch_checkpoint_path is not None + else None, + config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None, + compare_with_pt_model=args.compare_with_pt_model, + use_cached_models=args.use_cached_models, + remove_cached_files=args.remove_cached_files, + only_convert_finetuned_models=args.only_convert_finetuned_models, + ) diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index fedfc1ecb8..3dec4882ff 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -30,20 +30,27 @@ if version.parse(fairseq.__version__) < version.parse("0.9.0"): from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer -from transformers.modeling_bert import (BertConfig, BertEncoder, - BertIntermediate, BertLayer, - BertModel, BertOutput, - BertSelfAttention, - BertSelfOutput) -from transformers.modeling_roberta import (RobertaEmbeddings, - RobertaForMaskedLM, - RobertaForSequenceClassification, - RobertaModel) +from transformers.modeling_bert import ( + BertConfig, + BertEncoder, + BertIntermediate, + BertLayer, + BertModel, + BertOutput, + BertSelfAttention, + BertSelfOutput, +) +from transformers.modeling_roberta import ( + RobertaEmbeddings, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaModel, +) logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -SAMPLE_TEXT = 'Hello world! cécé herlolip' +SAMPLE_TEXT = "Hello world! cécé herlolip" def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head): @@ -61,7 +68,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ intermediate_size=roberta.args.encoder_ffn_embed_dim, max_position_embeddings=514, type_vocab_size=1, - layer_norm_eps=1e-5, # PyTorch default used in fairseq + layer_norm_eps=1e-5, # PyTorch default used in fairseq ) if classification_head: config.num_labels = roberta.args.num_classes @@ -74,7 +81,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ # Embeddings model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight - model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. + model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like( + model.roberta.embeddings.token_type_embeddings.weight + ) # just zero them out b/c RoBERTa doesn't use them. model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias @@ -85,11 +94,11 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ### self attention self_attn: BertSelfAttention = layer.attention.self - assert( - roberta_layer.self_attn.k_proj.weight.data.shape == \ - roberta_layer.self_attn.q_proj.weight.data.shape == \ - roberta_layer.self_attn.v_proj.weight.data.shape == \ - torch.Size((config.hidden_size, config.hidden_size)) + assert ( + roberta_layer.self_attn.k_proj.weight.data.shape + == roberta_layer.self_attn.q_proj.weight.data.shape + == roberta_layer.self_attn.v_proj.weight.data.shape + == torch.Size((config.hidden_size, config.hidden_size)) ) self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight @@ -101,9 +110,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ### self-attention output self_output: BertSelfOutput = layer.attention.output - assert( - self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape - ) + assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape self_output.dense.weight = roberta_layer.self_attn.out_proj.weight self_output.dense.bias = roberta_layer.self_attn.out_proj.bias self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight @@ -111,28 +118,24 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ### intermediate intermediate: BertIntermediate = layer.intermediate - assert( - intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape - ) + assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape intermediate.dense.weight = roberta_layer.fc1.weight intermediate.dense.bias = roberta_layer.fc1.bias ### output bert_output: BertOutput = layer.output - assert( - bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape - ) + assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape bert_output.dense.weight = roberta_layer.fc2.weight bert_output.dense.bias = roberta_layer.fc2.bias bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias #### end of layer - + if classification_head: - model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight - model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias - model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight - model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias + model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight + model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias + model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight + model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias else: # LM Head model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight @@ -143,21 +146,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ model.lm_head.bias = roberta.model.decoder.lm_head.bias # Let's check that we get the same results. - input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 + input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 our_output = model(input_ids)[0] if classification_head: - their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids)) + their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids)) else: their_output = roberta.model(input_ids)[0] print(our_output.shape, their_output.shape) max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 + print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 success = torch.allclose(our_output, their_output, atol=1e-3) - print( - "Do both models output the same tensors?", - "🔥" if success else "💩" - ) + print("Do both models output the same tensors?", "🔥" if success else "💩") if not success: raise Exception("Something went wRoNg") @@ -169,23 +169,16 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--roberta_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path the official PyTorch dump.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") - parser.add_argument("--classification_head", - action = "store_true", - help = "Whether to convert a final classification head.") + parser.add_argument( + "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--classification_head", action="store_true", help="Whether to convert a final classification head." + ) args = parser.parse_args() convert_roberta_checkpoint_to_pytorch( - args.roberta_checkpoint_path, - args.pytorch_dump_folder_path, - args.classification_head + args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head ) - diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py index 2b74d2dd93..0b22a5f9c6 100755 --- a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py @@ -24,8 +24,10 @@ import torch from transformers import T5Config, T5Model, load_tf_weights_in_t5 import logging + logging.basicConfig(level=logging.INFO) + def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): # Initialise PyTorch model config = T5Config.from_json_file(config_file) @@ -43,23 +45,19 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--config_file", - default = None, - type = str, - required = True, - help = "The config json file corresponding to the pre-trained T5 model. \n" - "This specifies the model architecture.") - parser.add_argument("--pytorch_dump_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained T5 model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.config_file, - args.pytorch_dump_path) + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py index a5ff4ed22c..f8dd45ae55 100755 --- a/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py @@ -26,9 +26,8 @@ import torch import transformers.tokenization_transfo_xl as data_utils from transformers import CONFIG_NAME, WEIGHTS_NAME -from transformers import (TransfoXLConfig, TransfoXLLMHeadModel, - load_tf_weights_in_transfo_xl) -from transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) +from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl +from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES if sys.version_info[0] == 2: import cPickle as pickle @@ -36,32 +35,33 @@ else: import pickle import logging + logging.basicConfig(level=logging.INFO) # We do this to be able to load python 2 datasets pickles # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 data_utils.Vocab = data_utils.TransfoXLTokenizer data_utils.Corpus = data_utils.TransfoXLCorpus -sys.modules['data_utils'] = data_utils -sys.modules['vocabulary'] = data_utils +sys.modules["data_utils"] = data_utils +sys.modules["vocabulary"] = data_utils -def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, - transfo_xl_config_file, - pytorch_dump_folder_path, - transfo_xl_dataset_file): + +def convert_transfo_xl_checkpoint_to_pytorch( + tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file +): if transfo_xl_dataset_file: # Convert a pre-processed corpus (see original TensorFlow repo) with open(transfo_xl_dataset_file, "rb") as fp: corpus = pickle.load(fp, encoding="latin1") # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) - pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file'] + pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) corpus_vocab_dict = corpus.vocab.__dict__ torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) corpus_dict_no_vocab = corpus.__dict__ - corpus_dict_no_vocab.pop('vocab', None) - pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME + corpus_dict_no_vocab.pop("vocab", None) + pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME print("Save dataset to {}".format(pytorch_dataset_dump_path)) torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) @@ -92,26 +92,36 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the folder to store the PyTorch model or dataset/vocab.") - parser.add_argument("--tf_checkpoint_path", - default = "", - type = str, - help = "An optional path to a TensorFlow checkpoint path to be converted.") - parser.add_argument("--transfo_xl_config_file", - default = "", - type = str, - help = "An optional config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture.") - parser.add_argument("--transfo_xl_dataset_file", - default = "", - type = str, - help = "An optional dataset file to be converted in a vocabulary.") + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the folder to store the PyTorch model or dataset/vocab.", + ) + parser.add_argument( + "--tf_checkpoint_path", + default="", + type=str, + help="An optional path to a TensorFlow checkpoint path to be converted.", + ) + parser.add_argument( + "--transfo_xl_config_file", + default="", + type=str, + help="An optional config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--transfo_xl_dataset_file", + default="", + type=str, + help="An optional dataset file to be converted in a vocabulary.", + ) args = parser.parse_args() - convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.transfo_xl_config_file, - args.pytorch_dump_folder_path, - args.transfo_xl_dataset_file) + convert_transfo_xl_checkpoint_to_pytorch( + args.tf_checkpoint_path, + args.transfo_xl_config_file, + args.pytorch_dump_folder_path, + args.transfo_xl_dataset_file, + ) diff --git a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py index 91133ef56a..7cbf9cae95 100755 --- a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py @@ -27,32 +27,34 @@ from transformers import CONFIG_NAME, WEIGHTS_NAME from transformers.tokenization_xlm import VOCAB_FILES_NAMES import logging + logging.basicConfig(level=logging.INFO) + def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): # Load checkpoint - chkpt = torch.load(xlm_checkpoint_path, map_location='cpu') + chkpt = torch.load(xlm_checkpoint_path, map_location="cpu") - state_dict = chkpt['model'] + state_dict = chkpt["model"] # We have the base model one level deeper than the original XLM repository two_levels_state_dict = {} for k, v in state_dict.items(): - if 'pred_layer' in k: + if "pred_layer" in k: two_levels_state_dict[k] = v else: - two_levels_state_dict['transformer.' + k] = v + two_levels_state_dict["transformer." + k] = v - config = chkpt['params'] + config = chkpt["params"] config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) - vocab = chkpt['dico_word2id'] - vocab = dict((s + '' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items()) + vocab = chkpt["dico_word2id"] + vocab = dict((s + "" if s.find("@@") == -1 and i > 13 else s.replace("@@", ""), i) for s, i in vocab.items()) # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME - pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] + pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME + pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["vocab_file"] print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) torch.save(two_levels_state_dict, pytorch_weights_dump_path) @@ -69,15 +71,11 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--xlm_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path the official PyTorch dump.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") + parser.add_argument( + "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) args = parser.parse_args() convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py index 3669d9944c..83688cf07b 100755 --- a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py @@ -22,11 +22,15 @@ import os import argparse import torch -from transformers import (CONFIG_NAME, WEIGHTS_NAME, - XLNetConfig, - XLNetLMHeadModel, XLNetForQuestionAnswering, - XLNetForSequenceClassification, - load_tf_weights_in_xlnet) +from transformers import ( + CONFIG_NAME, + WEIGHTS_NAME, + XLNetConfig, + XLNetLMHeadModel, + XLNetForQuestionAnswering, + XLNetForSequenceClassification, + load_tf_weights_in_xlnet, +) GLUE_TASKS_NUM_LABELS = { "cola": 2, @@ -41,9 +45,13 @@ GLUE_TASKS_NUM_LABELS = { } import logging + logging.basicConfig(level=logging.INFO) -def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): + +def convert_xlnet_checkpoint_to_pytorch( + tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None +): # Initialise PyTorch model config = XLNetConfig.from_json_file(bert_config_file) @@ -53,7 +61,7 @@ def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, py config.finetuning_task = finetuning_task config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] model = XLNetForSequenceClassification(config) - elif 'squad' in finetuning_task: + elif "squad" in finetuning_task: config.finetuning_task = finetuning_task model = XLNetForQuestionAnswering(config) else: @@ -75,30 +83,33 @@ def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, py if __name__ == "__main__": parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--tf_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--xlnet_config_file", - default = None, - type = str, - required = True, - help = "The config json file corresponding to the pre-trained XLNet model. \n" - "This specifies the model architecture.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the folder to store the PyTorch model or dataset/vocab.") - parser.add_argument("--finetuning_task", - default = None, - type = str, - help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned") + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--xlnet_config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained XLNet model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + required=True, + help="Path to the folder to store the PyTorch model or dataset/vocab.", + ) + parser.add_argument( + "--finetuning_task", + default=None, + type=str, + help="Name of a task on which the XLNet TensorFloaw model was fine-tuned", + ) args = parser.parse_args() print(args) - convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.xlnet_config_file, - args.pytorch_dump_folder_path, - args.finetuning_task) + convert_xlnet_checkpoint_to_pytorch( + args.tf_checkpoint_path, args.xlnet_config_file, args.pytorch_dump_folder_path, args.finetuning_task + ) diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index 5567952fd2..bac6c6e3af 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,8 +1,15 @@ -from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures, SingleSentenceClassificationProcessor +from .processors import ( + InputExample, + InputFeatures, + DataProcessor, + SquadFeatures, + SingleSentenceClassificationProcessor, +) from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels from .metrics import is_sklearn_available + if is_sklearn_available(): from .metrics import glue_compute_metrics, xnli_compute_metrics diff --git a/transformers/data/metrics/__init__.py b/transformers/data/metrics/__init__.py index 5a46eb05d3..bd3b76efc0 100644 --- a/transformers/data/metrics/__init__.py +++ b/transformers/data/metrics/__init__.py @@ -23,20 +23,22 @@ logger = logging.getLogger(__name__) try: from scipy.stats import pearsonr, spearmanr from sklearn.metrics import matthews_corrcoef, f1_score + _has_sklearn = True except (AttributeError, ImportError) as e: logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html") _has_sklearn = False + def is_sklearn_available(): return _has_sklearn + if _has_sklearn: def simple_accuracy(preds, labels): return (preds == labels).mean() - def acc_and_f1(preds, labels): acc = simple_accuracy(preds, labels) f1 = f1_score(y_true=labels, y_pred=preds) @@ -46,7 +48,6 @@ if _has_sklearn: "acc_and_f1": (acc + f1) / 2, } - def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] @@ -56,7 +57,6 @@ if _has_sklearn: "corr": (pearson_corr + spearman_corr) / 2, } - def glue_compute_metrics(task_name, preds, labels): assert len(preds) == len(labels) if task_name == "cola": @@ -82,7 +82,6 @@ if _has_sklearn: else: raise KeyError(task_name) - def xnli_compute_metrics(task_name, preds, labels): assert len(preds) == len(labels) if task_name == "xnli": diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index acbb884fb8..a867fe3fde 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -24,19 +24,21 @@ logger = logging.getLogger(__name__) def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): - regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) - return re.sub(regex, ' ', text) + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) def white_space_fix(text): - return ' '.join(text.split()) + return " ".join(text.split()) def remove_punc(text): exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) + return "".join(ch for ch in text if ch not in exclude) def lower(text): return text.lower() + return white_space_fix(remove_articles(remove_punc(lower(s)))) @@ -75,14 +77,14 @@ def get_raw_scores(examples, preds): for example in examples: qas_id = example.qas_id - gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])] + gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])] if not gold_answers: # For unanswerable questions, only correct answer is empty string - gold_answers = [''] + gold_answers = [""] if qas_id not in preds: - print('Missing prediction for %s' % qas_id) + print("Missing prediction for %s" % qas_id) continue prediction = preds[qas_id] @@ -106,23 +108,27 @@ def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): def make_eval_dict(exact_scores, f1_scores, qid_list=None): if not qid_list: total = len(exact_scores) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores.values()) / total), - ('f1', 100.0 * sum(f1_scores.values()) / total), - ('total', total), - ]) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores.values()) / total), + ("f1", 100.0 * sum(f1_scores.values()) / total), + ("total", total), + ] + ) else: total = len(qid_list) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), - ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), - ('total', total), - ]) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ("total", total), + ] + ) def merge_eval(main_eval, new_eval, prefix): for k in new_eval: - main_eval['%s_%s' % (prefix, k)] = new_eval[k] + main_eval["%s_%s" % (prefix, k)] = new_eval[k] def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): @@ -160,16 +166,14 @@ def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2( - preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2( - preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh - main_eval['has_ans_exact'] = has_ans_exact - main_eval['has_ans_f1'] = has_ans_f1 + best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh + main_eval["has_ans_exact"] = has_ans_exact + main_eval["has_ans_f1"] = has_ans_f1 def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): @@ -199,10 +203,10 @@ def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_h best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0): @@ -215,18 +219,20 @@ def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_ exact, f1 = get_raw_scores(examples, preds) - exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) + exact_threshold = apply_no_ans_threshold( + exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold + ) f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) evaluation = make_eval_dict(exact_threshold, f1_threshold) if has_answer_qids: has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) - merge_eval(evaluation, has_ans_eval, 'HasAns') + merge_eval(evaluation, has_ans_eval, "HasAns") if no_answer_qids: no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) - merge_eval(evaluation, no_ans_eval, 'NoAns') + merge_eval(evaluation, no_ans_eval, "NoAns") if no_answer_probs: find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer) @@ -284,8 +290,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + logger.info("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 @@ -294,8 +299,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text) return orig_text # We then project the characters in `pred_text` back to `orig_text` using @@ -326,7 +330,7 @@ def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): logger.info("Couldn't map end position") return orig_text - output_text = orig_text[orig_start_position:(orig_end_position + 1)] + output_text = orig_text[orig_start_position : (orig_end_position + 1)] return output_text @@ -393,8 +397,8 @@ def compute_predictions_logits( unique_id_to_result[result.unique_id] = result _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] + ) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() @@ -447,7 +451,9 @@ def compute_predictions_logits( start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) + end_logit=result.end_logits[end_index], + ) + ) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction( @@ -455,14 +461,14 @@ def compute_predictions_logits( start_index=0, end_index=0, start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) + end_logit=null_end_logit, + ) + ) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) seen_predictions = {} nbest = [] @@ -471,10 +477,10 @@ def compute_predictions_logits( break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) @@ -498,31 +504,21 @@ def compute_predictions_logits( final_text = "" seen_predictions[final_text] = True - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. if len(nbest) == 1: - nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 @@ -551,8 +547,7 @@ def compute_predictions_logits( all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" @@ -586,7 +581,7 @@ def compute_predictions_log_probs( end_n_top, version_2_with_negative, tokenizer, - verbose_logging + verbose_logging, ): """ XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of null if needed. @@ -594,12 +589,12 @@ def compute_predictions_log_probs( Requires utils_squad_evaluate.py """ _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", - "start_log_prob", "end_log_prob"]) + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"] + ) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"] + ) logger.info("Writing predictions to: %s", output_prediction_file) # logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -663,12 +658,13 @@ def compute_predictions_log_probs( start_index=start_index, end_index=end_index, start_log_prob=start_log_prob, - end_log_prob=end_log_prob)) + end_log_prob=end_log_prob, + ) + ) prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_log_prob + x.end_log_prob), - reverse=True) + prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True + ) seen_predictions = {} nbest = [] @@ -688,10 +684,10 @@ def compute_predictions_log_probs( # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] tok_text = tokenizer.convert_tokens_to_string(tok_tokens) # Clean whitespace @@ -704,8 +700,7 @@ def compute_predictions_log_probs( else: do_lower_case = tokenizer.do_lowercase_and_remove_accent - final_text = get_final_text(tok_text, orig_text, do_lower_case, - verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue @@ -713,17 +708,13 @@ def compute_predictions_log_probs( seen_predictions[final_text] = True nbest.append( - _NbestPrediction( - text=final_text, - start_log_prob=pred.start_log_prob, - end_log_prob=pred.end_log_prob)) + _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob) + ) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. if not nbest: - nbest.append( - _NbestPrediction(text="", start_log_prob=-1e6, - end_log_prob=-1e6)) + nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6)) total_scores = [] best_non_null_entry = None diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index 4f7307bb7b..e59e9fbcb2 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor -from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels \ No newline at end of file +from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py index 11ebd949de..f9c0132a71 100644 --- a/transformers/data/processors/glue.py +++ b/transformers/data/processors/glue.py @@ -27,15 +27,18 @@ if is_tf_available(): logger = logging.getLogger(__name__) -def glue_convert_examples_to_features(examples, tokenizer, - max_length=512, - task=None, - label_list=None, - output_mode=None, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - mask_padding_with_zero=True): +def glue_convert_examples_to_features( + examples, + tokenizer, + max_length=512, + task=None, + label_list=None, + output_mode=None, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True, +): """ Loads a data file into a list of ``InputFeatures`` @@ -82,12 +85,7 @@ def glue_convert_examples_to_features(examples, tokenizer, example = processor.get_example_from_tensor_dict(example) example = processor.tfds_map(example) - inputs = tokenizer.encode_plus( - example.text_a, - example.text_b, - add_special_tokens=True, - max_length=max_length, - ) + inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,) input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] # The mask has 1 for real tokens and 0 for padding tokens. Only real @@ -106,8 +104,12 @@ def glue_convert_examples_to_features(examples, tokenizer, token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) - assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) - assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) + assert len(attention_mask) == max_length, "Error with input length {} vs {}".format( + len(attention_mask), max_length + ) + assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format( + len(token_type_ids), max_length + ) if output_mode == "classification": label = label_map[example.label] @@ -125,28 +127,36 @@ def glue_convert_examples_to_features(examples, tokenizer, logger.info("label: %s (id = %d)" % (example.label, label)) features.append( - InputFeatures(input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - label=label)) + InputFeatures( + input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label + ) + ) if is_tf_available() and is_tf_dataset: + def gen(): for ex in features: - yield ({'input_ids': ex.input_ids, - 'attention_mask': ex.attention_mask, - 'token_type_ids': ex.token_type_ids}, - ex.label) + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + }, + ex.label, + ) - return tf.data.Dataset.from_generator(gen, - ({'input_ids': tf.int32, - 'attention_mask': tf.int32, - 'token_type_ids': tf.int32}, - tf.int64), - ({'input_ids': tf.TensorShape([None]), - 'attention_mask': tf.TensorShape([None]), - 'token_type_ids': tf.TensorShape([None])}, - tf.TensorShape([]))) + return tf.data.Dataset.from_generator( + gen, + ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64), + ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "token_type_ids": tf.TensorShape([None]), + }, + tf.TensorShape([]), + ), + ) return features @@ -156,21 +166,21 @@ class MrpcProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence1'].numpy().decode('utf-8'), - tensor_dict['sentence2'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -186,8 +196,7 @@ class MrpcProcessor(DataProcessor): text_a = line[3] text_b = line[4] label = line[0] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -196,21 +205,20 @@ class MnliProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['premise'].numpy().decode('utf-8'), - tensor_dict['hypothesis'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["premise"].numpy().decode("utf-8"), + tensor_dict["hypothesis"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") def get_labels(self): """See base class.""" @@ -226,8 +234,7 @@ class MnliProcessor(DataProcessor): text_a = line[8] text_b = line[9] label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -236,9 +243,7 @@ class MnliMismatchedProcessor(MnliProcessor): def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), - "dev_matched") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") class ColaProcessor(DataProcessor): @@ -246,20 +251,20 @@ class ColaProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence'].numpy().decode('utf-8'), - None, - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence"].numpy().decode("utf-8"), + None, + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -272,8 +277,7 @@ class ColaProcessor(DataProcessor): guid = "%s-%s" % (set_type, i) text_a = line[3] label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples @@ -282,20 +286,20 @@ class Sst2Processor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence'].numpy().decode('utf-8'), - None, - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence"].numpy().decode("utf-8"), + None, + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -310,8 +314,7 @@ class Sst2Processor(DataProcessor): guid = "%s-%s" % (set_type, i) text_a = line[0] label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) return examples @@ -320,20 +323,20 @@ class StsbProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence1'].numpy().decode('utf-8'), - tensor_dict['sentence2'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -349,8 +352,7 @@ class StsbProcessor(DataProcessor): text_a = line[7] text_b = line[8] label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -359,20 +361,20 @@ class QqpProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['question1'].numpy().decode('utf-8'), - tensor_dict['question2'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["question1"].numpy().decode("utf-8"), + tensor_dict["question2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -391,8 +393,7 @@ class QqpProcessor(DataProcessor): label = line[5] except IndexError: continue - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -401,21 +402,20 @@ class QnliProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['question'].numpy().decode('utf-8'), - tensor_dict['sentence'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["question"].numpy().decode("utf-8"), + tensor_dict["sentence"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), - "dev_matched") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") def get_labels(self): """See base class.""" @@ -431,8 +431,7 @@ class QnliProcessor(DataProcessor): text_a = line[1] text_b = line[2] label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -441,20 +440,20 @@ class RteProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence1'].numpy().decode('utf-8'), - tensor_dict['sentence2'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -470,8 +469,7 @@ class RteProcessor(DataProcessor): text_a = line[1] text_b = line[2] label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -480,20 +478,20 @@ class WnliProcessor(DataProcessor): def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" - return InputExample(tensor_dict['idx'].numpy(), - tensor_dict['sentence1'].numpy().decode('utf-8'), - tensor_dict['sentence2'].numpy().decode('utf-8'), - str(tensor_dict['label'].numpy())) + return InputExample( + tensor_dict["idx"].numpy(), + tensor_dict["sentence1"].numpy().decode("utf-8"), + tensor_dict["sentence2"].numpy().decode("utf-8"), + str(tensor_dict["label"].numpy()), + ) def get_train_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") def get_labels(self): """See base class.""" @@ -509,10 +507,10 @@ class WnliProcessor(DataProcessor): text_a = line[1] text_b = line[2] label = line[-1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples + glue_tasks_num_labels = { "cola": 2, "mnli": 3, diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index fd5150e93f..efb10830bd 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -82,8 +82,8 @@ def _is_whitespace(c): return True return False -def squad_convert_example_to_features(example, max_seq_length, - doc_stride, max_query_length, is_training): + +def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training): features = [] if is_training and not example.is_impossible: # Get start and end position @@ -91,7 +91,7 @@ def squad_convert_example_to_features(example, max_seq_length, end_position = example.end_position # If the answer cannot be found in the text, then skip this example. - actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)]) + actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) @@ -121,8 +121,11 @@ def squad_convert_example_to_features(example, max_seq_length, spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) - sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \ - if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence + sequence_added_tokens = ( + tokenizer.max_len - tokenizer.max_len_single_sentence + 1 + if "roberta" in str(type(tokenizer)) + else tokenizer.max_len - tokenizer.max_len_single_sentence + ) sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair span_doc_tokens = all_doc_tokens @@ -135,16 +138,18 @@ def squad_convert_example_to_features(example, max_seq_length, return_overflowing_tokens=True, pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first' + truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", ) - paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, - max_seq_length - len(truncated_query) - sequence_pair_added_tokens) + paragraph_len = min( + len(all_doc_tokens) - len(spans) * doc_stride, + max_seq_length - len(truncated_query) - sequence_pair_added_tokens, + ) - if tokenizer.pad_token_id in encoded_dict['input_ids']: - non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + if tokenizer.pad_token_id in encoded_dict["input_ids"]: + non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] else: - non_padded_ids = encoded_dict['input_ids'] + non_padded_ids = encoded_dict["input_ids"] tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) @@ -170,17 +175,20 @@ def squad_convert_example_to_features(example, max_seq_length, for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = j if tokenizer.padding_side == "left" else spans[doc_span_index][ - "truncated_query_with_special_tokens_length"] + j + index = ( + j + if tokenizer.padding_side == "left" + else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + ) spans[doc_span_index]["token_is_max_context"][index] = is_max_context for span in spans: # Identify the position of the CLS token - cls_index = span['input_ids'].index(tokenizer.cls_token_id) + cls_index = span["input_ids"].index(tokenizer.cls_token_id) # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) # Original TF implem also keep the classification token (set to 0) (not sure why...) - p_mask = np.array(span['token_type_ids']) + p_mask = np.array(span["token_type_ids"]) p_mask = np.minimum(p_mask, 1) @@ -219,31 +227,34 @@ def squad_convert_example_to_features(example, max_seq_length, start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset - features.append(SquadFeatures( - span['input_ids'], - span['attention_mask'], - span['token_type_ids'], - cls_index, - p_mask.tolist(), - example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. - unique_id=0, - paragraph_len=span['paragraph_len'], - token_is_max_context=span["token_is_max_context"], - tokens=span["tokens"], - token_to_orig_map=span["token_to_orig_map"], - - start_position=start_position, - end_position=end_position - )) + features.append( + SquadFeatures( + span["input_ids"], + span["attention_mask"], + span["token_type_ids"], + cls_index, + p_mask.tolist(), + example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. + unique_id=0, + paragraph_len=span["paragraph_len"], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"], + start_position=start_position, + end_position=end_position, + ) + ) return features + def squad_convert_example_to_features_init(tokenizer_for_convert): global tokenizer tokenizer = tokenizer_for_convert -def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - return_dataset=False, threads=1): + +def squad_convert_examples_to_features( + examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1 +): """ Converts a list of examples into a list of features that can be directly given as input to a model. It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. @@ -279,17 +290,28 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, ) """ - # Defining helper methods + # Defining helper methods features = [] threads = min(threads, cpu_count()) with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: - annotate_ = partial(squad_convert_example_to_features, max_seq_length=max_seq_length, - doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training) - features = list(tqdm(p.imap(annotate_, examples, chunksize=32), total=len(examples), desc='convert squad examples to features')) + annotate_ = partial( + squad_convert_example_to_features, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + is_training=is_training, + ) + features = list( + tqdm( + p.imap(annotate_, examples, chunksize=32), + total=len(examples), + desc="convert squad examples to features", + ) + ) new_features = [] unique_id = 1000000000 example_index = 0 - for example_features in tqdm(features, total=len(features), desc='add example index and unique id'): + for example_features in tqdm(features, total=len(features), desc="add example index and unique id"): if not example_features: continue for example_feature in example_features: @@ -300,7 +322,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, example_index += 1 features = new_features del new_features - if return_dataset == 'pt': + if return_dataset == "pt": if not is_torch_available(): raise ImportError("Pytorch must be installed to return a pytorch dataset.") @@ -341,12 +363,13 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, "input_ids": ex.input_ids, "attention_mask": ex.attention_mask, "token_type_ids": ex.token_type_ids, - }, { + }, + { "start_position": ex.start_position, "end_position": ex.end_position, "cls_index": ex.cls_index, "p_mask": ex.p_mask, - } + }, ) return tf.data.Dataset.from_generator( diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py index ee234e6e90..41cc00d4bd 100644 --- a/transformers/data/processors/utils.py +++ b/transformers/data/processors/utils.py @@ -24,6 +24,7 @@ from ...file_utils import is_tf_available, is_torch_available logger = logging.getLogger(__name__) + class InputExample(object): """ A single training/test example for simple sequence classification. @@ -37,6 +38,7 @@ class InputExample(object): label: (Optional) string. The label of the example. This should be specified for train and dev examples, but not for test examples. """ + def __init__(self, guid, text_a, text_b=None, label=None): self.guid = guid self.text_a = text_a @@ -99,14 +101,15 @@ class DataProcessor(object): lines = [] for line in reader: if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) + line = list(unicode(cell, "utf-8") for cell in line) lines.append(line) return lines class SingleSentenceClassificationProcessor(DataProcessor): """ Generic processor for a single sentence classification data set.""" - def __init__(self, labels=None, examples=None, mode='classification', verbose=False): + + def __init__(self, labels=None, examples=None, mode="classification", verbose=False): self.labels = [] if labels is None else labels self.examples = [] if examples is None else examples self.mode = mode @@ -117,22 +120,24 @@ class SingleSentenceClassificationProcessor(DataProcessor): def __getitem__(self, idx): if isinstance(idx, slice): - return SingleSentenceClassificationProcessor(labels=self.labels, - examples=self.examples[idx]) + return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx]) return self.examples[idx] @classmethod - def create_from_csv(cls, file_name, split_name='', column_label=0, column_text=1, - column_id=None, skip_first_row=False, **kwargs): + def create_from_csv( + cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs + ): processor = cls(**kwargs) - processor.add_examples_from_csv(file_name, - split_name=split_name, - column_label=column_label, - column_text=column_text, - column_id=column_id, - skip_first_row=skip_first_row, - overwrite_labels=True, - overwrite_examples=True) + processor.add_examples_from_csv( + file_name, + split_name=split_name, + column_label=column_label, + column_text=column_text, + column_id=column_id, + skip_first_row=skip_first_row, + overwrite_labels=True, + overwrite_examples=True, + ) return processor @classmethod @@ -141,8 +146,17 @@ class SingleSentenceClassificationProcessor(DataProcessor): processor.add_examples(texts_or_text_and_labels, labels=labels) return processor - def add_examples_from_csv(self, file_name, split_name='', column_label=0, column_text=1, column_id=None, - skip_first_row=False, overwrite_labels=False, overwrite_examples=False): + def add_examples_from_csv( + self, + file_name, + split_name="", + column_label=0, + column_text=1, + column_id=None, + skip_first_row=False, + overwrite_labels=False, + overwrite_examples=False, + ): lines = self._read_tsv(file_name) if skip_first_row: lines = lines[1:] @@ -158,10 +172,13 @@ class SingleSentenceClassificationProcessor(DataProcessor): guid = "%s-%s" % (split_name, i) if split_name else "%s" % i ids.append(guid) - return self.add_examples(texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples) + return self.add_examples( + texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples + ) - def add_examples(self, texts_or_text_and_labels, labels=None, ids=None, - overwrite_labels=False, overwrite_examples=False): + def add_examples( + self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False + ): assert labels is None or len(texts_or_text_and_labels) == len(labels) assert ids is None or len(texts_or_text_and_labels) == len(ids) if ids is None: @@ -192,13 +209,15 @@ class SingleSentenceClassificationProcessor(DataProcessor): return self.examples - def get_features(self, - tokenizer, - max_length=None, - pad_on_left=False, - pad_token=0, - mask_padding_with_zero=True, - return_tensors=None): + def get_features( + self, + tokenizer, + max_length=None, + pad_on_left=False, + pad_token=0, + mask_padding_with_zero=True, + return_tensors=None, + ): """ Convert examples in a list of ``InputFeatures`` @@ -231,9 +250,7 @@ class SingleSentenceClassificationProcessor(DataProcessor): logger.info("Tokenizing example %d", ex_index) input_ids = tokenizer.encode( - example.text_a, - add_special_tokens=True, - max_length=min(max_length, tokenizer.max_len), + example.text_a, add_special_tokens=True, max_length=min(max_length, tokenizer.max_len), ) all_input_ids.append(input_ids) @@ -256,8 +273,12 @@ class SingleSentenceClassificationProcessor(DataProcessor): input_ids = input_ids + ([pad_token] * padding_length) attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - assert len(input_ids) == batch_length, "Error with input length {} vs {}".format(len(input_ids), batch_length) - assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format(len(attention_mask), batch_length) + assert len(input_ids) == batch_length, "Error with input length {} vs {}".format( + len(input_ids), batch_length + ) + assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format( + len(attention_mask), batch_length + ) if self.mode == "classification": label = label_map[example.label] @@ -273,36 +294,31 @@ class SingleSentenceClassificationProcessor(DataProcessor): logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) logger.info("label: %s (id = %d)" % (example.label, label)) - features.append( - InputFeatures(input_ids=input_ids, - attention_mask=attention_mask, - label=label)) + features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label)) if return_tensors is None: return features - elif return_tensors == 'tf': + elif return_tensors == "tf": if not is_tf_available(): raise ImportError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported") import tensorflow as tf + def gen(): for ex in features: - yield ({'input_ids': ex.input_ids, - 'attention_mask': ex.attention_mask}, - ex.label) + yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label) - dataset = tf.data.Dataset.from_generator(gen, - ({'input_ids': tf.int32, - 'attention_mask': tf.int32}, - tf.int64), - ({'input_ids': tf.TensorShape([None]), - 'attention_mask': tf.TensorShape([None])}, - tf.TensorShape([]))) + dataset = tf.data.Dataset.from_generator( + gen, + ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64), + ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])), + ) return dataset - elif return_tensors == 'pt': + elif return_tensors == "pt": if not is_torch_available(): raise ImportError("return_tensors set to 'pt' but PyTorch can't be imported") import torch from torch.utils.data import TensorDataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) if self.mode == "classification": diff --git a/transformers/data/processors/xnli.py b/transformers/data/processors/xnli.py index 958bdf62f9..ffe0358c1e 100644 --- a/transformers/data/processors/xnli.py +++ b/transformers/data/processors/xnli.py @@ -24,11 +24,12 @@ from .utils import DataProcessor, InputExample logger = logging.getLogger(__name__) + class XnliProcessor(DataProcessor): """Processor for the XNLI dataset. Adapted from https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207""" - def __init__(self, language, train_language = None): + def __init__(self, language, train_language=None): self.language = language self.train_language = train_language @@ -40,13 +41,12 @@ class XnliProcessor(DataProcessor): for (i, line) in enumerate(lines): if i == 0: continue - guid = "%s-%s" % ('train', i) + guid = "%s-%s" % ("train", i) text_a = line[0] text_b = line[1] label = "contradiction" if line[2] == "contradictory" else line[2] assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples def get_test_examples(self, data_dir): @@ -59,19 +59,19 @@ class XnliProcessor(DataProcessor): language = line[0] if language != self.language: continue - guid = "%s-%s" % ('test', i) + guid = "%s-%s" % ("test", i) text_a = line[6] text_b = line[7] label = line[1] assert isinstance(text_a, str) and isinstance(text_b, str) and isinstance(label, str) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples def get_labels(self): """See base class.""" return ["contradiction", "entailment", "neutral"] + xnli_processors = { "xnli": XnliProcessor, } diff --git a/transformers/file_utils.py b/transformers/file_utils.py index ec925c6160..c45bdee04a 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -3,7 +3,7 @@ Utilities for working with the local dataset cache. This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp Copyright by the AllenNLP authors. """ -from __future__ import (absolute_import, division, print_function, unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import sys import json @@ -29,9 +29,10 @@ from filelock import FileLock logger = logging.getLogger(__name__) # pylint: disable=invalid-name try: - os.environ.setdefault('USE_TORCH', 'YES') - if os.environ['USE_TORCH'].upper() in ('1', 'ON', 'YES'): + os.environ.setdefault("USE_TORCH", "YES") + if os.environ["USE_TORCH"].upper() in ("1", "ON", "YES"): import torch + _torch_available = True # pylint: disable=invalid-name logger.info("PyTorch version {} available.".format(torch.__version__)) else: @@ -41,10 +42,11 @@ except ImportError: _torch_available = False # pylint: disable=invalid-name try: - os.environ.setdefault('USE_TF', 'YES') - if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'): + os.environ.setdefault("USE_TF", "YES") + if os.environ["USE_TF"].upper() in ("1", "ON", "YES"): import tensorflow as tf - assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2 + + assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2 _tf_available = True # pylint: disable=invalid-name logger.info("TensorFlow version {} available.".format(tf.__version__)) else: @@ -55,12 +57,13 @@ except (ImportError, AssertionError): try: from torch.hub import _get_torch_home + torch_cache_home = _get_torch_home() except ImportError: torch_cache_home = os.path.expanduser( - os.getenv('TORCH_HOME', os.path.join( - os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) -default_cache_path = os.path.join(torch_cache_home, 'transformers') + os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")) + ) +default_cache_path = os.path.join(torch_cache_home, "transformers") try: from urllib.parse import urlparse @@ -69,19 +72,21 @@ except ImportError: try: from pathlib import Path + PYTORCH_PRETRAINED_BERT_CACHE = Path( - os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))) + os.getenv("PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)) + ) except (AttributeError, ImportError): - PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE', - os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', - default_cache_path)) + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv( + "PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) + ) PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility WEIGHTS_NAME = "pytorch_model.bin" -TF2_WEIGHTS_NAME = 'tf_model.h5' -TF_WEIGHTS_NAME = 'model.ckpt' +TF2_WEIGHTS_NAME = "tf_model.h5" +TF_WEIGHTS_NAME = "model.ckpt" CONFIG_NAME = "config.json" MODEL_CARD_NAME = "modelcard.json" @@ -95,38 +100,48 @@ CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net" def is_torch_available(): return _torch_available + def is_tf_available(): return _tf_available + if not six.PY2: + def add_start_docstrings(*docstr): def docstring_decorator(fn): - fn.__doc__ = ''.join(docstr) + fn.__doc__ + fn.__doc__ = "".join(docstr) + fn.__doc__ return fn + return docstring_decorator def add_end_docstrings(*docstr): def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + ''.join(docstr) + fn.__doc__ = fn.__doc__ + "".join(docstr) return fn + return docstring_decorator + + else: # Not possible to update class docstrings on python2 def add_start_docstrings(*docstr): def docstring_decorator(fn): return fn + return docstring_decorator def add_end_docstrings(*docstr): def docstring_decorator(fn): return fn + return docstring_decorator def is_remote_url(url_or_filename): parsed = urlparse(url_or_filename) - return parsed.scheme in ('http', 'https', 's3') + return parsed.scheme in ("http", "https", "s3") + def hf_bucket_url(identifier, postfix=None, cdn=False): endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX @@ -145,17 +160,17 @@ def url_to_filename(url, etag=None): so that TF 2.0 can identify it as a HDF5 file (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) """ - url_bytes = url.encode('utf-8') + url_bytes = url.encode("utf-8") url_hash = sha256(url_bytes) filename = url_hash.hexdigest() if etag: - etag_bytes = etag.encode('utf-8') + etag_bytes = etag.encode("utf-8") etag_hash = sha256(etag_bytes) - filename += '.' + etag_hash.hexdigest() + filename += "." + etag_hash.hexdigest() - if url.endswith('.h5'): - filename += '.h5' + if url.endswith(".h5"): + filename += ".h5" return filename @@ -174,19 +189,21 @@ def filename_to_url(filename, cache_dir=None): if not os.path.exists(cache_path): raise EnvironmentError("file {} not found".format(cache_path)) - meta_path = cache_path + '.json' + meta_path = cache_path + ".json" if not os.path.exists(meta_path): raise EnvironmentError("file {} not found".format(meta_path)) with open(meta_path, encoding="utf-8") as meta_file: metadata = json.load(meta_file) - url = metadata['url'] - etag = metadata['etag'] + url = metadata["url"] + etag = metadata["etag"] return url, etag -def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None): +def cached_path( + url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None +): """ Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file and cache it, and @@ -207,13 +224,18 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N if is_remote_url(url_or_filename): # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, cache_dir=cache_dir, - force_download=force_download, proxies=proxies, - resume_download=resume_download, user_agent=user_agent) + return get_from_cache( + url_or_filename, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + user_agent=user_agent, + ) elif os.path.exists(url_or_filename): # File, and it exists. return url_or_filename - elif urlparse(url_or_filename).scheme == '': + elif urlparse(url_or_filename).scheme == "": # File, but it doesn't exist. raise EnvironmentError("file {} not found".format(url_or_filename)) else: @@ -273,31 +295,35 @@ def s3_get(url, temp_file, proxies=None): def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0]) if isinstance(user_agent, dict): - ua += "; " + "; ".join( - "{}/{}".format(k, v) for k, v in user_agent.items() - ) + ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items()) elif isinstance(user_agent, six.string_types): - ua += "; "+ user_agent - headers = { - "user-agent": ua - } + ua += "; " + user_agent + headers = {"user-agent": ua} if resume_size > 0: - headers['Range'] = 'bytes=%d-' % (resume_size,) + headers["Range"] = "bytes=%d-" % (resume_size,) response = requests.get(url, stream=True, proxies=proxies, headers=headers) if response.status_code == 416: # Range not satisfiable return - content_length = response.headers.get('Content-Length') + content_length = response.headers.get("Content-Length") total = resume_size + int(content_length) if content_length is not None else None - progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, - desc="Downloading", disable=bool(logger.level<=logging.INFO)) + progress = tqdm( + unit="B", + unit_scale=True, + total=total, + initial=resume_size, + desc="Downloading", + disable=bool(logger.level <= logging.INFO), + ) for chunk in response.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks + if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) temp_file.write(chunk) progress.close() -def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None): +def get_from_cache( + url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None +): """ Given a URL, look for the corresponding dataset in the local cache. If it's not there, download it. Then return the path to the cached file. @@ -326,7 +352,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag etag = None if sys.version_info[0] == 2 and etag is not None: - etag = etag.decode('utf-8') + etag = etag.decode("utf-8") filename = url_to_filename(url, etag) # get cache path to put the file @@ -337,22 +363,24 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag if not os.path.exists(cache_path) and etag is None: matching_files = [ file - for file in fnmatch.filter(os.listdir(cache_dir), filename + '.*') - if not file.endswith('.json') and not file.endswith('.lock') + for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*") + if not file.endswith(".json") and not file.endswith(".lock") ] if matching_files: cache_path = os.path.join(cache_dir, matching_files[-1]) # Prevent parallel downloads of the same file with a lock. - lock_path = cache_path + '.lock' + lock_path = cache_path + ".lock" with FileLock(lock_path): if resume_download: - incomplete_path = cache_path + '.incomplete' + incomplete_path = cache_path + ".incomplete" + @contextmanager def _resumable_file_manager(): - with open(incomplete_path,'a+b') as f: + with open(incomplete_path, "a+b") as f: yield f + temp_file_manager = _resumable_file_manager if os.path.exists(incomplete_path): resume_size = os.stat(incomplete_path).st_size @@ -366,7 +394,9 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. with temp_file_manager() as temp_file: - logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) + logger.info( + "%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name + ) # GET file object if url.startswith("s3://"): @@ -383,12 +413,12 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag os.rename(temp_file.name, cache_path) logger.info("creating metadata file for %s", cache_path) - meta = {'url': url, 'etag': etag} - meta_path = cache_path + '.json' - with open(meta_path, 'w') as meta_file: + meta = {"url": url, "etag": etag} + meta_path = cache_path + ".json" + with open(meta_path, "w") as meta_file: output_string = json.dumps(meta) if sys.version_info[0] == 2 and isinstance(output_string, str): - output_string = unicode(output_string, 'utf-8') # The beauty of python 2 + output_string = unicode(output_string, "utf-8") # The beauty of python 2 meta_file.write(output_string) return cache_path diff --git a/transformers/hf_api.py b/transformers/hf_api.py index 170732339a..81cc9f7ebb 100644 --- a/transformers/hf_api.py +++ b/transformers/hf_api.py @@ -24,13 +24,14 @@ from tqdm import tqdm ENDPOINT = "https://huggingface.co" + class S3Obj: def __init__( self, - filename, # type: str - LastModified, # type: str - ETag, # type: str - Size, # type: int + filename, # type: str + LastModified, # type: str + ETag, # type: str + Size, # type: int **kwargs ): self.filename = filename @@ -43,13 +44,13 @@ class PresignedUrl: def __init__( self, write, # type: str - access, # type: str - type, # type: str + access, # type: str + type, # type: str **kwargs ): self.write = write self.access = access - self.type = type # mime-type to send to S3. + self.type = type # mime-type to send to S3. class HfApi: @@ -58,8 +59,8 @@ class HfApi: def login( self, - username, # type: str - password, # type: str + username, # type: str + password, # type: str ): # type: (...) -> str """ @@ -78,8 +79,7 @@ class HfApi: return d["token"] def whoami( - self, - token, # type: str + self, token, # type: str ): # type: (...) -> str """ @@ -106,11 +106,7 @@ class HfApi: Call HF API to get a presigned url to upload `filename` to S3. """ path = "{}/api/presign".format(self.endpoint) - r = requests.post( - path, - headers={"authorization": "Bearer {}".format(token)}, - json={"filename": filename}, - ) + r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename},) r.raise_for_status() d = r.json() return PresignedUrl(**d) @@ -126,16 +122,14 @@ class HfApi: urls = self.presign(token, filename=filename) # streaming upload: # https://2.python-requests.org/en/master/user/advanced/#streaming-uploads - # + # # Even though we presign with the correct content-type, # the client still has to specify it when uploading the file. with open(filepath, "rb") as f: pf = TqdmProgressFileReader(f) data = f if pf.total_size > 0 else "" - r = requests.put(urls.write, data=data, headers={ - "content-type": urls.type, - }) + r = requests.put(urls.write, data=data, headers={"content-type": urls.type,}) r.raise_for_status() pf.close() return urls.access @@ -152,7 +146,6 @@ class HfApi: return [S3Obj(**x) for x in d] - class TqdmProgressFileReader: """ Wrap an io.BufferedReader `f` (such as the output of `open(…, "rb")`) @@ -161,12 +154,12 @@ class TqdmProgressFileReader: see github.com/huggingface/transformers/pull/2078#discussion_r354739608 for implementation details. """ + def __init__( - self, - f # type: io.BufferedReader + self, f # type: io.BufferedReader ): self.f = f - self.total_size = os.fstat(f.fileno()).st_size # type: int + self.total_size = os.fstat(f.fileno()).st_size # type: int self.pbar = tqdm(total=self.total_size, leave=False) if six.PY3: # does not work unless PY3 @@ -182,7 +175,6 @@ class TqdmProgressFileReader: self.pbar.close() - class HfFolder: path_token = expanduser("~/.huggingface/token") @@ -201,7 +193,7 @@ class HfFolder: if e.errno != os.errno.EEXIST: raise e pass - with open(cls.path_token, 'w+') as f: + with open(cls.path_token, "w+") as f: f.write(token) @classmethod @@ -210,7 +202,7 @@ class HfFolder: Get token or None if not existent. """ try: - with open(cls.path_token, 'r') as f: + with open(cls.path_token, "r") as f: return f.read() except: # this is too wide. When Py2 is dead use: diff --git a/transformers/modelcard.py b/transformers/modelcard.py index 4a879235ae..e6b1982e96 100644 --- a/transformers/modelcard.py +++ b/transformers/modelcard.py @@ -14,8 +14,7 @@ # limitations under the License. """ Configuration base class and utilities.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import copy import json @@ -25,8 +24,15 @@ from io import open from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, WEIGHTS_NAME, TF2_WEIGHTS_NAME, \ - cached_path, is_remote_url, hf_bucket_url +from .file_utils import ( + CONFIG_NAME, + MODEL_CARD_NAME, + WEIGHTS_NAME, + TF2_WEIGHTS_NAME, + cached_path, + is_remote_url, + hf_bucket_url, +) logger = logging.getLogger(__name__) @@ -48,17 +54,18 @@ class ModelCard(object): Parameters: """ + def __init__(self, **kwargs): # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers) - self.model_details = kwargs.pop('model_details', {}) - self.intended_use = kwargs.pop('intended_use', {}) - self.factors = kwargs.pop('factors', {}) - self.metrics = kwargs.pop('metrics', {}) - self.evaluation_data = kwargs.pop('evaluation_data', {}) - self.training_data = kwargs.pop('training_data', {}) - self.quantitative_analyses = kwargs.pop('quantitative_analyses', {}) - self.ethical_considerations = kwargs.pop('ethical_considerations', {}) - self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {}) + self.model_details = kwargs.pop("model_details", {}) + self.intended_use = kwargs.pop("intended_use", {}) + self.factors = kwargs.pop("factors", {}) + self.metrics = kwargs.pop("metrics", {}) + self.evaluation_data = kwargs.pop("evaluation_data", {}) + self.training_data = kwargs.pop("training_data", {}) + self.quantitative_analyses = kwargs.pop("quantitative_analyses", {}) + self.ethical_considerations = kwargs.pop("ethical_considerations", {}) + self.caveats_and_recommendations = kwargs.pop("caveats_and_recommendations", {}) # Open additional attributes for key, value in kwargs.items(): @@ -122,10 +129,10 @@ class ModelCard(object): modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False) """ - cache_dir = kwargs.pop('cache_dir', None) - proxies = kwargs.pop('proxies', None) - find_from_standard_name = kwargs.pop('find_from_standard_name', True) - return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) + cache_dir = kwargs.pop("cache_dir", None) + proxies = kwargs.pop("proxies", None) + find_from_standard_name = kwargs.pop("find_from_standard_name", True) + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: # For simplicity we use the same pretrained url than the configuration files @@ -145,36 +152,43 @@ class ModelCard(object): try: # Load from URL or cache if already cached - resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=True, - proxies=proxies, resume_download=False) + resolved_model_card_file = cached_path( + model_card_file, cache_dir=cache_dir, force_download=True, proxies=proxies, resume_download=False + ) if resolved_model_card_file == model_card_file: logger.info("loading model card file {}".format(model_card_file)) else: - logger.info("loading model card file {} from cache at {}".format( - model_card_file, resolved_model_card_file)) + logger.info( + "loading model card file {} from cache at {}".format(model_card_file, resolved_model_card_file) + ) # Load model card modelcard = cls.from_json_file(resolved_model_card_file) except EnvironmentError: if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: - logger.warning("Couldn't reach server at '{}' to download model card file.".format( - model_card_file)) + logger.warning("Couldn't reach server at '{}' to download model card file.".format(model_card_file)) else: - logger.warning("Model name '{}' was not found in model name list ({}). " \ - "We assumed '{}' was a path or url to a model card file named {} or " \ - "a directory containing such a file but couldn't find any such file at this path or url.".format( + logger.warning( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url to a model card file named {} or " + "a directory containing such a file but couldn't find any such file at this path or url.".format( pretrained_model_name_or_path, - ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), - model_card_file, MODEL_CARD_NAME)) + ", ".join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), + model_card_file, + MODEL_CARD_NAME, + ) + ) logger.warning("Creating an empty model card.") # We fall back on creating an empty model card modelcard = cls() except json.JSONDecodeError: - logger.warning("Couldn't reach server at '{}' to download model card file or " - "model card file is not a valid JSON file. " - "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file)) + logger.warning( + "Couldn't reach server at '{}' to download model card file or " + "model card file is not a valid JSON file. " + "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file) + ) logger.warning("Creating an empty model card.") # We fall back on creating an empty model card @@ -203,7 +217,7 @@ class ModelCard(object): @classmethod def from_json_file(cls, json_file): """Constructs a `ModelCard` from a json file of parameters.""" - with open(json_file, "r", encoding='utf-8') as reader: + with open(json_file, "r", encoding="utf-8") as reader: text = reader.read() dict_obj = json.loads(text) return cls(**dict_obj) @@ -225,5 +239,5 @@ class ModelCard(object): def to_json_file(self, json_file_path): """ Save this instance to a json file.""" - with open(json_file_path, "w", encoding='utf-8') as writer: + with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string()) diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py index f833b6d6bf..3d55bcd64d 100644 --- a/transformers/modeling_albert.py +++ b/transformers/modeling_albert.py @@ -1,4 +1,3 @@ - # coding=utf-8 # Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. # @@ -30,14 +29,14 @@ logger = logging.getLogger(__name__) ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin", - 'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin", - 'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin", - 'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin", - 'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin", - 'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin", - 'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin", - 'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin", + "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-pytorch_model.bin", + "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-pytorch_model.bin", + "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-pytorch_model.bin", + "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-pytorch_model.bin", + "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-pytorch_model.bin", + "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-pytorch_model.bin", + "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-pytorch_model.bin", + "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-pytorch_model.bin", } @@ -48,8 +47,10 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -65,7 +66,7 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): for name, array in zip(names, arrays): print(name) - + for name, array in zip(names, arrays): original_name = name @@ -75,10 +76,10 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): # Renaming and simplifying name = name.replace("ffn_1", "ffn") name = name.replace("bert/", "albert/") - name = name.replace("attention_1", "attention") + name = name.replace("attention_1", "attention") name = name.replace("transform/", "") - name = name.replace("LayerNorm_1", "full_layer_layer_norm") - name = name.replace("LayerNorm", "attention/LayerNorm") + name = name.replace("LayerNorm_1", "full_layer_layer_norm") + name = name.replace("LayerNorm", "attention/LayerNorm") name = name.replace("transformer/", "") # The feed forward layer had an 'intermediate' step which has been abstracted away @@ -97,19 +98,19 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): name = name.replace("predictions/attention", "predictions") # Naming was changed to be more explicit - name = name.replace("embeddings/attention", "embeddings") - name = name.replace("inner_group_", "albert_layers/") - name = name.replace("group_", "albert_layer_groups/") + name = name.replace("embeddings/attention", "embeddings") + name = name.replace("inner_group_", "albert_layers/") + name = name.replace("group_", "albert_layer_groups/") # Classifier if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name): name = "classifier/" + name - # No ALBERT model currently handles the next sentence prediction task + # No ALBERT model currently handles the next sentence prediction task if "seq_relationship" in name: continue - name = name.split('/') + name = name.split("/") # Ignore the gradients applied by the LAMB/ADAM optimizers. if "adam_m" in name or "adam_v" in name or "global_step" in name: @@ -118,19 +119,19 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): pointer = model for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + l = re.split(r"_(\d+)", m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': - pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - elif l[0] == 'squad': - pointer = getattr(pointer, 'classifier') + if l[0] == "kernel" or l[0] == "gamma": + pointer = getattr(pointer, "weight") + elif l[0] == "output_bias" or l[0] == "beta": + pointer = getattr(pointer, "bias") + elif l[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif l[0] == "squad": + pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, l[0]) @@ -141,9 +142,9 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path): num = int(l[1]) pointer = pointer[num] - if m_name[-11:] == '_embeddings': - pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape @@ -160,6 +161,7 @@ class AlbertEmbeddings(BertEmbeddings): """ Construct the embeddings from word, position and token_type embeddings. """ + def __init__(self, config): super(AlbertEmbeddings, self).__init__(config) @@ -175,7 +177,7 @@ class AlbertAttention(BertSelfAttention): self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads - self.hidden_size = config.hidden_size + self.hidden_size = config.hidden_size self.attention_head_size = config.hidden_size // config.num_attention_heads self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -237,10 +239,13 @@ class AlbertAttention(BertSelfAttention): context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) reshaped_context_layer = context_layer.view(*new_context_layer_shape) - # Should find a better way to do this - w = self.dense.weight.t().view(self.num_attention_heads, self.attention_head_size, self.hidden_size).to(context_layer.dtype) + w = ( + self.dense.weight.t() + .view(self.num_attention_heads, self.attention_head_size, self.hidden_size) + .to(context_layer.dtype) + ) b = self.dense.bias.to(context_layer.dtype) projected_context_layer = torch.einsum("bfnd,ndh->bfh", context_layer, w) + b @@ -252,11 +257,11 @@ class AlbertAttention(BertSelfAttention): class AlbertLayer(nn.Module): def __init__(self, config): super(AlbertLayer, self).__init__() - + self.config = config self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.attention = AlbertAttention(config) - self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) + self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) self.activation = ACT2FN[config.hidden_act] @@ -273,7 +278,7 @@ class AlbertLayer(nn.Module): class AlbertLayerGroup(nn.Module): def __init__(self, config): super(AlbertLayerGroup, self).__init__() - + self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)]) @@ -303,7 +308,7 @@ class AlbertLayerGroup(nn.Module): class AlbertTransformer(nn.Module): def __init__(self, config): super(AlbertTransformer, self).__init__() - + self.config = config self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -327,8 +332,12 @@ class AlbertTransformer(nn.Module): # Index of the layer inside the group layer_idx = int(i - group_idx * layers_per_group) - - layer_group_output = self.albert_layer_groups[group_idx](hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group]) + + layer_group_output = self.albert_layer_groups[group_idx]( + hidden_states, + attention_mask, + head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], + ) hidden_states = layer_group_output[0] if self.output_attentions: @@ -337,7 +346,6 @@ class AlbertTransformer(nn.Module): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - outputs = (hidden_states,) if self.output_hidden_states: outputs = outputs + (all_hidden_states,) @@ -346,11 +354,11 @@ class AlbertTransformer(nn.Module): return outputs # last-layer hidden state, (all hidden states), (all attentions) - class AlbertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = AlbertConfig pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "albert" @@ -431,8 +439,12 @@ ALBERT_INPUTS_DOCSTRING = r""" ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ -@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.", - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.", + ALBERT_START_DOCSTRING, + ALBERT_INPUTS_DOCSTRING, +) class AlbertModel(AlbertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -500,8 +512,15 @@ class AlbertModel(AlbertPreTrainedModel): inner_group_idx = int(layer - group_idx * self.config.inner_group_num) self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads) - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, - inputs_embeds=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -520,31 +539,37 @@ class AlbertModel(AlbertPreTrainedModel): token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 if head_mask is not None: if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers - embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds) - encoder_outputs = self.encoder(embedding_output, - extended_attention_mask, - head_mask=head_mask) + embedding_output = self.embeddings( + input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask) sequence_output = encoder_outputs[0] pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) - outputs = (sequence_output, pooled_output) + encoder_outputs[1:] # add hidden_states and attentions if they are here + outputs = (sequence_output, pooled_output) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here return outputs + class AlbertMLMHead(nn.Module): def __init__(self, config): super(AlbertMLMHead, self).__init__() @@ -566,7 +591,9 @@ class AlbertMLMHead(nn.Module): return prediction_scores -@add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) +@add_start_docstrings( + "Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING +) class AlbertForMaskedLM(AlbertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -602,21 +629,28 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): """ Make sure we are sharing the input and output embeddings. Export to TorchScript can't handle parameter sharing so we are cloning them instead. """ - self._tie_or_clone_weights(self.predictions.decoder, - self.albert.embeddings.word_embeddings) + self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings) def get_output_embeddings(self): return self.predictions.decoder - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + ): outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, - inputs_embeds=inputs_embeds + inputs_embeds=inputs_embeds, ) sequence_outputs = outputs[0] @@ -631,9 +665,12 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): return outputs -@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) + ALBERT_START_DOCSTRING, + ALBERT_INPUTS_DOCSTRING, +) class AlbertForSequenceClassification(AlbertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -665,6 +702,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(AlbertForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -675,8 +713,16 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): outputs = self.albert( input_ids=input_ids, @@ -684,7 +730,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, - inputs_embeds=inputs_embeds + inputs_embeds=inputs_embeds, ) pooled_output = outputs[1] @@ -707,10 +753,12 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) - -@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) + ALBERT_START_DOCSTRING, + ALBERT_INPUTS_DOCSTRING, +) class AlbertForQuestionAnswering(AlbertPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -752,6 +800,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): """ + def __init__(self, config): super(AlbertForQuestionAnswering, self).__init__(config) self.num_labels = config.num_labels @@ -761,8 +810,17 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, - inputs_embeds=None, start_positions=None, end_positions=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): outputs = self.albert( input_ids=input_ids, @@ -770,7 +828,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, - inputs_embeds=inputs_embeds + inputs_embeds=inputs_embeds, ) sequence_output = outputs[0] diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index 6b49efd378..31e9ee6bd2 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -18,31 +18,87 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig, - DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, - TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig) +from .configuration_auto import ( + AlbertConfig, + BertConfig, + CamembertConfig, + CTRLConfig, + DistilBertConfig, + GPT2Config, + OpenAIGPTConfig, + RobertaConfig, + TransfoXLConfig, + XLMConfig, + XLNetConfig, + XLMRobertaConfig, +) -from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \ - BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_bert import ( + BertModel, + BertForMaskedLM, + BertForSequenceClassification, + BertForQuestionAnswering, + BertForTokenClassification, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \ - XLNetForTokenClassification, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, \ - XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \ - RobertaForTokenClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \ - DistilBertForSequenceClassification, DistilBertForTokenClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \ - CamembertForMultipleChoice, CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \ - AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_xlnet import ( + XLNetModel, + XLNetLMHeadModel, + XLNetForSequenceClassification, + XLNetForQuestionAnswering, + XLNetForTokenClassification, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_xlm import ( + XLMModel, + XLMWithLMHeadModel, + XLMForSequenceClassification, + XLMForQuestionAnswering, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_roberta import ( + RobertaModel, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaForTokenClassification, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_distilbert import ( + DistilBertModel, + DistilBertForQuestionAnswering, + DistilBertForMaskedLM, + DistilBertForSequenceClassification, + DistilBertForTokenClassification, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_camembert import ( + CamembertModel, + CamembertForMaskedLM, + CamembertForSequenceClassification, + CamembertForMultipleChoice, + CamembertForTokenClassification, + CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_albert import ( + AlbertModel, + AlbertForMaskedLM, + AlbertForSequenceClassification, + AlbertForQuestionAnswering, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \ - XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_xlm_roberta import ( + XLMRobertaModel, + XLMRobertaForMaskedLM, + XLMRobertaForSequenceClassification, + XLMRobertaForMultipleChoice, + XLMRobertaForTokenClassification, + XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_utils import PreTrainedModel, SequenceSummary @@ -51,7 +107,8 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) +ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict( + (key, value) for pretrained_map in [ BERT_PRETRAINED_MODEL_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, @@ -66,8 +123,9 @@ ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, - ] - for key, value, in pretrained_map.items()) + ] + for key, value, in pretrained_map.items() +) class AutoModel(object): @@ -98,10 +156,13 @@ class AutoModel(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("AutoModel is designed to be instantiated " + raise EnvironmentError( + "AutoModel is designed to be instantiated " "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModel.from_config(config)` methods.") + "`AutoModel.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -232,35 +293,39 @@ class AutoModel(object): model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'camembert' in pretrained_model_name_or_path: + elif "camembert" in pretrained_model_name_or_path: return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: + elif "xlm-roberta" in pretrained_model_name_or_path: return XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format( + pretrained_model_name_or_path + ) + ) class AutoModelWithLMHead(object): @@ -291,10 +356,13 @@ class AutoModelWithLMHead(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " + raise EnvironmentError( + "AutoModelWithLMHead is designed to be instantiated " "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelWithLMHead.from_config(config)` methods.") + "`AutoModelWithLMHead.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -423,35 +491,39 @@ class AutoModelWithLMHead(object): model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'camembert' in pretrained_model_name_or_path: + elif "camembert" in pretrained_model_name_or_path: return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: + elif "xlm-roberta" in pretrained_model_name_or_path: return XLMRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format( + pretrained_model_name_or_path + ) + ) class AutoModelForSequenceClassification(object): @@ -477,10 +549,13 @@ class AutoModelForSequenceClassification(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated " + raise EnvironmentError( + "AutoModelForSequenceClassification is designed to be instantiated " "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelForSequenceClassification.from_config(config)` methods.") + "`AutoModelForSequenceClassification.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -597,25 +672,39 @@ class AutoModelForSequenceClassification(object): model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: - return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: - return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'camembert' in pretrained_model_name_or_path: - return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: - return XLMRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: - return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + if "distilbert" in pretrained_model_name_or_path: + return DistilBertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "albert" in pretrained_model_name_or_path: + return AlbertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "camembert" in pretrained_model_name_or_path: + return CamembertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "xlm-roberta" in pretrained_model_name_or_path: + return XLMRobertaForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "roberta" in pretrained_model_name_or_path: + return RobertaForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "bert" in pretrained_model_name_or_path: return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format( + pretrained_model_name_or_path + ) + ) class AutoModelForQuestionAnswering(object): @@ -638,10 +727,13 @@ class AutoModelForQuestionAnswering(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated " + raise EnvironmentError( + "AutoModelForQuestionAnswering is designed to be instantiated " "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelForQuestionAnswering.from_config(config)` methods.") + "`AutoModelForQuestionAnswering.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -745,26 +837,30 @@ class AutoModelForQuestionAnswering(object): model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: + if "distilbert" in pretrained_model_name_or_path: return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return AlbertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path) + ) class AutoModelForTokenClassification: def __init__(self): - raise EnvironmentError("AutoModelForTokenClassification is designed to be instantiated " - "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelForTokenClassification.from_config(config)` methods.") + raise EnvironmentError( + "AutoModelForTokenClassification is designed to be instantiated " + "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " + "`AutoModelForTokenClassification.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -797,7 +893,7 @@ class AutoModelForTokenClassification: elif isinstance(config, XLMRobertaConfig): return XLMRobertaForTokenClassification(config) raise ValueError("Unrecognized configuration class {}".format(config)) - + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): r""" Instantiates one of the question answering model classes of the library @@ -870,18 +966,28 @@ class AutoModelForTokenClassification: model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'camembert' in pretrained_model_name_or_path: - return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: - return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: - return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + if "camembert" in pretrained_model_name_or_path: + return CamembertForTokenClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "distilbert" in pretrained_model_name_or_path: + return DistilBertForTokenClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "xlm-roberta" in pretrained_model_name_or_path: + return XLMRobertaForTokenClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "roberta" in pretrained_model_name_or_path: return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format( + pretrained_model_name_or_path + ) + ) diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index ca07a81aea..0994e832de 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -33,27 +33,27 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", - 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", - 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", - 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", - 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", - 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", - 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", + "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", } @@ -65,8 +65,10 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -81,7 +83,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): arrays.append(array) for name, array in zip(names, arrays): - name = name.split('/') + name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m", "global_step"] for n in name): @@ -89,18 +91,18 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): continue pointer = model for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + l = re.split(r"_(\d+)", m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': - pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - elif l[0] == 'squad': - pointer = getattr(pointer, 'classifier') + if l[0] == "kernel" or l[0] == "gamma": + pointer = getattr(pointer, "weight") + elif l[0] == "output_bias" or l[0] == "beta": + pointer = getattr(pointer, "bias") + elif l[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif l[0] == "squad": + pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, l[0]) @@ -110,9 +112,9 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): if len(l) >= 2: num = int(l[1]) pointer = pointer[num] - if m_name[-11:] == '_embeddings': - pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": array = np.transpose(array) try: assert pointer.shape == array.shape @@ -157,6 +159,7 @@ BertLayerNorm = torch.nn.LayerNorm class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. """ + def __init__(self, config): super(BertEmbeddings, self).__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) @@ -199,7 +202,8 @@ class BertSelfAttention(nn.Module): if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads @@ -217,7 +221,14 @@ class BertSelfAttention(nn.Module): x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) - def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None): + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys @@ -307,8 +318,17 @@ class BertAttention(nn.Module): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None): - self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask) + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): + self_outputs = self.self( + hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask + ) attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -353,13 +373,22 @@ class BertLayer(nn.Module): self.intermediate = BertIntermediate(config) self.output = BertOutput(config) - def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None): + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) attention_output = self_attention_outputs[0] outputs = self_attention_outputs[1:] # add self attentions if we output attention weights if self.is_decoder and encoder_hidden_states is not None: - cross_attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask) + cross_attention_outputs = self.crossattention( + attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask + ) attention_output = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights @@ -376,14 +405,23 @@ class BertEncoder(nn.Module): self.output_hidden_states = config.output_hidden_states self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) - def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None): + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): all_hidden_states = () all_attentions = () for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask) + layer_outputs = layer_module( + hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask + ) hidden_states = layer_outputs[0] if self.output_attentions: @@ -440,9 +478,7 @@ class BertLMPredictionHead(nn.Module): # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. - self.decoder = nn.Linear(config.hidden_size, - config.vocab_size, - bias=False) + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) @@ -488,6 +524,7 @@ class BertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = BertConfig pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_bert @@ -581,8 +618,12 @@ BERT_INPUTS_DOCSTRING = r""" ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. """ -@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertModel(BertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -612,6 +653,7 @@ class BertModel(BertPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(BertModel, self).__init__(config) self.config = config @@ -636,8 +678,17 @@ class BertModel(BertPreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, - head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): """ Forward pass on the Model. The model can behave as an encoder (with only self-attention) as well @@ -681,12 +732,18 @@ class BertModel(BertPreTrainedModel): batch_size, seq_length = input_shape seq_ids = torch.arange(seq_length, device=device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] - causal_mask = causal_mask.to(torch.long) # not converting to long will cause errors with pytorch version < 1.3 + causal_mask = causal_mask.to( + torch.long + ) # not converting to long will cause errors with pytorch version < 1.3 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] else: extended_attention_mask = attention_mask[:, None, None, :] else: - raise ValueError("Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(input_shape, attention_mask.shape)) + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for @@ -709,10 +766,15 @@ class BertModel(BertPreTrainedModel): elif encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] else: - raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape, - encoder_attention_mask.shape)) + raise ValueError( + "Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format( + encoder_hidden_shape, encoder_attention_mask.shape + ) + ) - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + encoder_extended_attention_mask = encoder_extended_attention_mask.to( + dtype=next(self.parameters()).dtype + ) # fp16 compatibility encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 else: encoder_extended_attention_mask = None @@ -727,28 +789,40 @@ class BertModel(BertPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers - embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds) - encoder_outputs = self.encoder(embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask) + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + outputs = (sequence_output, pooled_output,) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training: +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForPreTraining(BertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -786,6 +860,7 @@ class BertForPreTraining(BertPreTrainedModel): prediction_scores, seq_relationship_scores = outputs[:2] """ + def __init__(self, config): super(BertForPreTraining, self).__init__(config) @@ -797,20 +872,33 @@ class BertForPreTraining(BertPreTrainedModel): def get_output_embeddings(self): return self.cls.predictions.decoder - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None, next_sentence_label=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + next_sentence_label=None, + ): - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + outputs = (prediction_scores, seq_relationship_score,) + outputs[ + 2: + ] # add hidden states and attention if they are here if masked_lm_labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss() @@ -822,9 +910,9 @@ class BertForPreTraining(BertPreTrainedModel): return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING +) class BertForMaskedLM(BertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -862,6 +950,7 @@ class BertForMaskedLM(BertPreTrainedModel): loss, prediction_scores = outputs[:2] """ + def __init__(self, config): super(BertForMaskedLM, self).__init__(config) @@ -873,17 +962,30 @@ class BertForMaskedLM(BertPreTrainedModel): def get_output_embeddings(self): return self.cls.predictions.decoder - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + lm_labels=None, + ): - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask) + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) @@ -912,9 +1014,11 @@ class BertForMaskedLM(BertPreTrainedModel): return outputs # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForNextSentencePrediction(BertPreTrainedModel): r""" **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -945,6 +1049,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): seq_relationship_scores = outputs[0] """ + def __init__(self, config): super(BertForNextSentencePrediction, self).__init__(config) @@ -953,15 +1058,25 @@ class BertForNextSentencePrediction(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - next_sentence_label=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + next_sentence_label=None, + ): - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) pooled_output = outputs[1] @@ -976,10 +1091,12 @@ class BertForNextSentencePrediction(BertPreTrainedModel): return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForSequenceClassification(BertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1011,6 +1128,7 @@ class BertForSequenceClassification(BertPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(BertForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -1021,15 +1139,25 @@ class BertForSequenceClassification(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) pooled_output = outputs[1] @@ -1051,10 +1179,12 @@ class BertForSequenceClassification(BertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForMultipleChoice(BertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1087,6 +1217,7 @@ class BertForMultipleChoice(BertPreTrainedModel): loss, classification_scores = outputs[:2] """ + def __init__(self, config): super(BertForMultipleChoice, self).__init__(config) @@ -1096,8 +1227,16 @@ class BertForMultipleChoice(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): num_choices = input_ids.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) @@ -1105,12 +1244,14 @@ class BertForMultipleChoice(BertPreTrainedModel): token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) pooled_output = outputs[1] @@ -1128,10 +1269,12 @@ class BertForMultipleChoice(BertPreTrainedModel): return outputs # (loss), reshaped_logits, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForTokenClassification(BertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -1161,6 +1304,7 @@ class BertForTokenClassification(BertPreTrainedModel): loss, scores = outputs[:2] """ + def __init__(self, config): super(BertForTokenClassification, self).__init__(config) self.num_labels = config.num_labels @@ -1171,15 +1315,25 @@ class BertForTokenClassification(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] @@ -1202,10 +1356,12 @@ class BertForTokenClassification(BertPreTrainedModel): return outputs # (loss), scores, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - BERT_START_DOCSTRING, - BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class BertForQuestionAnswering(BertPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1247,6 +1403,7 @@ class BertForQuestionAnswering(BertPreTrainedModel): """ + def __init__(self, config): super(BertForQuestionAnswering, self).__init__(config) self.num_labels = config.num_labels @@ -1256,15 +1413,26 @@ class BertForQuestionAnswering(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - start_positions=None, end_positions=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): - outputs = self.bert(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] diff --git a/transformers/modeling_camembert.py b/transformers/modeling_camembert.py index 1b808bfd82..2a7a7a7332 100644 --- a/transformers/modeling_camembert.py +++ b/transformers/modeling_camembert.py @@ -15,19 +15,24 @@ # limitations under the License. """PyTorch CamemBERT model. """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging -from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification +from .modeling_roberta import ( + RobertaModel, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaForMultipleChoice, + RobertaForTokenClassification, +) from .configuration_camembert import CamembertConfig from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin", + "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-pytorch_model.bin", } @@ -100,8 +105,12 @@ CAMEMBERT_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", - CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", + CAMEMBERT_START_DOCSTRING, + CAMEMBERT_INPUTS_DOCSTRING, +) class CamembertModel(RobertaModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -149,8 +158,11 @@ class CamembertModel(RobertaModel): pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""CamemBERT Model with a `language modeling` head on top. """, - CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """CamemBERT Model with a `language modeling` head on top. """, + CAMEMBERT_START_DOCSTRING, + CAMEMBERT_INPUTS_DOCSTRING, +) class CamembertForMaskedLM(RobertaForMaskedLM): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -185,9 +197,12 @@ class CamembertForMaskedLM(RobertaForMaskedLM): pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer +@add_start_docstrings( + """CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) + CAMEMBERT_START_DOCSTRING, + CAMEMBERT_INPUTS_DOCSTRING, +) class CamembertForSequenceClassification(RobertaForSequenceClassification): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -223,9 +238,12 @@ class CamembertForSequenceClassification(RobertaForSequenceClassification): pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""CamemBERT Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) + CAMEMBERT_START_DOCSTRING, + CAMEMBERT_INPUTS_DOCSTRING, +) class CamembertForMultipleChoice(RobertaForMultipleChoice): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -257,9 +275,12 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice): pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""CamemBERT Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) + CAMEMBERT_START_DOCSTRING, + CAMEMBERT_INPUTS_DOCSTRING, +) class CamembertForTokenClassification(RobertaForTokenClassification): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py index fabb79efd8..37c15cf54f 100644 --- a/transformers/modeling_ctrl.py +++ b/transformers/modeling_ctrl.py @@ -40,14 +40,17 @@ CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf- def angle_defn(pos, i, d_model_size): - angle_rates = 1 / torch.pow(10000, (2 * (i//2)) / d_model_size) + angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size) return pos * angle_rates + def positional_encoding(position, d_model_size, dtype): # create the sinusoidal pattern for the positional encoding - angle_rads = (angle_defn(torch.arange(position, dtype=dtype).unsqueeze(1), - torch.arange(d_model_size, dtype=dtype).unsqueeze(0), - d_model_size)) + angle_rads = angle_defn( + torch.arange(position, dtype=dtype).unsqueeze(1), + torch.arange(d_model_size, dtype=dtype).unsqueeze(0), + d_model_size, + ) sines = torch.sin(angle_rads[:, 0::2]) cosines = torch.cos(angle_rads[:, 1::2]) @@ -55,22 +58,23 @@ def positional_encoding(position, d_model_size, dtype): pos_encoding = torch.cat([sines, cosines], dim=-1) return pos_encoding + def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): # calculate attention - matmul_qk = torch.matmul(q, k.permute(0,1,3,2)) + matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2)) dk = k.shape[-1] scaled_attention_logits = matmul_qk / np.sqrt(dk) if mask is not None: nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1) - scaled_attention_logits += (mask[ns-nd:ns, :ns] * -1e4) + scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4 if attention_mask is not None: # Apply the attention mask scaled_attention_logits = scaled_attention_logits + attention_mask - attention_weights = torch.softmax(scaled_attention_logits, dim=-1) + attention_weights = torch.softmax(scaled_attention_logits, dim=-1) # Mask heads if we want to if head_mask is not None: @@ -128,11 +132,8 @@ class MultiHeadAttention(torch.nn.Module): return outputs - def point_wise_feed_forward_network(d_model_size, dff): - return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), - torch.nn.ReLU(), - torch.nn.Linear(dff, d_model_size)) + return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), torch.nn.ReLU(), torch.nn.Linear(dff, d_model_size)) class EncoderLayer(torch.nn.Module): @@ -150,10 +151,9 @@ class EncoderLayer(torch.nn.Module): def forward(self, x, mask, layer_past=None, attention_mask=None, head_mask=None): normed = self.layernorm1(x) - attn_outputs = self.multi_head_attention(normed, normed, normed, mask, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask) + attn_outputs = self.multi_head_attention( + normed, normed, normed, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask + ) attn_output = attn_outputs[0] attn_output = self.dropout1(attn_output) out1 = x + attn_output @@ -171,6 +171,7 @@ class CTRLPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = CTRLConfig pretrained_model_archive_map = CTRL_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -244,8 +245,12 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", - CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", + CTRL_START_DOCSTRING, + CTRL_INPUTS_DOCSTRING, +) class CTRLModel(CTRLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -273,6 +278,7 @@ class CTRLModel(CTRLPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(CTRLModel, self).__init__(config) self.output_hidden_states = config.output_hidden_states @@ -287,11 +293,12 @@ class CTRLModel(CTRLPreTrainedModel): self.w = nn.Embedding(config.vocab_size, config.n_embd) self.dropout = nn.Dropout(config.embd_pdrop) - self.h = nn.ModuleList([EncoderLayer(config.n_embd, - config.n_head, - config.dff, - config.resid_pdrop, - config.output_attentions) for _ in range(config.n_layer)]) + self.h = nn.ModuleList( + [ + EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, config.output_attentions) + for _ in range(config.n_layer) + ] + ) self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.init_weights() @@ -309,7 +316,16 @@ class CTRLModel(CTRLPreTrainedModel): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) - def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -345,7 +361,7 @@ class CTRLModel(CTRLPreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed @@ -357,8 +373,12 @@ class CTRLModel(CTRLPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.n_layer @@ -391,11 +411,9 @@ class CTRLModel(CTRLPreTrainedModel): for i, (h, layer_past) in enumerate(zip(self.h, past)): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) - outputs = h(hidden_states, - mask, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask[i]) + outputs = h( + hidden_states, mask, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i] + ) hidden_states, present = outputs[:2] if self.output_past: presents = presents + (present,) @@ -421,8 +439,12 @@ class CTRLModel(CTRLPreTrainedModel): return outputs -@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING) +@add_start_docstrings( + """The CTRL Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, + CTRL_START_DOCSTRING, + CTRL_INPUTS_DOCSTRING, +) class CTRLLMHeadModel(CTRLPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -463,6 +485,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(CTRLLMHeadModel, self).__init__(config) self.transformer = CTRLModel(config) @@ -473,15 +496,26 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - labels=None): - transformer_outputs = self.transformer(input_ids, - past=past, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] @@ -495,8 +529,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index 7098529c9e..7345c23651 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -37,14 +37,15 @@ from .configuration_distilbert import DistilBertConfig from .file_utils import add_start_docstrings import logging + logger = logging.getLogger(__name__) DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin", - 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin", - 'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin", + "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin", + "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin", + "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-pytorch_model.bin", + "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-pytorch_model.bin", } @@ -52,26 +53,24 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { def gelu(x): return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0))) + def create_sinusoidal_embeddings(n_pos, dim, out): - position_enc = np.array([ - [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] - for pos in range(n_pos) - ]) + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() out.requires_grad = False + class Embeddings(nn.Module): - def __init__(self, - config): + def __init__(self, config): super(Embeddings, self).__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: - create_sinusoidal_embeddings(n_pos=config.max_position_embeddings, - dim=config.dim, - out=self.position_embeddings.weight) + create_sinusoidal_embeddings( + n_pos=config.max_position_embeddings, dim=config.dim, out=self.position_embeddings.weight + ) self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) self.dropout = nn.Dropout(config.dropout) @@ -89,17 +88,18 @@ class Embeddings(nn.Module): The embedded tokens (plus position embeddings, no token_type embeddings) """ seq_length = input_ids.size(1) - position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) - position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) - word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) - position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) + word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) + position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) - embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) - embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) + embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) + embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) return embeddings + class MultiHeadSelfAttention(nn.Module): def __init__(self, config): super(MultiHeadSelfAttention, self).__init__() @@ -139,7 +139,7 @@ class MultiHeadSelfAttention(nn.Module): self.dim = attention_head_size * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) - def forward(self, query, key, value, mask, head_mask = None): + def forward(self, query, key, value, mask, head_mask=None): """ Parameters ---------- @@ -172,39 +172,42 @@ class MultiHeadSelfAttention(nn.Module): """ group heads """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) - q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) - k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) - v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) - q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) - scores = torch.matmul(q, k.transpose(2,3)) # (bs, n_heads, q_length, k_length) - mask = (mask==0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) - scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, q_length, k_length) + mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) + scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, q_length, k_length) - weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) - weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) + weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) + weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) - context = unshape(context) # (bs, q_length, dim) - context = self.out_lin(context) # (bs, q_length, dim) + context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) if self.output_attentions: return (context, weights) else: return (context,) + class FFN(nn.Module): def __init__(self, config): super(FFN, self).__init__() self.dropout = nn.Dropout(p=config.dropout) self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) - assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation) - self.activation = gelu if config.activation == 'gelu' else nn.ReLU() + assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format( + config.activation + ) + self.activation = gelu if config.activation == "gelu" else nn.ReLU() def forward(self, input): x = self.lin1(input) @@ -213,6 +216,7 @@ class FFN(nn.Module): x = self.dropout(x) return x + class TransformerBlock(nn.Module): def __init__(self, config): super(TransformerBlock, self).__init__() @@ -249,14 +253,14 @@ class TransformerBlock(nn.Module): # Self-Attention sa_output = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask) if self.output_attentions: - sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) - else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples assert type(sa_output) == tuple sa_output = sa_output[0] - sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network - ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) + ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) @@ -303,9 +307,7 @@ class Transformer(nn.Module): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) - layer_outputs = layer_module(x=hidden_state, - attn_mask=attn_mask, - head_mask=head_mask[i]) + layer_outputs = layer_module(x=hidden_state, attn_mask=attn_mask, head_mask=head_mask[i]) hidden_state = layer_outputs[-1] if self.output_attentions: @@ -332,6 +334,7 @@ class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = DistilBertConfig pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = None @@ -396,8 +399,12 @@ DISTILBERT_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class DistilBertModel(DistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -420,11 +427,12 @@ class DistilBertModel(DistilBertPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(DistilBertModel, self).__init__(config) - self.embeddings = Embeddings(config) # Embeddings - self.transformer = Transformer(config) # Encoder + self.embeddings = Embeddings(config) # Embeddings + self.transformer = Transformer(config) # Encoder self.init_weights() @@ -442,8 +450,7 @@ class DistilBertModel(DistilBertPreTrainedModel): for layer, heads in heads_to_prune.items(): self.transformer.layer[layer].attention.prune_heads(heads) - def forward(self, - input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None): + def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -456,7 +463,7 @@ class DistilBertModel(DistilBertPreTrainedModel): device = input_ids.device if input_ids is not None else inputs_embeds.device if attention_mask is None: - attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) + attention_mask = torch.ones(input_shape, device=device) # (bs, seq_length) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -468,24 +475,29 @@ class DistilBertModel(DistilBertPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers if inputs_embeds is None: - inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) - tfmr_output = self.transformer(x=inputs_embeds, - attn_mask=attention_mask, - head_mask=head_mask) + inputs_embeds = self.embeddings(input_ids) # (bs, seq_length, dim) + tfmr_output = self.transformer(x=inputs_embeds, attn_mask=attention_mask, head_mask=head_mask) hidden_state = tfmr_output[0] - output = (hidden_state, ) + tfmr_output[1:] + output = (hidden_state,) + tfmr_output[1:] - return output # last-layer hidden-state, (all hidden_states), (all attentions) + return output # last-layer hidden-state, (all hidden_states), (all attentions) -@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """, - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """DistilBert Model with a `masked language modeling` head on top. """, + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class DistilBertForMaskedLM(DistilBertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -516,6 +528,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): loss, prediction_scores = outputs[:2] """ + def __init__(self, config): super(DistilBertForMaskedLM, self).__init__(config) self.output_attentions = config.output_attentions @@ -534,28 +547,31 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): return self.vocab_projector def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None): - dlbrt_output = self.distilbert(input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) - hidden_states = dlbrt_output[0] # (bs, seq_length, dim) - prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) - prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) - prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) + dlbrt_output = self.distilbert( + input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds + ) + hidden_states = dlbrt_output[0] # (bs, seq_length, dim) + prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) + prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) - outputs = (prediction_logits, ) + dlbrt_output[1:] + outputs = (prediction_logits,) + dlbrt_output[1:] if masked_lm_labels is not None: - mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), - masked_lm_labels.view(-1)) - outputs = (mlm_loss,) + outputs + mlm_loss = self.mlm_loss_fct( + prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1) + ) + outputs = (mlm_loss,) + outputs - return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) + return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) -@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class DistilBertForSequenceClassification(DistilBertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -587,6 +603,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(DistilBertForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -599,16 +616,15 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): self.init_weights() def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None): - distilbert_output = self.distilbert(input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) - hidden_state = distilbert_output[0] # (bs, seq_len, dim) - pooled_output = hidden_state[:, 0] # (bs, dim) - pooled_output = self.pre_classifier(pooled_output) # (bs, dim) - pooled_output = nn.ReLU()(pooled_output) # (bs, dim) - pooled_output = self.dropout(pooled_output) # (bs, dim) - logits = self.classifier(pooled_output) # (bs, dim) + distilbert_output = self.distilbert( + input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds + ) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = nn.ReLU()(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output) # (bs, dim) + logits = self.classifier(pooled_output) # (bs, dim) outputs = (logits,) + distilbert_output[1:] if labels is not None: @@ -623,9 +639,12 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) -@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -663,6 +682,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): loss, start_scores, end_scores = outputs[:3] """ + def __init__(self, config): super(DistilBertForQuestionAnswering, self).__init__(config) @@ -672,19 +692,26 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): self.dropout = nn.Dropout(config.qa_dropout) self.init_weights() - - def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None): - distilbert_output = self.distilbert(input_ids=input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) - hidden_states = distilbert_output[0] # (bs, max_query_len, dim) - hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) - logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): + distilbert_output = self.distilbert( + input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds + ) + hidden_states = distilbert_output[0] # (bs, max_query_len, dim) + + hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) + logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) # (bs, max_query_len) - end_logits = end_logits.squeeze(-1) # (bs, max_query_len) + start_logits = start_logits.squeeze(-1) # (bs, max_query_len) + end_logits = end_logits.squeeze(-1) # (bs, max_query_len) outputs = (start_logits, end_logits,) + distilbert_output[1:] if start_positions is not None and end_positions is not None: @@ -707,10 +734,12 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) -@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - DISTILBERT_START_DOCSTRING, - DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class DistilBertForTokenClassification(DistilBertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -740,6 +769,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): loss, scores = outputs[:2] """ + def __init__(self, config): super(DistilBertForTokenClassification, self).__init__(config) self.num_labels = config.num_labels @@ -750,13 +780,11 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, head_mask=None, - inputs_embeds=None, labels=None): + def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None): - outputs = self.distilbert(input_ids, - attention_mask=attention_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.distilbert( + input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds + ) sequence_output = outputs[0] diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index ddfebdc393..e5bad422c4 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -145,16 +145,12 @@ class PreTrainedEncoderDecoder(nn.Module): # by the value of the flag `is_decoder` that we need to set correctly. encoder = kwargs_encoder.pop("model", None) if encoder is None: - encoder = AutoModel.from_pretrained( - encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder - ) + encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder) encoder.config.is_decoder = False decoder = kwargs_decoder.pop("model", None) if decoder is None: - decoder = AutoModelWithLMHead.from_pretrained( - decoder_pretrained_model_name_or_path, **kwargs_decoder - ) + decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) decoder.config.is_decoder = True model = cls(encoder, decoder) @@ -168,18 +164,23 @@ class PreTrainedEncoderDecoder(nn.Module): We save the encoder' and decoder's parameters in two separate directories. """ - # If the root output directory does not exist, create it + # If the root output directory does not exist, create it if not os.path.exists(save_directory): os.mkdir(save_directory) # Check whether the output directory is empty or not - sub_directories = [directory for directory in os.listdir(save_directory) - if os.path.isdir(os.path.join(save_directory, directory))] + sub_directories = [ + directory + for directory in os.listdir(save_directory) + if os.path.isdir(os.path.join(save_directory, directory)) + ] if len(sub_directories) > 0: if "encoder" in sub_directories and "decoder" in sub_directories: - print("WARNING: there is an older version of encoder-decoder saved in" +\ - " the output directory. The default behaviour is to overwrite them.") + print( + "WARNING: there is an older version of encoder-decoder saved in" + + " the output directory. The default behaviour is to overwrite them." + ) # Empty the output directory for directory_to_remove in sub_directories: @@ -190,7 +191,7 @@ class PreTrainedEncoderDecoder(nn.Module): # Remove the subdirectory itself os.rmdir(os.path.join(save_directory, directory_to_remove)) - assert(len(os.listdir(save_directory)) == 0) # sanity check + assert len(os.listdir(save_directory)) == 0 # sanity check # Create the "encoder" directory inside the output directory and save the encoder into it if not os.path.exists(os.path.join(save_directory, "encoder")): diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index 3a7561ca58..fe8a973f0b 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -36,11 +36,14 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", - "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin", - "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin", - "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-pytorch_model.bin", - "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",} +GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin", + "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-pytorch_model.bin", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin", +} + def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): """ Load tf checkpoints in a pytorch model @@ -50,8 +53,10 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) raise tf_path = os.path.abspath(gpt2_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -67,20 +72,20 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): for name, array in zip(names, arrays): name = name[6:] # skip "model/" - name = name.split('/') + name = name.split("/") pointer = model for m_name in name: - if re.fullmatch(r'[A-Za-z]+\d+', m_name): - l = re.split(r'(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+\d+", m_name): + l = re.split(r"(\d+)", m_name) else: l = [m_name] - if l[0] == 'w' or l[0] == 'g': - pointer = getattr(pointer, 'weight') - elif l[0] == 'b': - pointer = getattr(pointer, 'bias') - elif l[0] == 'wpe' or l[0] == 'wte': + if l[0] == "w" or l[0] == "g": + pointer = getattr(pointer, "weight") + elif l[0] == "b": + pointer = getattr(pointer, "bias") + elif l[0] == "wpe" or l[0] == "wte": pointer = getattr(pointer, l[0]) - pointer = getattr(pointer, 'weight') + pointer = getattr(pointer, "weight") else: pointer = getattr(pointer, l[0]) if len(l) >= 2: @@ -130,7 +135,7 @@ class Attention(nn.Module): mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() - index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) # Prune conv1d layers self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) @@ -146,7 +151,7 @@ class Attention(nn.Module): if self.scale: w = w / math.sqrt(v.size(-1)) nd, ns = w.size(-2), w.size(-1) - b = self.bias[:, :, ns-nd:ns, :ns] + b = self.bias[:, :, ns - nd : ns, :ns] w = w * b - 1e4 * (1 - b) if attention_mask is not None: @@ -226,10 +231,9 @@ class Block(nn.Module): self.mlp = MLP(4 * nx, config) def forward(self, x, layer_past=None, attention_mask=None, head_mask=None): - output_attn = self.attn(self.ln_1(x), - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask) + output_attn = self.attn( + self.ln_1(x), layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask + ) a = output_attn[0] # output_attn: a, present, (attentions) x = x + a @@ -244,6 +248,7 @@ class GPT2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = GPT2Config pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_gpt2 @@ -321,8 +326,12 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", - GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class GPT2Model(GPT2PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -350,6 +359,7 @@ class GPT2Model(GPT2PreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(GPT2Model, self).__init__(config) self.output_hidden_states = config.output_hidden_states @@ -377,7 +387,16 @@ class GPT2Model(GPT2PreTrainedModel): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) - def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -418,7 +437,7 @@ class GPT2Model(GPT2PreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed @@ -430,8 +449,12 @@ class GPT2Model(GPT2PreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.n_layer @@ -454,10 +477,9 @@ class GPT2Model(GPT2PreTrainedModel): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),) - outputs = block(hidden_states, - layer_past=layer_past, - attention_mask=attention_mask, - head_mask=head_mask[i]) + outputs = block( + hidden_states, layer_past=layer_past, attention_mask=attention_mask, head_mask=head_mask[i] + ) hidden_states, present = outputs[:2] if self.output_past: @@ -486,8 +508,12 @@ class GPT2Model(GPT2PreTrainedModel): return outputs # last hidden state, (presents), (all hidden_states), (attentions) -@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +@add_start_docstrings( + """The GPT2 Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class GPT2LMHeadModel(GPT2PreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -528,6 +554,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(GPT2LMHeadModel, self).__init__(config) self.transformer = GPT2Model(config) @@ -538,15 +565,26 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - labels=None): - transformer_outputs = self.transformer(input_ids, - past=past, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) @@ -558,18 +596,21 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) -@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification +@add_start_docstrings( + """The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). -""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +""", + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class GPT2DoubleHeadsModel(GPT2PreTrainedModel): r""" **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: @@ -632,6 +673,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): lm_prediction_scores, mc_prediction_scores = outputs[:2] """ + def __init__(self, config): super(GPT2DoubleHeadsModel, self).__init__(config) config.num_labels = 1 @@ -644,15 +686,28 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - mc_token_ids=None, lm_labels=None, mc_labels=None): - transformer_outputs = self.transformer(input_ids, - past=past, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + mc_token_ids=None, + lm_labels=None, + mc_labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] @@ -662,15 +717,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): outputs = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_labels is not None: loss_fct = CrossEntropyLoss() - loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), - mc_labels.view(-1)) + loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss,) + outputs if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions) diff --git a/transformers/modeling_mmbt.py b/transformers/modeling_mmbt.py index 79a717ba2a..1c173ac692 100644 --- a/transformers/modeling_mmbt.py +++ b/transformers/modeling_mmbt.py @@ -15,8 +15,7 @@ # limitations under the License. """PyTorch MMBT model. """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -32,6 +31,7 @@ logger = logging.getLogger(__name__) class ModalEmbeddings(nn.Module): """Generic Modal Embeddings which takes in an encoder, and a transformer embedding. """ + def __init__(self, config, encoder, embeddings): super(ModalEmbeddings, self).__init__() self.config = config @@ -62,7 +62,9 @@ class ModalEmbeddings(nn.Module): position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length) if token_type_ids is None: - token_type_ids = torch.zeros((input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device) + token_type_ids = torch.zeros( + (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device + ) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) @@ -140,8 +142,12 @@ MMBT_INPUTS_DOCSTRING = r""" Inputs: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. """ -@add_start_docstrings("The bare MMBT Model outputting raw hidden-states without any specific head on top.", - MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare MMBT Model outputting raw hidden-states without any specific head on top.", + MMBT_START_DOCSTRING, + MMBT_INPUTS_DOCSTRING, +) class MMBTModel(nn.Module): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -167,19 +173,29 @@ class MMBTModel(nn.Module): encoder = ImageEncoder(args) mmbt = MMBTModel(config, transformer, encoder) """ + def __init__(self, config, transformer, encoder): super(MMBTModel, self).__init__() self.config = config self.transformer = transformer self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings) - def forward(self, input_modal, input_ids=None, modal_start_tokens=None, - modal_end_tokens=None, attention_mask=None, - token_type_ids=None, modal_token_type_ids=None, - position_ids=None, modal_position_ids=None, head_mask=None, - inputs_embeds=None, encoder_hidden_states=None, - encoder_attention_mask=None): - + def forward( + self, + input_modal, + input_ids=None, + modal_start_tokens=None, + modal_end_tokens=None, + attention_mask=None, + token_type_ids=None, + modal_token_type_ids=None, + position_ids=None, + modal_position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -192,21 +208,22 @@ class MMBTModel(nn.Module): device = input_ids.device if input_ids is not None else inputs_embeds.device - modal_embeddings = self.modal_encoder(input_modal, - start_token=modal_start_tokens, - end_token=modal_end_tokens, - position_ids=modal_position_ids, - token_type_ids=modal_token_type_ids) + modal_embeddings = self.modal_encoder( + input_modal, + start_token=modal_start_tokens, + end_token=modal_end_tokens, + position_ids=modal_position_ids, + token_type_ids=modal_token_type_ids, + ) input_modal_shape = modal_embeddings.size()[:-1] if token_type_ids is None: token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device) - txt_embeddings = self.transformer.embeddings(input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds) + txt_embeddings = self.transformer.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1) @@ -215,12 +232,16 @@ class MMBTModel(nn.Module): if attention_mask is None: attention_mask = torch.ones(input_shape, device=device) else: - attention_mask = torch.cat([torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1) + attention_mask = torch.cat( + [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1 + ) if encoder_attention_mask is None: encoder_attention_mask = torch.ones(input_shape, device=device) else: - encoder_attention_mask = torch.cat([torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1) + encoder_attention_mask = torch.cat( + [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1 + ) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. @@ -254,7 +275,9 @@ class MMBTModel(nn.Module): if encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + encoder_extended_attention_mask = encoder_extended_attention_mask.to( + dtype=next(self.parameters()).dtype + ) # fp16 compatibility encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 # Prepare head mask if needed @@ -267,25 +290,31 @@ class MMBTModel(nn.Module): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers - - encoder_outputs = self.transformer.encoder(embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask) + encoder_outputs = self.transformer.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + ) sequence_output = encoder_outputs[0] pooled_output = self.transformer.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + outputs = (sequence_output, pooled_output,) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) - def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -293,8 +322,12 @@ class MMBTModel(nn.Module): self.embeddings.word_embeddings = value -@add_start_docstrings("""MMBT Model with a sequence classification/regression head on top (a linear layer on top of - the pooled output)""", MMBT_START_DOCSTRING, MMBT_INPUTS_DOCSTRING) +@add_start_docstrings( + """MMBT Model with a sequence classification/regression head on top (a linear layer on top of + the pooled output)""", + MMBT_START_DOCSTRING, + MMBT_INPUTS_DOCSTRING, +) class MMBTForClassification(nn.Module): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -333,20 +366,35 @@ class MMBTForClassification(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - def forward(self, input_modal, input_ids=None, modal_start_tokens=None, modal_end_tokens=None, - attention_mask=None, token_type_ids=None, modal_token_type_ids=None, position_ids=None, - modal_position_ids=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_modal, + input_ids=None, + modal_start_tokens=None, + modal_end_tokens=None, + attention_mask=None, + token_type_ids=None, + modal_token_type_ids=None, + position_ids=None, + modal_position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): - outputs = self.mmbt(input_modal=input_modal, input_ids=input_ids, - modal_start_tokens=modal_start_tokens, - modal_end_tokens=modal_end_tokens, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - modal_token_type_ids=modal_token_type_ids, - position_ids=position_ids, - modal_position_ids=modal_position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.mmbt( + input_modal=input_modal, + input_ids=input_ids, + modal_start_tokens=modal_start_tokens, + modal_end_tokens=modal_end_tokens, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + modal_token_type_ids=modal_token_type_ids, + position_ids=position_ids, + modal_position_ids=modal_position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) pooled_output = outputs[1] @@ -365,4 +413,4 @@ class MMBTForClassification(nn.Module): loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs - return outputs # (loss), logits, (hidden_states), (attentions) \ No newline at end of file + return outputs # (loss), logits, (hidden_states), (attentions) diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py index 2f08b4093d..ed746ecac8 100644 --- a/transformers/modeling_openai.py +++ b/transformers/modeling_openai.py @@ -36,7 +36,9 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"} +OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin" +} def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): @@ -45,17 +47,17 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): import re import numpy as np - if '.ckpt' in openai_checkpoint_folder_path: + if ".ckpt" in openai_checkpoint_folder_path: openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path) logger.info("Loading weights from {}".format(openai_checkpoint_folder_path)) - with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle: + with open(openai_checkpoint_folder_path + "/parameters_names.json", "r", encoding="utf-8") as names_handle: names = json.load(names_handle) - with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle: + with open(openai_checkpoint_folder_path + "/params_shapes.json", "r", encoding="utf-8") as shapes_handle: shapes = json.load(shapes_handle) offsets = np.cumsum([np.prod(shape) for shape in shapes]) - init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)] + init_params = [np.load(openai_checkpoint_folder_path + "/params_{}.npy".format(n)) for n in range(10)] init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1] init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)] @@ -79,23 +81,23 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): init_params.pop(0) init_params.pop(0) - for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]): + for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]): name = name[6:] # skip "model/" assert name[-2:] == ":0" name = name[:-2] - name = name.split('/') + name = name.split("/") pointer = model for m_name in name: - if re.fullmatch(r'[A-Za-z]+\d+', m_name): - l = re.split(r'(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+\d+", m_name): + l = re.split(r"(\d+)", m_name) else: l = [m_name] - if l[0] == 'g': - pointer = getattr(pointer, 'weight') - elif l[0] == 'b': - pointer = getattr(pointer, 'bias') - elif l[0] == 'w': - pointer = getattr(pointer, 'weight') + if l[0] == "g": + pointer = getattr(pointer, "weight") + elif l[0] == "b": + pointer = getattr(pointer, "bias") + elif l[0] == "w": + pointer = getattr(pointer, "weight") else: pointer = getattr(pointer, l[0]) if len(l) >= 2: @@ -156,7 +158,7 @@ class Attention(nn.Module): mask[head] = 0 mask = mask.view(-1).contiguous().eq(1) index = torch.arange(len(mask))[mask].long() - index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)]) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) # Prune conv1d layers self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) @@ -172,7 +174,7 @@ class Attention(nn.Module): # w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights # XD: self.b may be larger than w, so we need to crop it b = self.bias[:, :, : w.size(-2), : w.size(-1)] - w = w * b + - 1e4 * (1 - b) + w = w * b + -1e4 * (1 - b) if attention_mask is not None: # Apply the attention mask @@ -261,6 +263,7 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = OpenAIGPTConfig pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_openai_gpt @@ -330,8 +333,12 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.", - OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.", + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class OpenAIGPTModel(OpenAIGPTPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -354,6 +361,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(OpenAIGPTModel, self).__init__(config) self.output_attentions = config.output_attentions @@ -379,7 +387,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): for layer, heads in heads_to_prune.items(): self.h[layer].attn.prune_heads(heads) - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -410,7 +426,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility attention_mask = (1.0 - attention_mask) * -10000.0 # Prepare head mask if needed @@ -422,8 +438,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.n_layer @@ -463,8 +483,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): return outputs # last hidden state, (all hidden states), (all attentions) -@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +@add_start_docstrings( + """OpenAI GPT Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -496,6 +520,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(OpenAIGPTLMHeadModel, self).__init__(config) self.transformer = OpenAIGPTModel(config) @@ -506,14 +531,24 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) @@ -524,18 +559,21 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, (all hidden states), (all attentions) -@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification +@add_start_docstrings( + """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). -""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +""", + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): r""" **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: @@ -587,6 +625,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): lm_prediction_scores, mc_prediction_scores = outputs[:2] """ + def __init__(self, config): super(OpenAIGPTDoubleHeadsModel, self).__init__(config) @@ -600,14 +639,26 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - mc_token_ids=None, lm_labels=None, mc_labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + mc_token_ids=None, + lm_labels=None, + mc_labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) @@ -616,15 +667,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): outputs = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_labels is not None: loss_fct = CrossEntropyLoss() - loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), - mc_labels.view(-1)) + loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss,) + outputs if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions) diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index 4faab46f7a..730058ea92 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -15,8 +15,7 @@ # limitations under the License. """PyTorch RoBERTa model. """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -31,24 +30,27 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin", - 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin", - 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin", - 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin", + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-openai-detector-pytorch_model.bin", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-openai-detector-pytorch_model.bin", } + class RobertaEmbeddings(BertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ + def __init__(self, config): super(RobertaEmbeddings, self).__init__(config) self.padding_idx = 1 self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, - padding_idx=self.padding_idx) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): if position_ids is None: @@ -58,10 +60,9 @@ class RobertaEmbeddings(BertEmbeddings): else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) - return super(RobertaEmbeddings, self).forward(input_ids, - token_type_ids=token_type_ids, - position_ids=position_ids, - inputs_embeds=inputs_embeds) + return super(RobertaEmbeddings, self).forward( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds + ) def create_position_ids_from_input_ids(self, x): """ Replace non-padding symbols with their position numbers. Position numbers begin at @@ -85,8 +86,9 @@ class RobertaEmbeddings(BertEmbeddings): input_shape = inputs_embeds.size()[:-1] sequence_length = input_shape[1] - position_ids = torch.arange(self.padding_idx+1, sequence_length+self.padding_idx+1, dtype=torch.long, - device=inputs_embeds.device) + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) return position_ids.unsqueeze(0).expand(input_shape) @@ -162,8 +164,12 @@ ROBERTA_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class RobertaModel(BertModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -209,8 +215,10 @@ class RobertaModel(BertModel): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value -@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + +@add_start_docstrings( + """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING +) class RobertaForMaskedLM(BertPreTrainedModel): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -256,14 +264,24 @@ class RobertaForMaskedLM(BertPreTrainedModel): def get_output_embeddings(self): return self.lm_head.decoder - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None): - outputs = self.roberta(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + ): + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) @@ -299,9 +317,12 @@ class RobertaLMHead(nn.Module): return x -@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer +@add_start_docstrings( + """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class RobertaForSequenceClassification(BertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -343,15 +364,25 @@ class RobertaForSequenceClassification(BertPreTrainedModel): self.roberta = RobertaModel(config) self.classifier = RobertaClassificationHead(config) - - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, - labels=None): - outputs = self.roberta(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] logits = self.classifier(sequence_output) @@ -369,9 +400,12 @@ class RobertaForSequenceClassification(BertPreTrainedModel): return outputs # (loss), logits, (hidden_states), (attentions) -@add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class RobertaForMultipleChoice(BertPreTrainedModel): r""" Inputs: @@ -455,16 +489,29 @@ class RobertaForMultipleChoice(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None, - position_ids=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + token_type_ids=None, + attention_mask=None, + labels=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): num_choices = input_ids.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - outputs = self.roberta(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, - attention_mask=flat_attention_mask, head_mask=head_mask) + outputs = self.roberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + head_mask=head_mask, + ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) @@ -481,9 +528,12 @@ class RobertaForMultipleChoice(BertPreTrainedModel): return outputs # (loss), reshaped_logits, (hidden_states), (attentions) -@add_start_docstrings("""Roberta Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class RobertaForTokenClassification(BertPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -527,15 +577,25 @@ class RobertaForTokenClassification(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, - position_ids=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): - outputs = self.roberta(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] @@ -577,9 +637,12 @@ class RobertaClassificationHead(nn.Module): return x -@add_start_docstrings("""Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class RobertaForQuestionAnswering(BertPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -626,14 +689,24 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): self.init_weights() - def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, - start_positions=None, end_positions=None): + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + start_positions=None, + end_positions=None, + ): - outputs = self.roberta(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask) + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) sequence_output = outputs[0] @@ -660,4 +733,4 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) \ No newline at end of file + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 9baf69d02b..2ee8cd011b 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -41,11 +41,11 @@ logger = logging.getLogger(__name__) # for the pretrained weights provided with the models #################################################### T5_PRETRAINED_MODEL_ARCHIVE_MAP = { - 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin", - 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin", - 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin", - 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin", - 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin", + "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin", + "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin", + "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin", + "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin", + "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin", } #################################################### @@ -60,8 +60,10 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) raise tf_path = os.path.abspath(tf_checkpoint_path) logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -76,26 +78,26 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): tf_weights[name] = array for txt_name in names: - name = txt_name.split('/') + name = txt_name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any(n in ["adam_v", "adam_m", "global_step"] for n in name): logger.info("Skipping {}".format("/".join(name))) tf_weights.pop(txt_name, None) continue - if '_slot_' in name[-1]: + if "_slot_" in name[-1]: logger.info("Skipping {}".format("/".join(name))) tf_weights.pop(txt_name, None) continue pointer = model array = tf_weights[txt_name] for m_name in name: - if re.fullmatch(r'[A-Za-z]+_\d+', m_name): - l = re.split(r'_(\d+)', m_name) + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + l = re.split(r"_(\d+)", m_name) else: l = [m_name] - if l[0] in ['kernel', 'scale', 'embedding']: - pointer = getattr(pointer, 'weight') + if l[0] in ["kernel", "scale", "embedding"]: + pointer = getattr(pointer, "weight") # elif l[0] == 'scale': # pointer = getattr(pointer, 'weight') # elif l[0] == 'output_bias' or l[0] == 'beta': @@ -111,9 +113,9 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): if len(l) >= 2: num = int(l[1]) pointer = pointer[num] - if l[0] not in ['kernel', 'scale', 'embedding']: - pointer = getattr(pointer, 'weight') - if l[0] != 'embedding': + if l[0] not in ["kernel", "scale", "embedding"]: + pointer = getattr(pointer, "weight") + if l[0] != "embedding": logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name)) array = np.transpose(array) try: @@ -125,7 +127,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): pointer.data = torch.from_numpy(array.astype(np.float32)) tf_weights.pop(txt_name, None) - logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) # logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) return model @@ -136,6 +138,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): # - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module) #################################################### + class T5LayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ Construct a layernorm module in the T5 style @@ -228,10 +231,7 @@ class T5Attention(nn.Module): self.pruned_heads = self.pruned_heads.union(heads) @staticmethod - def _relative_position_bucket(relative_position, - bidirectional=True, - num_buckets=32, - max_distance=128): + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 @@ -267,12 +267,12 @@ class T5Attention(nn.Module): # half of the buckets are for exact increments in positions max_exact = num_buckets // 2 - is_small = (n < max_exact) + is_small = n < max_exact # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance val_if_large = max_exact + ( - torch.log(n.float() / max_exact) - / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long) + torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + ).to(torch.long) val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) ret += torch.where(is_small, n, val_if_large) @@ -283,11 +283,13 @@ class T5Attention(nn.Module): context_position = torch.arange(qlen, dtype=torch.long)[:, None] memory_position = torch.arange(klen, dtype=torch.long)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) - rp_bucket = self._relative_position_bucket(relative_position, # shape (qlen, klen) - bidirectional=not self.is_decoder, - num_buckets=self.relative_attention_num_buckets) + rp_bucket = self._relative_position_bucket( + relative_position, # shape (qlen, klen) + bidirectional=not self.is_decoder, + num_buckets=self.relative_attention_num_buckets, + ) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) - values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) return values def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None): @@ -298,7 +300,7 @@ class T5Attention(nn.Module): # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = input.size() if kv is None: - klen = qlen if cache is None else cache['slen'] + qlen + klen = qlen if cache is None else cache["slen"] + qlen else: klen = kv.size(1) @@ -310,45 +312,45 @@ class T5Attention(nn.Module): """ compute context """ return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim) - q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) + q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: - k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv - k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] - k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) - v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) + k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) + v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) # q = q / math.sqrt(dim_per_head) # No scaling in T5 - scores = torch.einsum('bnqd,bnkd->bnqk', q, k) # (bs, n_heads, qlen, klen) + scores = torch.einsum("bnqd,bnkd->bnqk", q, k) # (bs, n_heads, qlen, klen) if position_bias is None: if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(qlen, klen) if mask is not None: - position_bias = position_bias + mask # (bs, n_heads, qlen, klen) + position_bias = position_bias + mask # (bs, n_heads, qlen, klen) scores += position_bias - weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) + weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, qlen, dim) + context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) context = self.o(context) @@ -369,10 +371,9 @@ class T5LayerSelfAttention(nn.Module): def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None): norm_x = self.layer_norm(hidden_states) - attention_output = self.SelfAttention(norm_x, - mask=attention_mask, - position_bias=position_bias, - head_mask=head_mask) + attention_output = self.SelfAttention( + norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask + ) y = attention_output[0] layer_output = hidden_states + self.dropout(y) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -388,11 +389,9 @@ class T5LayerCrossAttention(nn.Module): def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None): norm_x = self.layer_norm(hidden_states) - attention_output = self.EncDecAttention(norm_x, - mask=attention_mask, - kv=kv, - position_bias=position_bias, - head_mask=head_mask) + attention_output = self.EncDecAttention( + norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask + ) y = attention_output[0] layer_output = hidden_states + self.dropout(y) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -411,26 +410,36 @@ class T5Block(nn.Module): else: self.layer.append(T5LayerFF(config)) - def forward(self, hidden_states, attention_mask=None, position_bias=None, - encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, - head_mask=None): - self_attention_outputs = self.layer[0](hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - head_mask=head_mask) + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + head_mask=None, + ): + self_attention_outputs = self.layer[0]( + hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask + ) hidden_states = self_attention_outputs[0] outputs = self_attention_outputs[1:] # Keep self-attention outputs and relative position weights if not self.is_decoder: hidden_states = self.layer[1](hidden_states) else: - cross_attention_outputs = self.layer[1](hidden_states, - kv=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - head_mask=head_mask) + cross_attention_outputs = self.layer[1]( + hidden_states, + kv=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + head_mask=head_mask, + ) hidden_states = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:] # Keep cross-attention outputs and relative position weights + outputs = ( + outputs + cross_attention_outputs[1:] + ) # Keep cross-attention outputs and relative position weights hidden_states = self.layer[2](hidden_states) outputs = (hidden_states,) + outputs # add attentions if we output them @@ -441,6 +450,7 @@ class T5PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = T5Config pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_t5 @@ -450,29 +460,31 @@ class T5PreTrainedModel(PreTrainedModel): def dummy_inputs(self): input_ids = torch.tensor(DUMMY_INPUTS) input_mask = torch.tensor(DUMMY_MASK) - dummy_inputs = {'decoder_input_ids': input_ids, - 'encoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + dummy_inputs = { + "decoder_input_ids": input_ids, + "encoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } return dummy_inputs def _init_weights(self, module): """ Initialize the weights """ factor = self.config.initializer_factor # Used for testing weights initialization if isinstance(module, T5LayerNorm): - module.weight.data.fill_(factor*1.0) + module.weight.data.fill_(factor * 1.0) elif isinstance(module, (T5Model, T5WithLMHeadModel)): # Mesh TensorFlow embeddings initialization # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 - module.shared.weight.data.normal_(mean=0.0, std=factor*1.0) + module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0) elif isinstance(module, T5DenseReluDense): # Mesh TensorFlow FF initialization # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 - module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5)) - if hasattr(module.wi, 'bias') and module.wi.bias is not None: + module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5)) + if hasattr(module.wi, "bias") and module.wi.bias is not None: module.wi.bias.data.zero_() - module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5)) - if hasattr(module.wo, 'bias') and module.wo.bias is not None: + module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, "bias") and module.wo.bias is not None: module.wo.bias.data.zero_() elif isinstance(module, T5Attention): # Mesh TensorFlow attention initialization to avoid scaling before softmax @@ -480,12 +492,12 @@ class T5PreTrainedModel(PreTrainedModel): d_model = self.config.d_model d_kv = self.config.d_kv n_heads = self.config.num_heads - module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5)) - module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5)) - module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5)) - module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5)) + module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * d_kv) ** -0.5)) + module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) + module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * d_kv) ** -0.5)) if module.has_relative_attention_bias: - module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5)) + module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) class T5Stack(T5PreTrainedModel): @@ -495,19 +507,22 @@ class T5Stack(T5PreTrainedModel): self.output_hidden_states = config.output_hidden_states self.is_decoder = config.is_decoder - self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0)) - for i in range(config.num_layers)]) + self.block = nn.ModuleList( + [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) self.init_weights() - def forward(self, - hidden_states, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - head_mask=None): + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + ): batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1] if attention_mask is None: @@ -521,9 +536,9 @@ class T5Stack(T5PreTrainedModel): if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder: seq_ids = torch.arange(seq_length, device=hidden_states.device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] @@ -557,7 +572,9 @@ class T5Stack(T5PreTrainedModel): # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 # encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2)) - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + encoder_extended_attention_mask = encoder_extended_attention_mask.to( + dtype=next(self.parameters()).dtype + ) # fp16 compatibility encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: encoder_extended_attention_mask = None @@ -572,8 +589,12 @@ class T5Stack(T5PreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.config.num_layers @@ -587,13 +608,15 @@ class T5Stack(T5PreTrainedModel): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - layer_outputs = layer_module(hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - head_mask=head_mask[i]) + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + head_mask=head_mask[i], + ) # layer_outputs is a tuple with: # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) hidden_states = layer_outputs[0] @@ -672,9 +695,12 @@ T5_INPUTS_DOCSTRING = r""" ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ -@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states" - "without any specific head on top.", - T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", + T5_START_DOCSTRING, + T5_INPUTS_DOCSTRING, +) class T5Model(T5PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -697,6 +723,7 @@ class T5Model(T5PreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(T5Model, self).__init__(config) self.shared = nn.Embedding(config.vocab_size, config.d_model) @@ -729,12 +756,13 @@ class T5Model(T5PreTrainedModel): # `encoder_`), decoder-specific (prefixed by `decoder_`) and those # that apply to the model as whole. # We let the specific kwargs override the common ones in case of conflict. - kwargs_common = dict((k, v) for k, v in kwargs.items() - if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_common = dict( + (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_") + ) kwargs_encoder = kwargs_common.copy() kwargs_decoder = kwargs_common.copy() - kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) - kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_"))) # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) @@ -770,8 +798,7 @@ class T5Model(T5PreTrainedModel): return decoder_outputs + encoder_outputs -@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, - T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) +@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) class T5WithLMHeadModel(T5PreTrainedModel): r""" **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -802,6 +829,7 @@ class T5WithLMHeadModel(T5PreTrainedModel): loss, prediction_scores = outputs[:2] """ + def __init__(self, config): super(T5WithLMHeadModel, self).__init__(config) self.model_dim = config.d_model @@ -834,14 +862,15 @@ class T5WithLMHeadModel(T5PreTrainedModel): # that apply to the model as whole. # We let the specific kwargs override the common ones in case of conflict. - lm_labels = kwargs.pop('decoder_lm_labels', None) + lm_labels = kwargs.pop("decoder_lm_labels", None) - kwargs_common = dict((k, v) for k, v in kwargs.items() - if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_common = dict( + (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_") + ) kwargs_encoder = kwargs_common.copy() kwargs_decoder = kwargs_common.copy() - kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) - kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_"))) # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) @@ -879,8 +908,9 @@ class T5WithLMHeadModel(T5PreTrainedModel): shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), - shift_labels.view(-1)) - decoder_outputs = (loss,) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + decoder_outputs = ( + loss, + ) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 return decoder_outputs + encoder_outputs diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py index ac55a73fa3..25d0863987 100644 --- a/transformers/modeling_tf_albert.py +++ b/transformers/modeling_tf_albert.py @@ -31,14 +31,14 @@ import logging logger = logging.getLogger(__name__) TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5", - 'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5", - 'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5", - 'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5", - 'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5", - 'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5", - 'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5", - 'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5", + "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tf_model.h5", + "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tf_model.h5", + "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tf_model.h5", + "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tf_model.h5", + "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tf_model.h5", + "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tf_model.h5", + "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tf_model.h5", + "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tf_model.h5", } @@ -50,21 +50,22 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): super(TFAlbertEmbeddings, self).__init__(**kwargs) self.config = config - self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, - config.embedding_size, - embeddings_initializer=get_initializer( - self.config.initializer_range), - name='position_embeddings') - self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size, - config.embedding_size, - embeddings_initializer=get_initializer( - self.config.initializer_range), - name='token_type_embeddings') + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.embedding_size, + embeddings_initializer=get_initializer(self.config.initializer_range), + name="position_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.embedding_size, + embeddings_initializer=get_initializer(self.config.initializer_range), + name="token_type_embeddings", + ) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name='LayerNorm') + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape): @@ -75,7 +76,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): self.word_embeddings = self.add_weight( "weight", shape=[self.config.vocab_size, self.config.embedding_size], - initializer=get_initializer(self.config.initializer_range)) + initializer=get_initializer(self.config.initializer_range), + ) super(TFAlbertEmbeddings, self).build(input_shape) def call(self, inputs, mode="embedding", training=False): @@ -145,34 +147,29 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer): if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads assert config.hidden_size % config.num_attention_heads == 0 - self.attention_head_size = int( - config.hidden_size / config.num_attention_heads) + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='query') - self.key = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='key') - self.value = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='value') + self.query = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) - self.dropout = tf.keras.layers.Dropout( - config.attention_probs_dropout_prob) + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x, batch_size): - x = tf.reshape( - x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) return tf.transpose(x, perm=[0, 2, 1, 3]) def call(self, inputs, training=False): @@ -212,23 +209,21 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer): context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape(context_layer, - (batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) - outputs = (context_layer, attention_probs) if self.output_attentions else ( - context_layer,) + outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs class TFAlbertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFAlbertSelfOutput, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='dense') - self.LayerNorm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name='LayerNorm') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): @@ -245,12 +240,10 @@ class TFAlbertAttention(TFBertSelfAttention): super(TFAlbertAttention, self).__init__(config, **kwargs) self.hidden_size = config.hidden_size - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='dense') - self.LayerNorm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name='LayerNorm') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.pruned_heads = set() def prune_heads(self, heads): @@ -293,11 +286,11 @@ class TFAlbertAttention(TFBertSelfAttention): context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape(context_layer, - (batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) - self_outputs = (context_layer, attention_probs) if self.output_attentions else ( - context_layer,) + self_outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) hidden_states = self_outputs[0] @@ -313,34 +306,35 @@ class TFAlbertAttention(TFBertSelfAttention): class TFAlbertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFAlbertLayer, self).__init__(**kwargs) - self.attention = TFAlbertAttention(config, name='attention') + self.attention = TFAlbertAttention(config, name="attention") - self.ffn = tf.keras.layers.Dense(config.intermediate_size, kernel_initializer=get_initializer( - config.initializer_range), name='ffn') + self.ffn = tf.keras.layers.Dense( + config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn" + ) if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act - self.ffn_output = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer( - config.initializer_range), name='ffn_output') + self.ffn_output = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output" + ) self.full_layer_layer_norm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name='full_layer_layer_norm') + epsilon=config.layer_norm_eps, name="full_layer_layer_norm" + ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs - attention_outputs = self.attention( - [hidden_states, attention_mask, head_mask], training=training) + attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) ffn_output = self.ffn(attention_outputs[0]) ffn_output = self.activation(ffn_output) ffn_output = self.ffn_output(ffn_output) hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.full_layer_layer_norm( - ffn_output + attention_outputs[0]) + hidden_states = self.full_layer_layer_norm(ffn_output + attention_outputs[0]) # add attentions if we output them outputs = (hidden_states,) + attention_outputs[1:] @@ -353,8 +347,9 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer): self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.albert_layers = [TFAlbertLayer(config, name="albert_layers_._{}".format( - i)) for i in range(config.inner_group_num)] + self.albert_layers = [ + TFAlbertLayer(config, name="albert_layers_._{}".format(i)) for i in range(config.inner_group_num) + ] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs @@ -363,8 +358,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer): layer_attentions = () for layer_index, albert_layer in enumerate(self.albert_layers): - layer_output = albert_layer( - [hidden_states, attention_mask, head_mask[layer_index]], training=training) + layer_output = albert_layer([hidden_states, attention_mask, head_mask[layer_index]], training=training) hidden_states = layer_output[0] if self.output_attentions: @@ -389,10 +383,15 @@ class TFAlbertTransformer(tf.keras.layers.Layer): self.config = config self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.embedding_hidden_mapping_in = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer( - config.initializer_range), name='embedding_hidden_mapping_in') - self.albert_layer_groups = [TFAlbertLayerGroup( - config, name="albert_layer_groups_._{}".format(i)) for i in range(config.num_hidden_groups)] + self.embedding_hidden_mapping_in = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="embedding_hidden_mapping_in", + ) + self.albert_layer_groups = [ + TFAlbertLayerGroup(config, name="albert_layer_groups_._{}".format(i)) + for i in range(config.num_hidden_groups) + ] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs @@ -405,15 +404,19 @@ class TFAlbertTransformer(tf.keras.layers.Layer): for i in range(self.config.num_hidden_layers): # Number of layers in a hidden group - layers_per_group = int( - self.config.num_hidden_layers / self.config.num_hidden_groups) + layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups) # Index of the hidden group - group_idx = int( - i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) + group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) layer_group_output = self.albert_layer_groups[group_idx]( - [hidden_states, attention_mask, head_mask[group_idx*layers_per_group:(group_idx+1)*layers_per_group]], training=training) + [ + hidden_states, + attention_mask, + head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], + ], + training=training, + ) hidden_states = layer_group_output[0] if self.output_attentions: @@ -436,6 +439,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = AlbertConfig pretrained_model_archive_map = TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "albert" @@ -446,31 +450,25 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): super(TFAlbertMLMHead, self).__init__(**kwargs) self.vocab_size = config.vocab_size - self.dense = tf.keras.layers.Dense(config.embedding_size, - kernel_initializer=get_initializer( - config.initializer_range), - name='dense') + self.dense = tf.keras.layers.Dense( + config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): self.activation = ACT2FN[config.hidden_act] else: self.activation = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization( - epsilon=config.layer_norm_eps, name='LayerNorm') + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.decoder = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') - self.decoder_bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='decoder/bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + self.decoder_bias = self.add_weight( + shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" + ) super(TFAlbertMLMHead, self).build(input_shape) def call(self, hidden_states): @@ -560,8 +558,12 @@ ALBERT_INPUTS_DOCSTRING = r""" ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ -@add_start_docstrings("The bare Albert Model transformer outputing raw hidden-states without any specific head on top.", - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Albert Model transformer outputing raw hidden-states without any specific head on top.", + ALBERT_START_DOCSTRING, + ALBERT_INPUTS_DOCSTRING, +) class TFAlbertModel(TFAlbertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -601,8 +603,12 @@ class TFAlbertModel(TFAlbertPreTrainedModel): self.embeddings = TFAlbertEmbeddings(config, name="embeddings") self.encoder = TFAlbertTransformer(config, name="encoder") - self.pooler = tf.keras.layers.Dense(config.hidden_size, kernel_initializer=get_initializer( - config.initializer_range), activation='tanh', name='pooler') + self.pooler = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="pooler", + ) def get_input_embeddings(self): return self.embeddings @@ -617,7 +623,16 @@ class TFAlbertModel(TFAlbertPreTrainedModel): """ raise NotImplementedError - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -627,12 +642,12 @@ class TFAlbertModel(TFAlbertPreTrainedModel): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs @@ -678,10 +693,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel): head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) - embedding_output = self.embeddings( - [input_ids, position_ids, token_type_ids, inputs_embeds], training=training) - encoder_outputs = self.encoder( - [embedding_output, extended_attention_mask, head_mask], training=training) + embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) + encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output[:, 0]) @@ -692,8 +705,9 @@ class TFAlbertModel(TFAlbertPreTrainedModel): return outputs -@add_start_docstrings("""Albert Model with a `language modeling` head on top. """, - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING +) class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -723,9 +737,8 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): def __init__(self, config, *inputs, **kwargs): super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs) - self.albert = TFAlbertModel(config, name='albert') - self.predictions = TFAlbertMLMHead( - config, self.albert.embeddings, name='predictions') + self.albert = TFAlbertModel(config, name="albert") + self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") def get_output_embeddings(self): return self.albert.embeddings @@ -734,8 +747,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): outputs = self.albert(inputs, **kwargs) sequence_output = outputs[0] - prediction_scores = self.predictions( - sequence_output, training=kwargs.get('training', False)) + prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False)) # Add hidden states and attention if they are here outputs = (prediction_scores,) + outputs[2:] @@ -743,9 +755,12 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): return outputs # prediction_scores, (hidden_states), (attentions) -@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING) + ALBERT_START_DOCSTRING, + ALBERT_INPUTS_DOCSTRING, +) class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -771,24 +786,25 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.albert = TFAlbertModel(config, name='albert') + self.albert = TFAlbertModel(config, name="albert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.albert(inputs, **kwargs) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) + pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - return outputs # logits, (hidden_states), (attentions) \ No newline at end of file + return outputs # logits, (hidden_states), (attentions) diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index 031ffea17e..24a7338d41 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -18,24 +18,70 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -from .configuration_auto import (BertConfig, CTRLConfig, DistilBertConfig, - GPT2Config, OpenAIGPTConfig, RobertaConfig, - TransfoXLConfig, XLMConfig, XLNetConfig) +from .configuration_auto import ( + BertConfig, + CTRLConfig, + DistilBertConfig, + GPT2Config, + OpenAIGPTConfig, + RobertaConfig, + TransfoXLConfig, + XLMConfig, + XLNetConfig, +) -from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, \ - TFBertForQuestionAnswering, TFBertForTokenClassification, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_bert import ( + TFBertModel, + TFBertForMaskedLM, + TFBertForSequenceClassification, + TFBertForQuestionAnswering, + TFBertForTokenClassification, + TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \ - TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, \ - TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, \ - TFRobertaForTokenClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_transfo_xl import ( + TFTransfoXLModel, + TFTransfoXLLMHeadModel, + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_tf_xlnet import ( + TFXLNetModel, + TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForQuestionAnsweringSimple, + TFXLNetForTokenClassification, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_tf_xlm import ( + TFXLMModel, + TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_tf_roberta import ( + TFRobertaModel, + TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, +) +from .modeling_tf_distilbert import ( + TFDistilBertModel, + TFDistilBertForQuestionAnswering, + TFDistilBertForMaskedLM, + TFDistilBertForSequenceClassification, + TFDistilBertForTokenClassification, + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_albert import ( + TFAlbertModel, + TFAlbertForMaskedLM, + TFAlbertForSequenceClassification, + TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, +) from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP from .file_utils import add_start_docstrings @@ -43,7 +89,8 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) +TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict( + (key, value) for pretrained_map in [ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, @@ -56,8 +103,9 @@ TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP, - ] - for key, value, in pretrained_map.items()) + ] + for key, value, in pretrained_map.items() +) class TFAutoModel(object): @@ -85,10 +133,13 @@ class TFAutoModel(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("TFAutoModel is designed to be instantiated " + raise EnvironmentError( + "TFAutoModel is designed to be instantiated " "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` or " - "`TFAutoModel.from_config(config)` methods.") + "`TFAutoModel.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -209,32 +260,34 @@ class TFAutoModel(object): model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return TFBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return TFOpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return TFGPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TFTransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return TFXLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return TFXLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return TFCTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path) + ) class TFAutoModelWithLMHead(object): @@ -262,10 +315,13 @@ class TFAutoModelWithLMHead(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated " + raise EnvironmentError( + "TFAutoModelWithLMHead is designed to be instantiated " "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or " - "`TFAutoModelWithLMHead.from_config(config)` methods.") + "`TFAutoModelWithLMHead.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -390,32 +446,34 @@ class TFAutoModelWithLMHead(object): model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return TFBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return TFOpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return TFGPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TFTransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return TFXLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return TFXLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return TFCTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path) + ) class TFAutoModelForSequenceClassification(object): @@ -438,10 +496,13 @@ class TFAutoModelForSequenceClassification(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("TFAutoModelForSequenceClassification is designed to be instantiated " + raise EnvironmentError( + "TFAutoModelForSequenceClassification is designed to be instantiated " "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " - "`TFAutoModelForSequenceClassification.from_config(config)` methods.") + "`TFAutoModelForSequenceClassification.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -552,21 +613,33 @@ class TFAutoModelForSequenceClassification(object): model = TFAutoModelForSequenceClassification.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: - return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'albert' in pretrained_model_name_or_path: - return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: - return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: - return TFBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: - return TFXLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + if "distilbert" in pretrained_model_name_or_path: + return TFDistilBertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "albert" in pretrained_model_name_or_path: + return TFAlbertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "roberta" in pretrained_model_name_or_path: + return TFRobertaForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "bert" in pretrained_model_name_or_path: + return TFBertForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "xlnet" in pretrained_model_name_or_path: + return TFXLNetForSequenceClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "xlm" in pretrained_model_name_or_path: return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path) + ) class TFAutoModelForQuestionAnswering(object): @@ -588,10 +661,13 @@ class TFAutoModelForQuestionAnswering(object): This class cannot be instantiated using `__init__()` (throws an error). """ + def __init__(self): - raise EnvironmentError("TFAutoModelForQuestionAnswering is designed to be instantiated " + raise EnvironmentError( + "TFAutoModelForQuestionAnswering is designed to be instantiated " "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or " - "`TFAutoModelForQuestionAnswering.from_config(config)` methods.") + "`TFAutoModelForQuestionAnswering.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -698,24 +774,34 @@ class TFAutoModelForQuestionAnswering(object): model = TFAutoModelForQuestionAnswering.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: - return TFDistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + if "distilbert" in pretrained_model_name_or_path: + return TFDistilBertForQuestionAnswering.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "bert" in pretrained_model_name_or_path: return TFBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: - return TFXLNetForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: - return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif "xlnet" in pretrained_model_name_or_path: + return TFXLNetForQuestionAnsweringSimple.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "xlm" in pretrained_model_name_or_path: + return TFXLMForQuestionAnsweringSimple.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path) + ) class TFAutoModelForTokenClassification: def __init__(self): - raise EnvironmentError("TFAutoModelForTokenClassification is designed to be instantiated " - "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelForTokenClassification.from_config(config)` methods.") + raise EnvironmentError( + "TFAutoModelForTokenClassification is designed to be instantiated " + "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " + "`AutoModelForTokenClassification.from_config(config)` methods." + ) @classmethod def from_config(cls, config): @@ -815,14 +901,20 @@ class TFAutoModelForTokenClassification: model = TFAutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'bert' in pretrained_model_name_or_path: + if "bert" in pretrained_model_name_or_path: return TFBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return TFXLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: - return TFDistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: - return TFRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif "distilbert" in pretrained_model_name_or_path: + return TFDistilBertForTokenClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + elif "roberta" in pretrained_model_name_or_path: + return TFRobertaForTokenClassification.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path) + ) diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index 9caad53a5f..bcb83d5df7 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -35,25 +35,25 @@ logger = logging.getLogger(__name__) TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5", - 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5", - 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5", - 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5", - 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5", + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5", + "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5", + "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5", } @@ -67,6 +67,7 @@ def gelu(x): cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) return x * cdf + def gelu_new(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. @@ -76,41 +77,48 @@ def gelu_new(x): Returns: `x` with the GELU activation applied. """ - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf + def swish(x): return x * tf.sigmoid(x) -ACT2FN = {"gelu": tf.keras.layers.Activation(gelu), - "relu": tf.keras.activations.relu, - "swish": tf.keras.layers.Activation(swish), - "gelu_new": tf.keras.layers.Activation(gelu_new)} +ACT2FN = { + "gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish), + "gelu_new": tf.keras.layers.Activation(gelu_new), +} class TFBertEmbeddings(tf.keras.layers.Layer): """Construct the embeddings from word, position and token_type embeddings. """ + def __init__(self, config, **kwargs): super(TFBertEmbeddings, self).__init__(**kwargs) self.vocab_size = config.vocab_size self.hidden_size = config.hidden_size self.initializer_range = config.initializer_range - self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), - name='position_embeddings') - self.token_type_embeddings = tf.keras.layers.Embedding(config.type_vocab_size, - config.hidden_size, - embeddings_initializer=get_initializer(self.initializer_range), - name='token_type_embeddings') + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="position_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="token_type_embeddings", + ) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def build(self, input_shape): @@ -121,7 +129,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer): self.word_embeddings = self.add_weight( "weight", shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range)) + initializer=get_initializer(self.initializer_range), + ) super(TFBertEmbeddings, self).build(input_shape) def call(self, inputs, mode="embedding", training=False): @@ -193,7 +202,8 @@ class TFBertSelfAttention(tf.keras.layers.Layer): if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads @@ -201,15 +211,15 @@ class TFBertSelfAttention(tf.keras.layers.Layer): self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer(config.initializer_range), - name='query') - self.key = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer(config.initializer_range), - name='key') - self.value = tf.keras.layers.Dense(self.all_head_size, - kernel_initializer=get_initializer(config.initializer_range), - name='value') + self.query = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) @@ -230,8 +240,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer): value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k) - dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores + attention_scores = tf.matmul( + query_layer, key_layer, transpose_b=True + ) # (batch size, num_heads, seq_len_q, seq_len_k) + dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: @@ -252,8 +264,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer): context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape(context_layer, - (batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) return outputs @@ -262,10 +275,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer): class TFBertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertSelfOutput, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - name='dense') - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): @@ -280,8 +293,8 @@ class TFBertSelfOutput(tf.keras.layers.Layer): class TFBertAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertAttention, self).__init__(**kwargs) - self.self_attention = TFBertSelfAttention(config, name='self') - self.dense_output = TFBertSelfOutput(config, name='output') + self.self_attention = TFBertSelfAttention(config, name="self") + self.dense_output = TFBertSelfOutput(config, name="output") def prune_heads(self, heads): raise NotImplementedError @@ -298,9 +311,9 @@ class TFBertAttention(tf.keras.layers.Layer): class TFBertIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertIntermediate, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.intermediate_size, - kernel_initializer=get_initializer(config.initializer_range), - name='dense') + self.dense = tf.keras.layers.Dense( + config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: @@ -315,10 +328,10 @@ class TFBertIntermediate(tf.keras.layers.Layer): class TFBertOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertOutput, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - name='dense') - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) def call(self, inputs, training=False): @@ -333,9 +346,9 @@ class TFBertOutput(tf.keras.layers.Layer): class TFBertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertLayer, self).__init__(**kwargs) - self.attention = TFBertAttention(config, name='attention') - self.intermediate = TFBertIntermediate(config, name='intermediate') - self.bert_output = TFBertOutput(config, name='output') + self.attention = TFBertAttention(config, name="attention") + self.intermediate = TFBertIntermediate(config, name="intermediate") + self.bert_output = TFBertOutput(config, name="output") def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs @@ -353,7 +366,7 @@ class TFBertEncoder(tf.keras.layers.Layer): super(TFBertEncoder, self).__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.layer = [TFBertLayer(config, name='layer_._{}'.format(i)) for i in range(config.num_hidden_layers)] + self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs @@ -385,10 +398,12 @@ class TFBertEncoder(tf.keras.layers.Layer): class TFBertPooler(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertPooler, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - activation='tanh', - name='dense') + self.dense = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) def call(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding @@ -401,14 +416,14 @@ class TFBertPooler(tf.keras.layers.Layer): class TFBertPredictionHeadTransform(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertPredictionHeadTransform, self).__init__(**kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - name='dense') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): self.transform_act_fn = ACT2FN[config.hidden_act] else: self.transform_act_fn = config.hidden_act - self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='LayerNorm') + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") def call(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -421,17 +436,14 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super(TFBertLMPredictionHead, self).__init__(**kwargs) self.vocab_size = config.vocab_size - self.transform = TFBertPredictionHeadTransform(config, name='transform') + self.transform = TFBertPredictionHeadTransform(config, name="transform") # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. self.input_embeddings = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super(TFBertLMPredictionHead, self).build(input_shape) def call(self, hidden_states): @@ -444,7 +456,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): class TFBertMLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): super(TFBertMLMHead, self).__init__(**kwargs) - self.predictions = TFBertLMPredictionHead(config, input_embeddings, name='predictions') + self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") def call(self, sequence_output): prediction_scores = self.predictions(sequence_output) @@ -454,9 +466,9 @@ class TFBertMLMHead(tf.keras.layers.Layer): class TFBertNSPHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFBertNSPHead, self).__init__(**kwargs) - self.seq_relationship = tf.keras.layers.Dense(2, - kernel_initializer=get_initializer(config.initializer_range), - name='seq_relationship') + self.seq_relationship = tf.keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship" + ) def call(self, pooled_output): seq_relationship_score = self.seq_relationship(pooled_output) @@ -468,9 +480,9 @@ class TFBertMainLayer(tf.keras.layers.Layer): super(TFBertMainLayer, self).__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers - self.embeddings = TFBertEmbeddings(config, name='embeddings') - self.encoder = TFBertEncoder(config, name='encoder') - self.pooler = TFBertPooler(config, name='pooler') + self.embeddings = TFBertEmbeddings(config, name="embeddings") + self.encoder = TFBertEncoder(config, name="encoder") + self.pooler = TFBertPooler(config, name="pooler") def get_input_embeddings(self): return self.embeddings @@ -485,7 +497,16 @@ class TFBertMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -495,12 +516,12 @@ class TFBertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs @@ -552,7 +573,9 @@ class TFBertMainLayer(tf.keras.layers.Layer): sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + outputs = (sequence_output, pooled_output,) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) @@ -560,6 +583,7 @@ class TFBertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = BertConfig pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "bert" @@ -648,8 +672,12 @@ BERT_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertModel(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -682,18 +710,22 @@ class TFBertModel(TFBertPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFBertModel, self).__init__(config, *inputs, **kwargs) - self.bert = TFBertMainLayer(config, name='bert') + self.bert = TFBertMainLayer(config, name="bert") def call(self, inputs, **kwargs): outputs = self.bert(inputs, **kwargs) return outputs -@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training: +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next sentence prediction (classification)` head. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForPreTraining(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -721,12 +753,13 @@ class TFBertForPreTraining(TFBertPreTrainedModel): prediction_scores, seq_relationship_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs) - self.bert = TFBertMainLayer(config, name='bert') - self.nsp = TFBertNSPHead(config, name='nsp___cls') - self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls') + self.bert = TFBertMainLayer(config, name="bert") + self.nsp = TFBertNSPHead(config, name="nsp___cls") + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") def get_output_embeddings(self): return self.bert.embeddings @@ -735,16 +768,19 @@ class TFBertForPreTraining(TFBertPreTrainedModel): outputs = self.bert(inputs, **kwargs) sequence_output, pooled_output = outputs[:2] - prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) + prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) seq_relationship_score = self.nsp(pooled_output) - outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + outputs = (prediction_scores, seq_relationship_score,) + outputs[ + 2: + ] # add hidden states and attention if they are here return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING +) class TFBertForMaskedLM(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -770,11 +806,12 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): prediction_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs) - self.bert = TFBertMainLayer(config, name='bert') - self.mlm = TFBertMLMHead(config, self.bert.embeddings, name='mlm___cls') + self.bert = TFBertMainLayer(config, name="bert") + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") def get_output_embeddings(self): return self.bert.embeddings @@ -783,15 +820,18 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): outputs = self.bert(inputs, **kwargs) sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) + prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here return outputs # prediction_scores, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForNextSentencePrediction(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -817,11 +857,12 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): seq_relationship_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs) - self.bert = TFBertMainLayer(config, name='bert') - self.nsp = TFBertNSPHead(config, name='nsp___cls') + self.bert = TFBertMainLayer(config, name="bert") + self.nsp = TFBertNSPHead(config, name="nsp___cls") def call(self, inputs, **kwargs): outputs = self.bert(inputs, **kwargs) @@ -834,9 +875,12 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): return outputs # seq_relationship_score, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForSequenceClassification(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -862,22 +906,23 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.bert = TFBertMainLayer(config, name='bert') + self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.bert(inputs, **kwargs) pooled_output = outputs[1] - pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) + pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) logits = self.classifier(pooled_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here @@ -885,9 +930,12 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): return outputs # logits, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForMultipleChoice(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -915,16 +963,26 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): classification_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs) - self.bert = TFBertMainLayer(config, name='bert') + self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(1, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -934,12 +992,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs @@ -956,7 +1014,14 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None - flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds] + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + ] outputs = self.bert(flat_inputs, training=training) @@ -971,9 +1036,12 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): return outputs # reshaped_logits, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForTokenClassification(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -999,22 +1067,23 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.bert = TFBertMainLayer(config, name='bert') + self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.bert(inputs, **kwargs) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here @@ -1022,9 +1091,12 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): return outputs # scores, (hidden_states), (attentions) -@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) + BERT_START_DOCSTRING, + BERT_INPUTS_DOCSTRING, +) class TFBertForQuestionAnswering(TFBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -1052,14 +1124,15 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel): start_scores, end_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.bert = TFBertMainLayer(config, name='bert') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='qa_outputs') + self.bert = TFBertMainLayer(config, name="bert") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) def call(self, inputs, **kwargs): outputs = self.bert(inputs, **kwargs) diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py index 0f9b34924f..3aba94a50d 100644 --- a/transformers/modeling_tf_ctrl.py +++ b/transformers/modeling_tf_ctrl.py @@ -32,15 +32,15 @@ logger = logging.getLogger(__name__) TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://s3.amazonaws.com/models.huggingface.co/bert/ctrl-tf_model.h5"} + def angle_defn(pos, i, d_model_size): - angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model_size)) + angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model_size)) return pos * angle_rates + def positional_encoding(position, d_model_size): # create the sinusoidal pattern for the positional encoding - angle_rads = angle_defn(np.arange(position)[:, np.newaxis], - np.arange(d_model_size)[np.newaxis, :], - d_model_size) + angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size) sines = np.sin(angle_rads[:, 0::2]) cosines = np.cos(angle_rads[:, 1::2]) @@ -49,27 +49,28 @@ def positional_encoding(position, d_model_size): pos_encoding = tf.cast(np.concatenate([sines, cosines], axis=-1), dtype=tf.float32) return pos_encoding + def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): # calculate attention matmul_qk = tf.matmul(q, k, transpose_b=True) - + dk = tf.cast(shape_list(k)[-1], tf.float32) scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) if mask is not None: - scaled_attention_logits += (mask * -1e4) + scaled_attention_logits += mask * -1e4 if attention_mask is not None: # Apply the attention mask scaled_attention_logits = scaled_attention_logits + attention_mask - attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) + attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) # Mask heads if we want to if head_mask is not None: attention_weights = attention_weights * head_mask - output = tf.matmul(attention_weights, v) + output = tf.matmul(attention_weights, v) return output, attention_weights @@ -83,11 +84,11 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): self.depth = int(d_model_size / self.num_heads) - self.Wq = tf.keras.layers.Dense(d_model_size, name='Wq') - self.Wk = tf.keras.layers.Dense(d_model_size, name='Wk') - self.Wv = tf.keras.layers.Dense(d_model_size, name='Wv') + self.Wq = tf.keras.layers.Dense(d_model_size, name="Wq") + self.Wk = tf.keras.layers.Dense(d_model_size, name="Wk") + self.Wv = tf.keras.layers.Dense(d_model_size, name="Wv") - self.dense = tf.keras.layers.Dense(d_model_size, name='dense') + self.dense = tf.keras.layers.Dense(d_model_size, name="dense") def split_into_heads(self, x, batch_size): x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) @@ -113,7 +114,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask) scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3]) attn = output[1] - original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size)) + original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size)) output = self.dense(original_size_attention) outputs = (output, present) @@ -122,22 +123,22 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): return outputs - def point_wise_feed_forward_network(d_model_size, dff, name=""): - return tf.keras.Sequential([ - tf.keras.layers.Dense(dff, activation='relu', name="0"), - tf.keras.layers.Dense(d_model_size, name="2") - ], name="ffn") + return tf.keras.Sequential( + [tf.keras.layers.Dense(dff, activation="relu", name="0"), tf.keras.layers.Dense(d_model_size, name="2")], + name="ffn", + ) class TFEncoderLayer(tf.keras.layers.Layer): - def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs): + def __init__( + self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs + ): super(TFEncoderLayer, self).__init__(**kwargs) - self.multi_head_attention = TFMultiHeadAttention(d_model_size, - num_heads, - output_attentions, - name="multi_head_attention") + self.multi_head_attention = TFMultiHeadAttention( + d_model_size, num_heads, output_attentions, name="multi_head_attention" + ) self.ffn = point_wise_feed_forward_network(d_model_size, dff, name="ffn") self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1") @@ -149,8 +150,9 @@ class TFEncoderLayer(tf.keras.layers.Layer): def call(self, inputs, training=False): x, mask, layer_past, attention_mask, head_mask = inputs normed = self.layernorm1(x) - attn_outputs = self.multi_head_attention([normed, normed, normed, mask, layer_past, - attention_mask, head_mask], training=training) + attn_outputs = self.multi_head_attention( + [normed, normed, normed, mask, layer_past, attention_mask, head_mask], training=training + ) attn_output = attn_outputs[0] attn_output = self.dropout1(attn_output, training=training) out1 = x + attn_output @@ -176,20 +178,23 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size) - - self.w = TFSharedEmbeddings(config.vocab_size, - config.n_embd, - initializer_range=config.initializer_range, - name="w") + self.w = TFSharedEmbeddings( + config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="w" + ) self.dropout = tf.keras.layers.Dropout(config.embd_pdrop) - self.h = [TFEncoderLayer(config.n_embd, - config.n_head, - config.dff, - config.resid_pdrop, - config.layer_norm_epsilon, - config.output_attentions, - name='h_._{}'.format(i)) for i in range(config.n_layer)] + self.h = [ + TFEncoderLayer( + config.n_embd, + config.n_head, + config.dff, + config.resid_pdrop, + config.layer_norm_epsilon, + config.output_attentions, + name="h_._{}".format(i), + ) + for i in range(config.n_layer) + ] self.layernorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm") def get_input_embeddings(self): @@ -204,7 +209,17 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past @@ -215,13 +230,13 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - past = inputs.get('past', past) - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + past = inputs.get("past", past) + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs @@ -276,14 +291,14 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) - token_type_embeds = self.w(token_type_ids, mode='embedding') + token_type_embeds = self.w(token_type_ids, mode="embedding") token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, tf.float32)) else: token_type_embeds = 0 position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: - inputs_embeds = self.w(input_ids, mode='embedding') + inputs_embeds = self.w(input_ids, mode="embedding") seq_len = input_shape[-1] mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) @@ -333,6 +348,7 @@ class TFCTRLPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = CTRLConfig pretrained_model_archive_map = TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -392,8 +408,12 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", - CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.", + CTRL_START_DOCSTRING, + CTRL_INPUTS_DOCSTRING, +) class TFCTRLModel(TFCTRLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -423,9 +443,10 @@ class TFCTRLModel(TFCTRLPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFCTRLModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFCTRLMainLayer(config, name='transformer') + self.transformer = TFCTRLMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) @@ -442,10 +463,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer): self.input_embeddings = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super(TFCTRLLMHead, self).build(input_shape) def call(self, hidden_states): @@ -454,8 +472,12 @@ class TFCTRLLMHead(tf.keras.layers.Layer): return hidden_states -@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING) +@add_start_docstrings( + """The CTRL Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, + CTRL_START_DOCSTRING, + CTRL_INPUTS_DOCSTRING, +) class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -486,9 +508,10 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFCTRLMainLayer(config, name='transformer') + self.transformer = TFCTRLMainLayer(config, name="transformer") self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head") diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index afd88d7ebf..e9e89d2e73 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -36,9 +36,9 @@ logger = logging.getLogger(__name__) TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5", - 'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5", + "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-tf_model.h5", + "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-tf_model.h5", + "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-multilingual-cased-tf_model.h5", } @@ -53,6 +53,7 @@ def gelu(x): cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) return x * cdf + def gelu_new(x): """Gaussian Error Linear Unit. This is a smoother version of the RELU. @@ -62,24 +63,25 @@ def gelu_new(x): Returns: `x` with the GELU activation applied. """ - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf + class TFEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFEmbeddings, self).__init__(**kwargs) self.vocab_size = config.vocab_size self.dim = config.dim self.initializer_range = config.initializer_range - self.word_embeddings = TFSharedEmbeddings(config.vocab_size, - config.dim, - initializer_range=config.initializer_range, - name='word_embeddings') # padding_idx=0) - self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, - config.dim, - embeddings_initializer=get_initializer(config.initializer_range), - name='position_embeddings') + self.word_embeddings = TFSharedEmbeddings( + config.vocab_size, config.dim, initializer_range=config.initializer_range, name="word_embeddings" + ) # padding_idx=0) + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.dim, + embeddings_initializer=get_initializer(config.initializer_range), + name="position_embeddings", + ) if config.sinusoidal_pos_embds: raise NotImplementedError @@ -92,9 +94,8 @@ class TFEmbeddings(tf.keras.layers.Layer): # Create and initialize weights. The random normal initializer was chosen # arbitrarily, and works well. self.word_embeddings = self.add_weight( - "weight", - shape=[self.vocab_size, self.dim], - initializer=get_initializer(self.initializer_range)) + "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range) + ) super(TFEmbeddings, self).build(input_shape) def call(self, inputs, inputs_embeds=None, mode="embedding", training=False): @@ -149,9 +150,9 @@ class TFEmbeddings(tf.keras.layers.Layer): inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) - embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim) - embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) - embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim) + embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim) + embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) + embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim) return embeddings def _linear(self, inputs): @@ -181,18 +182,18 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer): assert self.dim % self.n_heads == 0 - self.q_lin = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="q_lin") - self.k_lin = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="k_lin") - self.v_lin = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="v_lin") - self.out_lin = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="out_lin") + self.q_lin = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="q_lin" + ) + self.k_lin = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="k_lin" + ) + self.v_lin = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="v_lin" + ) + self.out_lin = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="out_lin" + ) self.pruned_heads = set() @@ -233,44 +234,49 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer): """ group heads """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) - q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) - k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) - v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) - q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) - scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, q_length, k_length) - mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, q_length, k_length) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) scores = scores - 1e30 * (1.0 - mask) - weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) - weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, q_length, dim) - context = self.out_lin(context) # (bs, q_length, dim) + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) if self.output_attentions: return (context, weights) else: return (context,) + class TFFFN(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFFFN, self).__init__(**kwargs) self.dropout = tf.keras.layers.Dropout(config.dropout) - self.lin1 = tf.keras.layers.Dense(config.hidden_dim, - kernel_initializer=get_initializer(config.initializer_range), - name="lin1") - self.lin2 = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="lin2") - assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation) - self.activation = tf.keras.layers.Activation(gelu) if config.activation=='gelu' else tf.keras.activations.relu + self.lin1 = tf.keras.layers.Dense( + config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1" + ) + self.lin2 = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="lin2" + ) + assert config.activation in ["relu", "gelu"], "activation ({}) must be in ['relu', 'gelu']".format( + config.activation + ) + self.activation = ( + tf.keras.layers.Activation(gelu) if config.activation == "gelu" else tf.keras.activations.relu + ) def call(self, input, training=False): x = self.lin1(input) @@ -318,14 +324,14 @@ class TFTransformerBlock(tf.keras.layers.Layer): # Self-Attention sa_output = self.attention([x, x, x, attn_mask, head_mask], training=training) if self.output_attentions: - sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) - else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples # assert type(sa_output) == tuple sa_output = sa_output[0] - sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) # Feed Forward Network - ffn_output = self.ffn(sa_output, training=training) # (bs, seq_length, dim) + ffn_output = self.ffn(sa_output, training=training) # (bs, seq_length, dim) ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) output = (ffn_output,) @@ -341,8 +347,7 @@ class TFTransformer(tf.keras.layers.Layer): self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.layer = [TFTransformerBlock(config, name='layer_._{}'.format(i)) - for i in range(config.n_layers)] + self.layer = [TFTransformerBlock(config, name="layer_._{}".format(i)) for i in range(config.n_layers)] def call(self, inputs, training=False): """ @@ -401,8 +406,8 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): super(TFDistilBertMainLayer, self).__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers - self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings - self.transformer = TFTransformer(config, name="transformer") # Encoder + self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings + self.transformer = TFTransformer(config, name="transformer") # Encoder def get_input_embeddings(self): return self.embeddings @@ -421,10 +426,10 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds assert len(inputs) <= 4, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 4, "Too many inputs." else: input_ids = inputs @@ -439,7 +444,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): raise ValueError("You have to specify either input_ids or inputs_embeds") if attention_mask is None: - attention_mask = tf.ones(input_shape) # (bs, seq_length) + attention_mask = tf.ones(input_shape) # (bs, seq_length) attention_mask = tf.cast(attention_mask, dtype=tf.float32) # Prepare head mask if needed @@ -452,10 +457,10 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): else: head_mask = [None] * self.num_hidden_layers - embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) + embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training) - return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) + return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) ### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ### @@ -463,6 +468,7 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + config_class = DistilBertConfig pretrained_model_archive_map = TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "distilbert" @@ -534,8 +540,12 @@ DISTILBERT_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.", + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class TFDistilBertModel(TFDistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -561,9 +571,10 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs) - self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings def call(self, inputs, **kwargs): outputs = self.distilbert(inputs, **kwargs) @@ -580,10 +591,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): self.input_embeddings = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super(TFDistilBertLMHead, self).build(input_shape) def call(self, hidden_states): @@ -592,8 +600,11 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): return hidden_states -@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """, - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +@add_start_docstrings( + """DistilBert Model with a `masked language modeling` head on top. """, + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -619,6 +630,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): prediction_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs) self.output_attentions = config.output_attentions @@ -626,9 +638,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): self.vocab_size = config.vocab_size self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.vocab_transform = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - name="vocab_transform") + self.vocab_transform = tf.keras.layers.Dense( + config.dim, kernel_initializer=get_initializer(config.initializer_range), name="vocab_transform" + ) self.act = tf.keras.layers.Activation(gelu) self.vocab_layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12, name="vocab_layer_norm") self.vocab_projector = TFDistilBertLMHead(config, self.distilbert.embeddings, name="vocab_projector") @@ -639,9 +651,9 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): def call(self, inputs, **kwargs): distilbert_output = self.distilbert(inputs, **kwargs) - hidden_states = distilbert_output[0] # (bs, seq_length, dim) - prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) - prediction_logits = self.act(prediction_logits) # (bs, seq_length, dim) + hidden_states = distilbert_output[0] # (bs, seq_length, dim) + prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) + prediction_logits = self.act(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) prediction_logits = self.vocab_projector(prediction_logits) @@ -649,9 +661,12 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): return outputs # logits, (hidden_states), (attentions) -@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -677,36 +692,42 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.pre_classifier = tf.keras.layers.Dense(config.dim, - kernel_initializer=get_initializer(config.initializer_range), - activation='relu', - name="pre_classifier") - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name="classifier") + self.pre_classifier = tf.keras.layers.Dense( + config.dim, + kernel_initializer=get_initializer(config.initializer_range), + activation="relu", + name="pre_classifier", + ) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) self.dropout = tf.keras.layers.Dropout(config.seq_classif_dropout) def call(self, inputs, **kwargs): distilbert_output = self.distilbert(inputs, **kwargs) - hidden_state = distilbert_output[0] # (bs, seq_len, dim) - pooled_output = hidden_state[:, 0] # (bs, dim) - pooled_output = self.pre_classifier(pooled_output) # (bs, dim) - pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) # (bs, dim) - logits = self.classifier(pooled_output) # (bs, dim) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) # (bs, dim) + logits = self.classifier(pooled_output) # (bs, dim) outputs = (logits,) + distilbert_output[1:] return outputs # logits, (hidden_states), (attentions) -@add_start_docstrings("""DistilBert Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -728,22 +749,23 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): outputs = model(input_ids) scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.distilbert = TFDistilBertMainLayer(config, name='distilbert') + self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.dropout = tf.keras.layers.Dropout(config.dropout) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.distilbert(inputs, **kwargs) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here @@ -751,9 +773,12 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): return outputs # scores, (hidden_states), (attentions) -@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) + DISTILBERT_START_DOCSTRING, + DISTILBERT_INPUTS_DOCSTRING, +) class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -781,22 +806,23 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): start_scores, end_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='qa_outputs') + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) assert config.num_labels == 2 self.dropout = tf.keras.layers.Dropout(config.qa_dropout) def call(self, inputs, **kwargs): distilbert_output = self.distilbert(inputs, **kwargs) - hidden_states = distilbert_output[0] # (bs, max_query_len, dim) - hidden_states = self.dropout(hidden_states, training=kwargs.get('training', False)) # (bs, max_query_len, dim) - logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) + hidden_states = distilbert_output[0] # (bs, max_query_len, dim) + hidden_states = self.dropout(hidden_states, training=kwargs.get("training", False)) # (bs, max_query_len, dim) + logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py index 718e8f6058..a4722fb343 100644 --- a/transformers/modeling_tf_gpt2.py +++ b/transformers/modeling_tf_gpt2.py @@ -28,17 +28,25 @@ from io import open import numpy as np import tensorflow as tf -from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings, - TFSequenceSummary, shape_list, get_initializer) +from .modeling_tf_utils import ( + TFPreTrainedModel, + TFConv1D, + TFSharedEmbeddings, + TFSequenceSummary, + shape_list, + get_initializer, +) from .configuration_gpt2 import GPT2Config from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5", - "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5", - "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5", - "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",} +TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5", +} def gelu(x): @@ -50,8 +58,7 @@ def gelu(x): Returns: `x` with the GELU activation applied. """ - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf @@ -68,8 +75,8 @@ class TFAttention(tf.keras.layers.Layer): self.split_size = n_state self.scale = scale - self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn') - self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj') + self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") + self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() @@ -82,7 +89,7 @@ class TFAttention(tf.keras.layers.Layer): """1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. """ - i = tf.range(nd)[:,None] + i = tf.range(nd)[:, None] j = tf.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) @@ -92,7 +99,7 @@ class TFAttention(tf.keras.layers.Layer): # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) if self.scale: - dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores + dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores w = w / tf.math.sqrt(dk) # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. @@ -158,8 +165,8 @@ class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super(TFMLP, self).__init__(**kwargs) nx = config.n_embd - self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc') - self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj') + self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") + self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") self.act = gelu self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) @@ -174,10 +181,10 @@ class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): super(TFBlock, self).__init__(**kwargs) nx = config.n_embd - self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1') - self.attn = TFAttention(nx, n_ctx, config, scale, name='attn') - self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2') - self.mlp = TFMLP(4 * nx, config, name='mlp') + self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") + self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") + self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") + self.mlp = TFMLP(4 * nx, config, name="mlp") def call(self, inputs, training=False): x, layer_past, attention_mask, head_mask = inputs @@ -204,20 +211,18 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): self.vocab_size = config.vocab_size self.n_embd = config.n_embd - self.wte = TFSharedEmbeddings(config.vocab_size, - config.hidden_size, - initializer_range=config.initializer_range, - name='wte') - self.wpe = tf.keras.layers.Embedding(config.n_positions, - config.n_embd, - embeddings_initializer=get_initializer(config.initializer_range), - name='wpe') + self.wte = TFSharedEmbeddings( + config.vocab_size, config.hidden_size, initializer_range=config.initializer_range, name="wte" + ) + self.wpe = tf.keras.layers.Embedding( + config.n_positions, + config.n_embd, + embeddings_initializer=get_initializer(config.initializer_range), + name="wpe", + ) self.drop = tf.keras.layers.Dropout(config.embd_pdrop) - self.h = [TFBlock(config.n_ctx, - config, - scale=True, - name='h_._{}'.format(i)) for i in range(config.n_layer)] - self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f') + self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)] + self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f") def get_input_embeddings(self): return self.wte @@ -231,7 +236,17 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past @@ -242,13 +257,13 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - past = inputs.get('past', past) - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + past = inputs.get("past", past) + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs @@ -304,11 +319,11 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: - inputs_embeds = self.wte(input_ids, mode='embedding') + inputs_embeds = self.wte(input_ids, mode="embedding") position_embeds = self.wpe(position_ids) if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) - token_type_embeds = self.wte(token_type_ids, mode='embedding') + token_type_embeds = self.wte(token_type_ids, mode="embedding") else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds @@ -353,6 +368,7 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = GPT2Config pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -428,8 +444,12 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.", - GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class TFGPT2Model(TFGPT2PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -459,17 +479,22 @@ class TFGPT2Model(TFGPT2PreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFGPT2Model, self).__init__(config, *inputs, **kwargs) - self.transformer = TFGPT2MainLayer(config, name='transformer') + self.transformer = TFGPT2MainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs -@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +@add_start_docstrings( + """The GPT2 Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -500,9 +525,10 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFGPT2MainLayer(config, name='transformer') + self.transformer = TFGPT2MainLayer(config, name="transformer") def get_output_embeddings(self): return self.transformer.wte @@ -518,11 +544,15 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): return outputs # lm_logits, presents, (all hidden_states), (attentions) -@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification +@add_start_docstrings( + """The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). -""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +""", + GPT2_START_DOCSTRING, + GPT2_INPUTS_DOCSTRING, +) class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): r""" **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``: @@ -572,16 +602,30 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): lm_prediction_scores, mc_prediction_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs) config.num_labels = 1 - self.transformer = TFGPT2MainLayer(config, name='transformer') - self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') + self.transformer = TFGPT2MainLayer(config, name="transformer") + self.multiple_choice_head = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="multiple_choice_head" + ) def get_output_embeddings(self): return self.transformer.wte - def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False): + def call( + self, + inputs, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + mc_token_ids=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past @@ -593,14 +637,14 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids assert len(inputs) <= 8, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - past = inputs.get('past', past) - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) - mc_token_ids = inputs.get('mc_token_ids', mc_token_ids) + input_ids = inputs.get("input_ids") + past = inputs.get("past", past) + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) assert len(inputs) <= 8, "Too many inputs." else: input_ids = inputs @@ -617,7 +661,15 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None - flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds] + flat_inputs = [ + flat_input_ids, + past, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + ] transformer_outputs = self.transformer(flat_inputs, training=training) hidden_states = transformer_outputs[0] diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py index 791c6dcc18..4720e3c5db 100644 --- a/transformers/modeling_tf_openai.py +++ b/transformers/modeling_tf_openai.py @@ -28,14 +28,22 @@ from io import open import numpy as np import tensorflow as tf -from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings, - TFSequenceSummary, shape_list, get_initializer) +from .modeling_tf_utils import ( + TFPreTrainedModel, + TFConv1D, + TFSharedEmbeddings, + TFSequenceSummary, + shape_list, + get_initializer, +) from .configuration_openai import OpenAIGPTConfig from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"} +TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5" +} def gelu(x): @@ -47,8 +55,7 @@ def gelu(x): Returns: `x` with the GELU activation applied. """ - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf @@ -56,9 +63,11 @@ def swish(x): return x * tf.math.sigmoid(x) -ACT_FNS = {"gelu": tf.keras.layers.Activation(gelu), - "relu": tf.keras.activations.relu, - "swish": tf.keras.layers.Activation(swish)} +ACT_FNS = { + "gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish), +} class TFAttention(tf.keras.layers.Layer): @@ -74,8 +83,8 @@ class TFAttention(tf.keras.layers.Layer): self.split_size = n_state self.scale = scale - self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn') - self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj') + self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn") + self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj") self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop) self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop) self.pruned_heads = set() @@ -88,7 +97,7 @@ class TFAttention(tf.keras.layers.Layer): """1's in the lower triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. """ - i = tf.range(nd)[:,None] + i = tf.range(nd)[:, None] j = tf.range(ns) m = i >= j - ns + nd return tf.cast(m, dtype) @@ -98,7 +107,7 @@ class TFAttention(tf.keras.layers.Layer): # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) if self.scale: - dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores + dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores w = w / tf.math.sqrt(dk) # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. @@ -159,8 +168,8 @@ class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): super(TFMLP, self).__init__(**kwargs) nx = config.n_embd - self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc') - self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj') + self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") + self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") self.act = gelu self.dropout = tf.keras.layers.Dropout(config.resid_pdrop) @@ -175,10 +184,10 @@ class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): super(TFBlock, self).__init__(**kwargs) nx = config.n_embd - self.attn = TFAttention(nx, n_ctx, config, scale, name='attn') - self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1') - self.mlp = TFMLP(4 * nx, config, name='mlp') - self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2') + self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") + self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") + self.mlp = TFMLP(4 * nx, config, name="mlp") + self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") def call(self, inputs, training=False): x, attention_mask, head_mask = inputs @@ -203,19 +212,17 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): self.vocab_size = config.vocab_size self.n_embd = config.n_embd - self.tokens_embed = TFSharedEmbeddings(config.vocab_size, - config.n_embd, - initializer_range=config.initializer_range, - name='tokens_embed') - self.positions_embed = tf.keras.layers.Embedding(config.n_positions, - config.n_embd, - embeddings_initializer=get_initializer(config.initializer_range), - name='positions_embed') + self.tokens_embed = TFSharedEmbeddings( + config.vocab_size, config.n_embd, initializer_range=config.initializer_range, name="tokens_embed" + ) + self.positions_embed = tf.keras.layers.Embedding( + config.n_positions, + config.n_embd, + embeddings_initializer=get_initializer(config.initializer_range), + name="positions_embed", + ) self.drop = tf.keras.layers.Dropout(config.embd_pdrop) - self.h = [TFBlock(config.n_ctx, - config, - scale=True, - name='h_._{}'.format(i)) for i in range(config.n_layer)] + self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i)) for i in range(config.n_layer)] def get_input_embeddings(self): return self.tokens_embed @@ -229,7 +236,16 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -239,12 +255,12 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs @@ -295,11 +311,11 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) if inputs_embeds is None: - inputs_embeds = self.tokens_embed(input_ids, mode='embedding') + inputs_embeds = self.tokens_embed(input_ids, mode="embedding") position_embeds = self.positions_embed(position_ids) if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) - token_type_embeds = self.tokens_embed(token_type_ids, mode='embedding') + token_type_embeds = self.tokens_embed(token_type_ids, mode="embedding") else: token_type_embeds = 0 hidden_states = inputs_embeds + position_embeds + token_type_embeds @@ -338,6 +354,7 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = OpenAIGPTConfig pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -409,8 +426,12 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.", - OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.", + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -436,17 +457,22 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') + self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs -@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top -(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +@add_start_docstrings( + """OpenAI GPT Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -472,9 +498,10 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') + self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") def get_output_embeddings(self): return self.transformer.tokens_embed @@ -490,11 +517,15 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): return outputs # lm_logits, (all hidden_states), (attentions) -@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification +@add_start_docstrings( + """OpenAI GPT Model transformer with a language modeling and a multiple-choice classification head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the input embeddings, the classification head takes as input the input of a specified classification token index in the input sequence). -""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +""", + OPENAI_GPT_START_DOCSTRING, + OPENAI_GPT_INPUTS_DOCSTRING, +) class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): r""" **mc_token_ids**: (`optional`, default to index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``: @@ -536,16 +567,29 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): lm_prediction_scores, mc_prediction_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs) config.num_labels = 1 - self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') - self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') + self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") + self.multiple_choice_head = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="multiple_choice_head" + ) def get_output_embeddings(self): return self.transformer.tokens_embed - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + mc_token_ids=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -556,13 +600,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) - mc_token_ids = inputs.get('mc_token_ids', mc_token_ids) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + mc_token_ids = inputs.get("mc_token_ids", mc_token_ids) assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs @@ -579,7 +623,14 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None - flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds] + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + ] transformer_outputs = self.transformer(flat_inputs, training=training) hidden_states = transformer_outputs[0] diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py index 190caff18d..d1073d23a3 100644 --- a/transformers/modeling_tf_pytorch_utils.py +++ b/transformers/modeling_tf_pytorch_utils.py @@ -15,8 +15,7 @@ # limitations under the License. """ PyTorch - TF 2.0 general utilities.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -25,7 +24,8 @@ import numpy logger = logging.getLogger(__name__) -def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=''): + +def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=""): """ Convert a TF 2.0 model variable name in a pytorch model weight name. Conventions for TF2.0 scopes -> PyTorch attribute names conversions: @@ -36,26 +36,30 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove='') - pytorch model weight name - transpose: boolean indicating weither TF2.0 and PyTorch weights matrices are transposed with regards to each other """ - tf_name = tf_name.replace(':0', '') # device ids - tf_name = re.sub(r'/[^/]*___([^/]*)/', r'/\1/', tf_name) # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) - tf_name = tf_name.replace('_._', '/') # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) - tf_name = re.sub(r'//+', '/', tf_name) # Remove empty levels at the end - tf_name = tf_name.split('/') # Convert from TF2.0 '/' separators to PyTorch '.' separators - tf_name = tf_name[1:] # Remove level zero + tf_name = tf_name.replace(":0", "") # device ids + tf_name = re.sub( + r"/[^/]*___([^/]*)/", r"/\1/", tf_name + ) # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) + tf_name = tf_name.replace( + "_._", "/" + ) # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) + tf_name = re.sub(r"//+", "/", tf_name) # Remove empty levels at the end + tf_name = tf_name.split("/") # Convert from TF2.0 '/' separators to PyTorch '.' separators + tf_name = tf_name[1:] # Remove level zero # When should we transpose the weights - transpose = bool(tf_name[-1] == 'kernel' or 'emb_projs' in tf_name or 'out_projs' in tf_name) + transpose = bool(tf_name[-1] == "kernel" or "emb_projs" in tf_name or "out_projs" in tf_name) # Convert standard TF2.0 names in PyTorch names - if tf_name[-1] == 'kernel' or tf_name[-1] == 'embeddings' or tf_name[-1] == 'gamma': - tf_name[-1] = 'weight' - if tf_name[-1] == 'beta': - tf_name[-1] = 'bias' + if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma": + tf_name[-1] = "weight" + if tf_name[-1] == "beta": + tf_name[-1] = "bias" # Remove prefix if needed - tf_name = '.'.join(tf_name) + tf_name = ".".join(tf_name) if start_prefix_to_remove: - tf_name = tf_name.replace(start_prefix_to_remove, '', 1) + tf_name = tf_name.replace(start_prefix_to_remove, "", 1) return tf_name, transpose @@ -63,6 +67,7 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove='') ##################### ### PyTorch => TF 2.0 + def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False): """ Load pytorch checkpoints in a TF 2.0 model """ @@ -70,17 +75,21 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i import tensorflow as tf import torch except ImportError as e: - logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) raise e pt_path = os.path.abspath(pytorch_checkpoint_path) logger.info("Loading PyTorch weights from {}".format(pt_path)) - pt_state_dict = torch.load(pt_path, map_location='cpu') + pt_state_dict = torch.load(pt_path, map_location="cpu") logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values()))) - return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys) + return load_pytorch_weights_in_tf2_model( + tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys + ) def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False): @@ -88,7 +97,9 @@ def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_mi """ pt_state_dict = pt_model.state_dict() - return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys) + return load_pytorch_weights_in_tf2_model( + tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys + ) def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False): @@ -99,8 +110,10 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a import tensorflow as tf from tensorflow.python.keras import backend as K except ImportError as e: - logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) raise e if tf_inputs is None: @@ -115,10 +128,10 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a new_keys = [] for key in pt_state_dict.keys(): new_key = None - if 'gamma' in key: - new_key = key.replace('gamma', 'weight') - if 'beta' in key: - new_key = key.replace('beta', 'bias') + if "gamma" in key: + new_key = key.replace("gamma", "weight") + if "beta" in key: + new_key = key.replace("beta", "bias") if new_key: old_keys.append(key) new_keys.append(new_key) @@ -127,9 +140,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a # Make sure we are able to load PyTorch base models as well as derived models (with heads) # TF models always have a prefix, some of PyTorch models (base ones) don't - start_prefix_to_remove = '' + start_prefix_to_remove = "" if not any(s.startswith(tf_model.base_model_prefix) for s in pt_state_dict.keys()): - start_prefix_to_remove = tf_model.base_model_prefix + '.' + start_prefix_to_remove = tf_model.base_model_prefix + "." symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights tf_loaded_numel = 0 @@ -137,7 +150,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a all_pytorch_weights = set(list(pt_state_dict.keys())) for symbolic_weight in symbolic_weights: sw_name = symbolic_weight.name - name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove) + name, transpose = convert_tf_weight_name_to_pt_weight_name( + sw_name, start_prefix_to_remove=start_prefix_to_remove + ) # Find associated numpy array in pytorch model state dict if name not in pt_state_dict: @@ -182,6 +197,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a ##################### ### TF 2.0 => PyTorch + def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False): """ Load TF 2.0 HDF5 checkpoint in a PyTorch model We use HDF5 to easily do transfer learning @@ -191,8 +207,10 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs import tensorflow as tf import torch except ImportError as e: - logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) raise e import transformers @@ -215,6 +233,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys) + def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False): """ Load TF 2.0 model in a pytorch model """ @@ -230,8 +249,10 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F import tensorflow as tf import torch except ImportError as e: - logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) raise e new_pt_params_dict = {} @@ -239,14 +260,16 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F # Make sure we are able to load PyTorch base models as well as derived models (with heads) # TF models always have a prefix, some of PyTorch models (base ones) don't - start_prefix_to_remove = '' + start_prefix_to_remove = "" if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()): - start_prefix_to_remove = pt_model.base_model_prefix + '.' + start_prefix_to_remove = pt_model.base_model_prefix + "." # Build a map from potential PyTorch weight names to TF 2.0 Variables tf_weights_map = {} for tf_weight in tf_weights: - pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(tf_weight.name, start_prefix_to_remove=start_prefix_to_remove) + pt_name, transpose = convert_tf_weight_name_to_pt_weight_name( + tf_weight.name, start_prefix_to_remove=start_prefix_to_remove + ) tf_weights_map[pt_name] = (tf_weight.numpy(), transpose) all_tf_weights = set(list(tf_weights_map.keys())) @@ -291,11 +314,13 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F missing_keys += missing_keys_pt if len(missing_keys) > 0: - logger.info("Weights of {} not initialized from TF 2.0 model: {}".format( - pt_model.__class__.__name__, missing_keys)) + logger.info( + "Weights of {} not initialized from TF 2.0 model: {}".format(pt_model.__class__.__name__, missing_keys) + ) if len(unexpected_keys) > 0: - logger.info("Weights from TF 2.0 model not used in {}: {}".format( - pt_model.__class__.__name__, unexpected_keys)) + logger.info( + "Weights from TF 2.0 model not used in {}: {}".format(pt_model.__class__.__name__, unexpected_keys) + ) logger.info("Weights or buffers not loaded from TF 2.0 model: {}".format(all_tf_weights)) diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index 15282bd6cc..136ab66157 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -15,8 +15,7 @@ # limitations under the License. """ TF 2.0 RoBERTa model. """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging @@ -31,16 +30,18 @@ from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new logger = logging.getLogger(__name__) TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5", - 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tf_model.h5", + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tf_model.h5", } + class TFRobertaEmbeddings(TFBertEmbeddings): """ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. """ + def __init__(self, config, **kwargs): super(TFRobertaEmbeddings, self).__init__(config, **kwargs) self.padding_idx = 1 @@ -64,9 +65,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings): """ seq_length = shape_list(inputs_embeds)[1] - position_ids = tf.range(self.padding_idx + 1, - seq_length + self.padding_idx + 1, - dtype=tf.int32)[tf.newaxis, :] + position_ids = tf.range(self.padding_idx + 1, seq_length + self.padding_idx + 1, dtype=tf.int32)[tf.newaxis, :] return position_ids def _embedding(self, inputs, training=False): @@ -80,16 +79,19 @@ class TFRobertaEmbeddings(TFBertEmbeddings): else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) - return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) + return super(TFRobertaEmbeddings, self)._embedding( + [input_ids, position_ids, token_type_ids, inputs_embeds], training=training + ) class TFRobertaMainLayer(TFBertMainLayer): """ Same as TFBertMainLayer but uses TFRobertaEmbeddings. """ + def __init__(self, config, **kwargs): super(TFRobertaMainLayer, self).__init__(config, **kwargs) - self.embeddings = TFRobertaEmbeddings(config, name='embeddings') + self.embeddings = TFRobertaEmbeddings(config, name="embeddings") def get_input_embeddings(self): return self.embeddings @@ -99,6 +101,7 @@ class TFRobertaPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = RobertaConfig pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "roberta" @@ -192,8 +195,12 @@ ROBERTA_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.", - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.", + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class TFRobertaModel(TFRobertaPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -226,9 +233,10 @@ class TFRobertaModel(TFRobertaPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFRobertaModel, self).__init__(config, *inputs, **kwargs) - self.roberta = TFRobertaMainLayer(config, name='roberta') + self.roberta = TFRobertaMainLayer(config, name="roberta") def call(self, inputs, **kwargs): outputs = self.roberta(inputs, **kwargs) @@ -237,13 +245,14 @@ class TFRobertaModel(TFRobertaPreTrainedModel): class TFRobertaLMHead(tf.keras.layers.Layer): """Roberta Head for masked language modeling.""" + def __init__(self, config, input_embeddings, **kwargs): super(TFRobertaLMHead, self).__init__(**kwargs) self.vocab_size = config.vocab_size - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - name='dense') - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.act = tf.keras.layers.Activation(gelu) # The output weights are the same as the input embeddings, but there is @@ -251,10 +260,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): self.decoder = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super(TFRobertaLMHead, self).build(input_shape) def call(self, features): @@ -268,8 +274,9 @@ class TFRobertaLMHead(tf.keras.layers.Layer): return x -@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +@add_start_docstrings( + """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING +) class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -297,6 +304,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): prediction_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs) @@ -322,14 +330,16 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFRobertaClassificationHead, self).__init__(config, **kwargs) - self.dense = tf.keras.layers.Dense(config.hidden_size, - kernel_initializer=get_initializer(config.initializer_range), - activation='tanh', - name="dense") + self.dense = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.out_proj = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name="out_proj") + self.out_proj = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) def call(self, features, training=False): x = features[:, 0, :] # take token (equiv. to [CLS]) @@ -340,9 +350,12 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer): return x -@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer +@add_start_docstrings( + """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -369,27 +382,31 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") self.classifier = TFRobertaClassificationHead(config, name="classifier") - + def call(self, inputs, **kwargs): outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] - logits = self.classifier(sequence_output, training=kwargs.get('training', False)) + logits = self.classifier(sequence_output, training=kwargs.get("training", False)) outputs = (logits,) + outputs[2:] return outputs # logits, (hidden_states), (attentions) -@add_start_docstrings("""RoBERTa Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) + ROBERTA_START_DOCSTRING, + ROBERTA_INPUTS_DOCSTRING, +) class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -415,22 +432,23 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.roberta = TFRobertaMainLayer(config, name='roberta') + self.roberta = TFRobertaMainLayer(config, name="roberta") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): outputs = self.roberta(inputs, **kwargs) sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index e803e00c8d..38a2bf4190 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -31,11 +31,11 @@ from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK logger = logging.getLogger(__name__) TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = { - 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5", - 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5", - 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5", - 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5", - 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5", + "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5", + "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5", + "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5", + "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5", + "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5", } #################################################### @@ -44,6 +44,7 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = { # - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model) #################################################### + class TFT5LayerNorm(tf.keras.layers.Layer): def __init__(self, epsilon=1e-6, **kwargs): """ Construct a layernorm module in the T5 style @@ -54,10 +55,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer): def build(self, input_shape): """Build shared word embedding layer """ - self.weight = self.add_weight( - "weight", - shape=(input_shape[-1],), - initializer='ones') + self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones") super(TFT5LayerNorm, self).build(input_shape) def call(self, x): @@ -69,8 +67,8 @@ class TFT5LayerNorm(tf.keras.layers.Layer): class TFT5DenseReluDense(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFT5DenseReluDense, self).__init__(**kwargs) - self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name='wi') - self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name='wo') + self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi") + self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) self.act = tf.keras.activations.relu @@ -85,9 +83,8 @@ class TFT5DenseReluDense(tf.keras.layers.Layer): class TFT5LayerFF(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFT5LayerFF, self).__init__(**kwargs) - self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense') - self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, - name='layer_norm') + self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense") + self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call(self, hidden_states, training=False): @@ -114,26 +111,23 @@ class TFT5Attention(tf.keras.layers.Layer): self.inner_dim = self.n_heads * self.d_kv # Mesh TensorFlow initialization to avoid scaling before softmax - self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='q') - self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='k') - self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='v') - self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name='o') + self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="q") + self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="k") + self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name="v") + self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name="o") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) if self.has_relative_attention_bias: - self.relative_attention_bias = tf.keras.layers.Embedding(self.relative_attention_num_buckets, - self.n_heads, - name='relative_attention_bias') + self.relative_attention_bias = tf.keras.layers.Embedding( + self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias" + ) self.pruned_heads = set() def prune_heads(self, heads): raise NotImplementedError @staticmethod - def _relative_position_bucket(relative_position, - bidirectional=True, - num_buckets=32, - max_distance=128): + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): """ Adapted from Mesh Tensorflow: https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 @@ -170,7 +164,10 @@ class TFT5Attention(tf.keras.layers.Layer): is_small = tf.math.less(n, max_exact) val_if_large = max_exact + tf.dtypes.cast( tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact) - / math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact), + tf.int32, + ) val_if_large = tf.math.minimum(val_if_large, num_buckets - 1) ret += tf.where(is_small, n, val_if_large) return ret @@ -180,11 +177,11 @@ class TFT5Attention(tf.keras.layers.Layer): context_position = tf.range(qlen)[:, None] memory_position = tf.range(klen)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) - rp_bucket = self._relative_position_bucket(relative_position, - bidirectional=not self.is_decoder, - num_buckets=self.relative_attention_num_buckets) + rp_bucket = self._relative_position_bucket( + relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets + ) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) - values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen) + values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen) return values def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False): @@ -195,7 +192,7 @@ class TFT5Attention(tf.keras.layers.Layer): # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = shape_list(input) if kv is None: - klen = qlen if cache is None else cache['slen'] + qlen + klen = qlen if cache is None else cache["slen"] + qlen else: klen = shape_list(kv)[1] @@ -207,28 +204,28 @@ class TFT5Attention(tf.keras.layers.Layer): """ compute context """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim)) - q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) + q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: - k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv - k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] - k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) - v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) + k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) + v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) # q = q / math.sqrt(dim_per_head) # No scaling in T5 # scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) - scores = tf.einsum('bnqd,bnkd->bnqk', q, k) # (bs, n_heads, qlen, klen) + scores = tf.einsum("bnqd,bnkd->bnqk", q, k) # (bs, n_heads, qlen, klen) if position_bias is None: if not self.has_relative_attention_bias: @@ -240,15 +237,15 @@ class TFT5Attention(tf.keras.layers.Layer): # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) scores += position_bias - weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) - weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, qlen, dim) + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) context = self.o(context) @@ -263,21 +260,17 @@ class TFT5Attention(tf.keras.layers.Layer): class TFT5LayerSelfAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super(TFT5LayerSelfAttention, self).__init__(**kwargs) - self.SelfAttention = TFT5Attention(config, - has_relative_attention_bias=has_relative_attention_bias, - name='SelfAttention') - self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, - name='layer_norm') + self.SelfAttention = TFT5Attention( + config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention" + ) + self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) - def call(self, hidden_states, attention_mask=None, position_bias=None, - head_mask=None, training=False): + def call(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None, training=False): norm_x = self.layer_norm(hidden_states) - attention_output = self.SelfAttention(norm_x, - mask=attention_mask, - position_bias=position_bias, - head_mask=head_mask, - training=training) + attention_output = self.SelfAttention( + norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask, training=training + ) y = attention_output[0] layer_output = hidden_states + self.dropout(y, training=training) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -287,22 +280,17 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer): class TFT5LayerCrossAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super(TFT5LayerCrossAttention, self).__init__(**kwargs) - self.EncDecAttention = TFT5Attention(config, - has_relative_attention_bias=has_relative_attention_bias, - name='EncDecAttention') - self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, - name='layer_norm') + self.EncDecAttention = TFT5Attention( + config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention" + ) + self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) - def call(self, hidden_states, kv, attention_mask=None, position_bias=None, - head_mask=None, training=False): + def call(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None, training=False): norm_x = self.layer_norm(hidden_states) - attention_output = self.EncDecAttention(norm_x, - mask=attention_mask, - kv=kv, - position_bias=position_bias, - head_mask=head_mask, - training=training) + attention_output = self.EncDecAttention( + norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask, training=training + ) y = attention_output[0] layer_output = hidden_states + self.dropout(y, training=training) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them @@ -314,43 +302,57 @@ class TFT5Block(tf.keras.layers.Layer): super(TFT5Block, self).__init__(**kwargs) self.is_decoder = config.is_decoder self.layer = [] - self.layer.append(TFT5LayerSelfAttention(config, - has_relative_attention_bias=has_relative_attention_bias, - name='layer_._0')) + self.layer.append( + TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0") + ) if self.is_decoder: - self.layer.append(TFT5LayerCrossAttention(config, - has_relative_attention_bias=has_relative_attention_bias, - name='layer_._1')) - self.layer.append(TFT5LayerFF(config, name='layer_._2')) + self.layer.append( + TFT5LayerCrossAttention( + config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1" + ) + ) + self.layer.append(TFT5LayerFF(config, name="layer_._2")) else: - self.layer.append(TFT5LayerFF(config, name='layer_._1')) + self.layer.append(TFT5LayerFF(config, name="layer_._1")) - def call(self, hidden_states, attention_mask=None, position_bias=None, - encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, - head_mask=None, training=False): - self_attention_outputs = self.layer[0](hidden_states, - attention_mask=attention_mask, - position_bias=position_bias, - head_mask=head_mask, - training=training) + def call( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + head_mask=None, + training=False, + ): + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + head_mask=head_mask, + training=training, + ) hidden_states = self_attention_outputs[0] outputs = self_attention_outputs[1:] if not self.is_decoder: hidden_states = self.layer[1](hidden_states, training=training) else: - cross_attention_outputs = self.layer[1](hidden_states, - kv=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - head_mask=head_mask, - training=training) + cross_attention_outputs = self.layer[1]( + hidden_states, + kv=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + head_mask=head_mask, + training=training, + ) hidden_states = cross_attention_outputs[0] outputs = outputs + cross_attention_outputs[1:] hidden_states = self.layer[2](hidden_states, training=training) outputs = (hidden_states,) + outputs # add attentions if we output them - return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) + return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) #################################################### @@ -366,12 +368,11 @@ class TFT5MainLayer(tf.keras.layers.Layer): self.config = config self.num_hidden_layers = config.num_layers - self.block = [TFT5Block(config, - has_relative_attention_bias=bool(i == 0), - name='block_._{}'.format(i)) - for i in range(config.num_layers)] - self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, - name='final_layer_norm') + self.block = [ + TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i)) + for i in range(config.num_layers) + ] + self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def _resize_token_embeddings(self, new_num_tokens): @@ -380,8 +381,15 @@ class TFT5MainLayer(tf.keras.layers.Layer): def _prune_heads(self, heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models - def call(self, hidden_states, attention_mask=None, encoder_hidden_states=None, - encoder_attention_mask=None, head_mask=None, training=False): + def call( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + training=False, + ): batch_size, seq_length = shape_list(hidden_states)[:2] if attention_mask is None: @@ -397,13 +405,14 @@ class TFT5MainLayer(tf.keras.layers.Layer): if num_dims_attention_mask == 3: extended_attention_mask = attention_mask[:, None, :, :] elif num_dims_attention_mask == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] if self.config.is_decoder: seq_ids = tf.range(seq_length) - causal_mask = tf.less_equal(tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)), - seq_ids[None, :, None]) + causal_mask = tf.less_equal( + tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)), seq_ids[None, :, None] + ) causal_mask = tf.cast(causal_mask, dtype=tf.float32) extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] else: @@ -460,14 +469,16 @@ class TFT5MainLayer(tf.keras.layers.Layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - layer_outputs = layer_module(hidden_states, - attention_mask=extended_attention_mask, - position_bias=position_bias, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - encoder_decoder_position_bias=encoder_decoder_position_bias, - head_mask=head_mask[i], - training=training) + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + head_mask=head_mask[i], + training=training, + ) hidden_states = layer_outputs[0] if i == 0: # We share the position biases between the layers - the first layer store them @@ -505,6 +516,7 @@ class TFT5PreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = T5Config pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -513,9 +525,11 @@ class TFT5PreTrainedModel(TFPreTrainedModel): def dummy_inputs(self): input_ids = tf.constant(DUMMY_INPUTS) input_mask = tf.constant(DUMMY_MASK) - dummy_inputs = {'decoder_input_ids': input_ids, - 'encoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + dummy_inputs = { + "decoder_input_ids": input_ids, + "encoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } return dummy_inputs @@ -586,9 +600,12 @@ T5_INPUTS_DOCSTRING = r""" ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ -@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states" - "without any specific head on top.", - T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", + T5_START_DOCSTRING, + T5_INPUTS_DOCSTRING, +) class TFT5Model(TFT5PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -614,17 +631,17 @@ class TFT5Model(TFT5PreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFT5Model, self).__init__(config, *inputs, **kwargs) - self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, - name='shared') + self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared") encoder_config = copy.deepcopy(config) - self.encoder = TFT5MainLayer(encoder_config, name='encoder') + self.encoder = TFT5MainLayer(encoder_config, name="encoder") decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True - self.decoder = TFT5MainLayer(decoder_config, name='decoder') + self.decoder = TFT5MainLayer(decoder_config, name="decoder") def get_input_embeddings(self): return self.shared @@ -641,14 +658,15 @@ class TFT5Model(TFT5PreTrainedModel): if isinstance(decoder_input_ids, dict): kwargs.update(decoder_input_ids) else: - kwargs['decoder_input_ids'] = decoder_input_ids + kwargs["decoder_input_ids"] = decoder_input_ids - kwargs_common = dict((k, v) for k, v in kwargs.items() - if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_common = dict( + (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_") + ) kwargs_encoder = kwargs_common.copy() kwargs_decoder = kwargs_common.copy() - kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) - kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_"))) # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) @@ -678,8 +696,7 @@ class TFT5Model(TFT5PreTrainedModel): return decoder_outputs + encoder_outputs -@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, - T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) +@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) class TFT5WithLMHeadModel(TFT5PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -705,19 +722,19 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel): prediction_scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs) self.model_dim = config.d_model - self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, - name='shared') + self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared") encoder_config = copy.deepcopy(config) - self.encoder = TFT5MainLayer(encoder_config, name='encoder') + self.encoder = TFT5MainLayer(encoder_config, name="encoder") decoder_config = copy.deepcopy(config) decoder_config.is_decoder = True - self.decoder = TFT5MainLayer(decoder_config, name='decoder') + self.decoder = TFT5MainLayer(decoder_config, name="decoder") def get_input_embeddings(self): return self.shared @@ -734,14 +751,15 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel): if isinstance(decoder_input_ids, dict): kwargs.update(decoder_input_ids) else: - kwargs['decoder_input_ids'] = decoder_input_ids + kwargs["decoder_input_ids"] = decoder_input_ids - kwargs_common = dict((k, v) for k, v in kwargs.items() - if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_common = dict( + (k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_") + ) kwargs_encoder = kwargs_common.copy() kwargs_decoder = kwargs_common.copy() - kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) - kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_"))) # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index 08bbe74032..fc7ea932aa 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -37,9 +37,10 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5", + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5", } + class TFPositionalEmbedding(tf.keras.layers.Layer): def __init__(self, demb, **kwargs): super(TFPositionalEmbedding, self).__init__(**kwargs) @@ -47,7 +48,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer): self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb)) def call(self, pos_seq, bsz=None): - sinusoid_inp = tf.einsum('i,j->ij', pos_seq, self.inv_freq) + sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq) pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1) if bsz is not None: @@ -64,17 +65,14 @@ class TFPositionwiseFF(tf.keras.layers.Layer): self.d_inner = d_inner self.dropout = dropout - self.layer_1 = tf.keras.layers.Dense(d_inner, - kernel_initializer=get_initializer(init_std), - activation=tf.nn.relu, - name='CoreNet_._0') + self.layer_1 = tf.keras.layers.Dense( + d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0" + ) self.drop_1 = tf.keras.layers.Dropout(dropout) - self.layer_2 = tf.keras.layers.Dense(d_model, - kernel_initializer=get_initializer(init_std), - name='CoreNet_._3') + self.layer_2 = tf.keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3") self.drop_2 = tf.keras.layers.Dropout(dropout) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.pre_lnorm = pre_lnorm @@ -103,10 +101,24 @@ class TFPositionwiseFF(tf.keras.layers.Layer): class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): - def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, - tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, - r_r_bias=None, r_w_bias=None, output_attentions=False, - layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): + def __init__( + self, + n_head, + d_model, + d_head, + dropout, + dropatt=0, + tgt_len=None, + ext_len=None, + mem_len=None, + pre_lnorm=False, + r_r_bias=None, + r_w_bias=None, + output_attentions=False, + layer_norm_epsilon=1e-5, + init_std=0.02, + **kwargs + ): super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs) self.output_attentions = output_attentions @@ -115,46 +127,41 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): self.d_head = d_head self.dropout = dropout - self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head, - kernel_initializer=get_initializer(init_std), - use_bias=False, - name='qkv_net') + self.qkv_net = tf.keras.layers.Dense( + 3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net" + ) self.drop = tf.keras.layers.Dropout(dropout) self.dropatt = tf.keras.layers.Dropout(dropatt) - self.o_net = tf.keras.layers.Dense(d_model, - kernel_initializer=get_initializer(init_std), - use_bias=False, - name='o_net') + self.o_net = tf.keras.layers.Dense( + d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net" + ) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") self.scale = 1 / (d_head ** 0.5) self.pre_lnorm = pre_lnorm - if r_r_bias is not None and r_w_bias is not None: # Biases are shared + if r_r_bias is not None and r_w_bias is not None: # Biases are shared self.r_r_bias = r_r_bias self.r_w_bias = r_w_bias else: self.r_r_bias = None self.r_w_bias = None - self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head, - kernel_initializer=get_initializer(init_std), - use_bias=False, - name='r_net') + self.r_net = tf.keras.layers.Dense( + self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net" + ) def build(self, input_shape): - if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared - self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, - name='r_r_bias') - self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, - name='r_w_bias') + if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared + self.r_r_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" + ) + self.r_w_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" + ) super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape) def _rel_shift(self, x): @@ -196,14 +203,14 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head - r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head + r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head #### compute attention score - rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head - AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k) # qlen x klen x bsz x n_head + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k) # qlen x klen x bsz x n_head rr_head_q = w_head_q + self.r_r_bias - BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k) # qlen x klen x bsz x n_head + BD = tf.einsum("ibnd,jnd->ijbn", rr_head_q, r_head_k) # qlen x klen x bsz x n_head BD = self._rel_shift(BD) # [qlen x klen x bsz x n_head] @@ -224,12 +231,11 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): attn_prob = attn_prob * head_mask #### compute attention vector - attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v) + attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v) # [qlen x bsz x n_head x d_head] attn_vec_sizes = shape_list(attn_vec) - attn_vec = tf.reshape(attn_vec, - (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head)) + attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head)) ##### linear projection attn_out = self.o_net(attn_vec) @@ -249,32 +255,57 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): - def __init__(self, n_head, d_model, d_head, d_inner, dropout, - tgt_len=None, ext_len=None, mem_len=None, - dropatt=0., pre_lnorm=False, - r_w_bias=None, - r_r_bias=None, - output_attentions=False, - layer_norm_epsilon=1e-5, - init_std=0.02, - **kwargs): + def __init__( + self, + n_head, + d_model, + d_head, + d_inner, + dropout, + tgt_len=None, + ext_len=None, + mem_len=None, + dropatt=0.0, + pre_lnorm=False, + r_w_bias=None, + r_r_bias=None, + output_attentions=False, + layer_norm_epsilon=1e-5, + init_std=0.02, + **kwargs + ): super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs) - self.dec_attn = TFRelPartialLearnableMultiHeadAttn(n_head, d_model, - d_head, dropout, tgt_len=tgt_len, ext_len=ext_len, - mem_len=mem_len, dropatt=dropatt, pre_lnorm=pre_lnorm, - r_w_bias=r_w_bias, r_r_bias=r_r_bias, init_std=init_std, - output_attentions=output_attentions, - layer_norm_epsilon=layer_norm_epsilon, name='dec_attn') - self.pos_ff = TFPositionwiseFF(d_model, d_inner, dropout, - pre_lnorm=pre_lnorm, init_std=init_std, - layer_norm_epsilon=layer_norm_epsilon, - name='pos_ff') + self.dec_attn = TFRelPartialLearnableMultiHeadAttn( + n_head, + d_model, + d_head, + dropout, + tgt_len=tgt_len, + ext_len=ext_len, + mem_len=mem_len, + dropatt=dropatt, + pre_lnorm=pre_lnorm, + r_w_bias=r_w_bias, + r_r_bias=r_r_bias, + init_std=init_std, + output_attentions=output_attentions, + layer_norm_epsilon=layer_norm_epsilon, + name="dec_attn", + ) + self.pos_ff = TFPositionwiseFF( + d_model, + d_inner, + dropout, + pre_lnorm=pre_lnorm, + init_std=init_std, + layer_norm_epsilon=layer_norm_epsilon, + name="pos_ff", + ) def call(self, inputs, training=False): dec_inp, r, dec_attn_mask, mems, head_mask = inputs - attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask, - mems, head_mask], training=training) + attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask, mems, head_mask], training=training) ff_output = self.pos_ff(attn_outputs[0], training=training) outputs = [ff_output] + attn_outputs[1:] @@ -283,8 +314,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): class TFAdaptiveEmbedding(tf.keras.layers.Layer): - def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, - sample_softmax=False, **kwargs): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs): super(TFAdaptiveEmbedding, self).__init__(**kwargs) self.n_token = n_token @@ -305,20 +335,28 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint else: for i in range(len(self.cutoffs)): - l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) - self.emb_layers.append(tf.keras.layers.Embedding(r_idx-l_idx, - d_emb_i, - embeddings_initializer=get_initializer(init_std), - name='emb_layers_._{}'.format(i))) + self.emb_layers.append( + tf.keras.layers.Embedding( + r_idx - l_idx, + d_emb_i, + embeddings_initializer=get_initializer(init_std), + name="emb_layers_._{}".format(i), + ) + ) def build(self, input_shape): for i in range(len(self.cutoffs)): d_emb_i = self.d_embed // (self.div_val ** i) - self.emb_projs.append(self.add_weight(shape=(d_emb_i, self.d_proj), - initializer=get_initializer(self.init_std), - trainable=True, - name='emb_projs_._{}'.format(i))) + self.emb_projs.append( + self.add_weight( + shape=(d_emb_i, self.d_proj), + initializer=get_initializer(self.init_std), + trainable=True, + name="emb_projs_._{}".format(i), + ) + ) super(TFAdaptiveEmbedding, self).build(input_shape) def call(self, inp): @@ -334,7 +372,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx emb_i = self.emb_layers[i](inp_i) - emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i]) + emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i]) mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64) emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64)) @@ -361,8 +399,15 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): self.d_head = config.d_head self.untie_r = config.untie_r - self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, - div_val=config.div_val, init_std=config.init_std, name='word_emb') + self.word_emb = TFAdaptiveEmbedding( + config.vocab_size, + config.d_embed, + config.d_model, + config.cutoffs, + div_val=config.div_val, + init_std=config.init_std, + name="word_emb", + ) self.drop = tf.keras.layers.Dropout(config.dropout) @@ -376,41 +421,47 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): self.attn_type = config.attn_type self.layers = [] - if config.attn_type == 0: # the default attention + if config.attn_type == 0: # the default attention for i in range(config.n_layer): self.layers.append( TFRelPartialLearnableDecoderLayer( - config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, - tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, - dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + config.n_head, + config.d_model, + config.d_head, + config.d_inner, + config.dropout, + tgt_len=config.tgt_len, + ext_len=config.ext_len, + mem_len=config.mem_len, + dropatt=config.dropatt, + pre_lnorm=config.pre_lnorm, r_w_bias=None if self.untie_r else self.r_w_bias, r_r_bias=None if self.untie_r else self.r_r_bias, output_attentions=self.output_attentions, layer_norm_epsilon=config.layer_norm_epsilon, init_std=config.init_std, - name='layers_._{}'.format(i)) + name="layers_._{}".format(i), + ) ) - else: # learnable embeddings and absolute embeddings + else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint self.same_length = config.same_length self.clamp_len = config.clamp_len - if self.attn_type == 0: # default attention - self.pos_emb = TFPositionalEmbedding(self.d_model, name='pos_emb') - else: # learnable embeddings and absolute embeddings + if self.attn_type == 0: # default attention + self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb") + else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint def build(self, input_shape): if not self.untie_r: - self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, - name='r_w_bias') - self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, - name='r_r_bias') + self.r_w_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" + ) + self.r_r_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" + ) super(TFTransfoXLMainLayer, self).build(input_shape) def get_input_embeddings(self): @@ -443,10 +494,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): def _update_mems(self, hids, mems, qlen, mlen): # does not deal with None - if mems is None: return None + if mems is None: + return None # mems is not None - assert len(hids) == len(mems), 'len(hids) != len(mems)' + assert len(hids) == len(mems), "len(hids) != len(mems)" # There are `mlen + qlen` steps that can be cached into mems # For the next step, the last `ext_len` of the `qlen` tokens @@ -472,10 +524,10 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds assert len(inputs) <= 4, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - mems = inputs.get('mems', mems) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + mems = inputs.get("mems", mems) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 4, "Too many inputs." else: input_ids = inputs @@ -521,8 +573,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1) if self.same_length: mask_l = tf.linalg.band_part(attn_mask, -1, 0) - dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia, - dec_attn_mask[:, qlen:]], 1) + dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia, dec_attn_mask[:, qlen:]], 1) # ::: PyTorch masking code for reference ::: # if self.same_length: # all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) @@ -539,8 +590,8 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): hids = [] attentions = [] - if self.attn_type == 0: # default - pos_seq = tf.range(klen-1, -1, -1.0) + if self.attn_type == 0: # default + pos_seq = tf.range(klen - 1, -1, -1.0) if self.clamp_len > 0: pos_seq = tf.minimum(pos_seq, self.clamp_len) pos_emb = self.pos_emb(pos_seq) @@ -551,12 +602,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): for i, layer in enumerate(self.layers): hids.append(core_out) mems_i = None if mems is None else mems[i] - layer_outputs = layer([core_out, pos_emb, dec_attn_mask, - mems_i, head_mask[i]], training=training) + layer_outputs = layer([core_out, pos_emb, dec_attn_mask, mems_i, head_mask[i]], training=training) core_out = layer_outputs[0] if self.output_attentions: attentions.append(layer_outputs[1]) - else: # learnable embeddings and absolute embeddings + else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint core_out = self.drop(core_out, training=training) @@ -581,6 +631,7 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = TransfoXLConfig pretrained_model_archive_map = TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -647,8 +698,12 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", - TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", + TRANSFO_XL_START_DOCSTRING, + TRANSFO_XL_INPUTS_DOCSTRING, +) class TFTransfoXLModel(TFTransfoXLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -678,18 +733,22 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): last_hidden_states, mems = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFTransfoXLMainLayer(config, name='transformer') + self.transformer = TFTransfoXLMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs -@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top +@add_start_docstrings( + """The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive input embeddings)""", - TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) + TRANSFO_XL_START_DOCSTRING, + TRANSFO_XL_INPUTS_DOCSTRING, +) class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -720,17 +779,19 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): prediction_scores, mems = outputs[:2] """ + def __init__(self, config): super(TFTransfoXLLMHeadModel, self).__init__(config) - self.transformer = TFTransfoXLMainLayer(config, name='transformer') + self.transformer = TFTransfoXLMainLayer(config, name="transformer") self.sample_softmax = config.sample_softmax # use sampled softmax if config.sample_softmax > 0: raise NotImplementedError # use adaptive softmax (including standard softmax) else: - self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model, - config.cutoffs, div_val=config.div_val, name='crit') + self.crit = TFAdaptiveSoftmaxMask( + config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit" + ) def reset_length(self, tgt_len, ext_len, mem_len): self.transformer.reset_length(tgt_len, ext_len, mem_len) @@ -747,11 +808,11 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): labels = inputs[4] if len(inputs) > 4 else labels assert len(inputs) <= 5, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - mems = inputs.get('mems', mems) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) - labels = inputs.get('labels', labels) + input_ids = inputs.get("input_ids") + mems = inputs.get("mems", mems) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + labels = inputs.get("labels", labels) assert len(inputs) <= 5, "Too many inputs." else: input_ids = inputs diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py index f730af851f..0f2a4ebeba 100644 --- a/transformers/modeling_tf_transfo_xl_utilities.py +++ b/transformers/modeling_tf_transfo_xl_utilities.py @@ -24,9 +24,9 @@ import tensorflow as tf from .modeling_tf_utils import shape_list + class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): - def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, - keep_order=False, **kwargs): + def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs) self.vocab_size = vocab_size @@ -47,52 +47,59 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): def build(self, input_shape): if self.n_clusters > 0: - self.cluster_weight = self.add_weight(shape=(self.n_clusters, self.d_embed), - initializer='zeros', - trainable=True, - name='cluster_weight') - self.cluster_bias = self.add_weight(shape=(self.n_clusters,), - initializer='zeros', - trainable=True, - name='cluster_bias') + self.cluster_weight = self.add_weight( + shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight" + ) + self.cluster_bias = self.add_weight( + shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias" + ) if self.div_val == 1: for i in range(len(self.cutoffs)): if self.d_proj != self.d_embed: - weight = self.add_weight(shape=(self.d_embed, self.d_proj), - initializer='zeros', - trainable=True, - name='out_projs_._{}'.format(i)) + weight = self.add_weight( + shape=(self.d_embed, self.d_proj), + initializer="zeros", + trainable=True, + name="out_projs_._{}".format(i), + ) self.out_projs.append(weight) else: self.out_projs.append(None) - weight = self.add_weight(shape=(self.vocab_size, self.d_embed,), - initializer='zeros', - trainable=True, - name='out_layers_._{}_._weight'.format(i)) - bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='out_layers_._{}_._bias'.format(i)) + weight = self.add_weight( + shape=(self.vocab_size, self.d_embed,), + initializer="zeros", + trainable=True, + name="out_layers_._{}_._weight".format(i), + ) + bias = self.add_weight( + shape=(self.vocab_size,), + initializer="zeros", + trainable=True, + name="out_layers_._{}_._bias".format(i), + ) self.out_layers.append((weight, bias)) else: for i in range(len(self.cutoffs)): - l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = self.d_embed // (self.div_val ** i) - weight = self.add_weight(shape=(d_emb_i, self.d_proj), - initializer='zeros', - trainable=True, - name='out_projs_._{}'.format(i)) + weight = self.add_weight( + shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name="out_projs_._{}".format(i) + ) self.out_projs.append(weight) - weight = self.add_weight(shape=(r_idx-l_idx, d_emb_i,), - initializer='zeros', - trainable=True, - name='out_layers_._{}_._weight'.format(i)) - bias = self.add_weight(shape=(r_idx-l_idx,), - initializer='zeros', - trainable=True, - name='out_layers_._{}_._bias'.format(i)) + weight = self.add_weight( + shape=(r_idx - l_idx, d_emb_i,), + initializer="zeros", + trainable=True, + name="out_layers_._{}_._weight".format(i), + ) + bias = self.add_weight( + shape=(r_idx - l_idx,), + initializer="zeros", + trainable=True, + name="out_layers_._{}_._bias".format(i), + ) self.out_layers.append((weight, bias)) super(TFAdaptiveSoftmaxMask, self).build(input_shape) @@ -100,8 +107,8 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): def _logit(x, W, b, proj=None): y = x if proj is not None: - y = tf.einsum('ibd,ed->ibe', y, proj) - return tf.einsum('ibd,nd->ibn', y, W) + b + y = tf.einsum("ibd,ed->ibe", y, proj) + return tf.einsum("ibd,nd->ibn", y, W) + b @staticmethod def _gather_logprob(logprob, target): @@ -114,7 +121,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): hidden, target = inputs head_logprob = 0 if self.n_clusters == 0: - softmax_b = tf.get_variable('bias', [self.config.vocab_size], initializer=tf.zeros_initializer()) + softmax_b = tf.get_variable("bias", [self.config.vocab_size], initializer=tf.zeros_initializer()) output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) if target is not None: loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) @@ -143,7 +150,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0]) head_logprob = tf.nn.log_softmax(head_logit) - out.append(head_logprob[..., :self.cutoffs[0]]) + out.append(head_logprob[..., : self.cutoffs[0]]) if target is not None: cur_head_logprob = tf.boolean_mask(head_logprob, mask) cur_logprob = self._gather_logprob(cur_head_logprob, cur_target) @@ -170,6 +177,6 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): # Log the loss as a metric (we could log arbitrary metrics, # including different metrics for training and inference. - self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '') + self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "") return out diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index 0aa65a9f17..7ecd79afd3 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -15,8 +15,7 @@ # limitations under the License. """TF general model utils.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -26,12 +25,20 @@ from tensorflow.python.keras.saving import hdf5_format import h5py from .configuration_utils import PretrainedConfig -from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS, - cached_path, hf_bucket_url, is_remote_url) +from .file_utils import ( + TF2_WEIGHTS_NAME, + TF_WEIGHTS_NAME, + WEIGHTS_NAME, + DUMMY_INPUTS, + cached_path, + hf_bucket_url, + is_remote_url, +) from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model logger = logging.getLogger(__name__) + class TFPreTrainedModel(tf.keras.Model): r""" Base class for all TF models. @@ -60,7 +67,7 @@ class TFPreTrainedModel(tf.keras.Model): Returns: tf.Tensor with dummy inputs """ - return {'input_ids': tf.constant(DUMMY_INPUTS)} + return {"input_ids": tf.constant(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): super(TFPreTrainedModel, self).__init__(*inputs, **kwargs) @@ -70,7 +77,8 @@ class TFPreTrainedModel(tf.keras.Model): "To create a model from a pretrained model use " "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( self.__class__.__name__, self.__class__.__name__ - )) + ) + ) # Save config in model self.config = config @@ -151,7 +159,9 @@ class TFPreTrainedModel(tf.keras.Model): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. """ - assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + assert os.path.isdir( + save_directory + ), "Saving path should be a directory where the model and configuration can be saved" # Save configuration file self.config.save_pretrained(save_directory) @@ -230,20 +240,22 @@ class TFPreTrainedModel(tf.keras.Model): model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config) """ - config = kwargs.pop('config', None) - cache_dir = kwargs.pop('cache_dir', None) - from_pt = kwargs.pop('from_pt', False) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) - proxies = kwargs.pop('proxies', None) - output_loading_info = kwargs.pop('output_loading_info', False) + config = kwargs.pop("config", None) + cache_dir = kwargs.pop("cache_dir", None) + from_pt = kwargs.pop("from_pt", False) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) # Load config if we don't provide a configuration if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( - config_path, *model_args, - cache_dir=cache_dir, return_unused_kwargs=True, + config_path, + *model_args, + cache_dir=cache_dir, + return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, **kwargs @@ -263,9 +275,11 @@ class TFPreTrainedModel(tf.keras.Model): # Load from a PyTorch checkpoint archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) else: - raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format( - [WEIGHTS_NAME, TF2_WEIGHTS_NAME], - pretrained_model_name_or_path)) + raise EnvironmentError( + "Error no file named {} found in directory {} or `from_pt` set to False".format( + [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path + ) + ) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): @@ -273,31 +287,37 @@ class TFPreTrainedModel(tf.keras.Model): else: archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME) if from_pt: - raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.") + raise EnvironmentError( + "Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name." + ) # redirect to the cache, if necessary try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, - resume_download=resume_download, proxies=proxies) + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + ) except EnvironmentError as e: if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - logger.error( - "Couldn't reach server at '{}' to download pretrained weights.".format( - archive_file)) + logger.error("Couldn't reach server at '{}' to download pretrained weights.".format(archive_file)) else: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find any file " "associated to this path or url.".format( pretrained_model_name_or_path, - ', '.join(cls.pretrained_model_archive_map.keys()), - archive_file)) + ", ".join(cls.pretrained_model_archive_map.keys()), + archive_file, + ) + ) raise e if resolved_archive_file == archive_file: logger.info("loading weights file {}".format(archive_file)) else: - logger.info("loading weights file {} from cache at {}".format( - archive_file, resolved_archive_file)) + logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) else: resolved_archive_file = None @@ -316,38 +336,42 @@ class TFPreTrainedModel(tf.keras.Model): try: model.load_weights(resolved_archive_file, by_name=True) except OSError: - raise OSError("Unable to load weights from h5 file. " - "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. ") + raise OSError( + "Unable to load weights from h5 file. " + "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. " + ) ret = model(model.dummy_inputs, training=False) # Make sure restore ops are run # Check if the models are the same to output loading informations - with h5py.File(resolved_archive_file, 'r') as f: - if 'layer_names' not in f.attrs and 'model_weights' in f: - f = f['model_weights'] - hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, 'layer_names')) + with h5py.File(resolved_archive_file, "r") as f: + if "layer_names" not in f.attrs and "model_weights" in f: + f = f["model_weights"] + hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names")) model_layer_names = set(layer.name for layer in model.layers) missing_keys = list(model_layer_names - hdf5_layer_names) unexpected_keys = list(hdf5_layer_names - model_layer_names) error_msgs = [] if len(missing_keys) > 0: - logger.info("Layers of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys)) + logger.info( + "Layers of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys) + ) if len(unexpected_keys) > 0: - logger.info("Layers from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys)) + logger.info( + "Layers from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys) + ) if len(error_msgs) > 0: - raise RuntimeError('Error(s) in loading weights for {}:\n\t{}'.format( - model.__class__.__name__, "\n\t".join(error_msgs))) + raise RuntimeError( + "Error(s) in loading weights for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) + ) if output_loading_info: - loading_info = {"missing_keys": missing_keys, - "unexpected_keys": unexpected_keys, - "error_msgs": error_msgs} + loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs} return model, loading_info return model + class TFConv1D(tf.keras.layers.Layer): def __init__(self, nf, nx, initializer_range=0.02, **kwargs): """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) @@ -360,13 +384,9 @@ class TFConv1D(tf.keras.layers.Layer): def build(self, input_shape): self.weight = self.add_weight( - "weight", - shape=[self.nx, self.nf], - initializer=get_initializer(self.initializer_range)) - self.bias = self.add_weight( - "bias", - shape=[1, self.nf], - initializer=tf.zeros_initializer()) + "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) + ) + self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) def call(self, x): bz, sl = shape_list(x)[:2] @@ -382,11 +402,12 @@ class TFConv1D(tf.keras.layers.Layer): class TFSharedEmbeddings(tf.keras.layers.Layer): """Construct shared token embeddings. """ + def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs): super(TFSharedEmbeddings, self).__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range + self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range def build(self, input_shape): """Build shared word embedding layer @@ -394,9 +415,8 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ self.weight = self.add_weight( - "weight", - shape=[self.vocab_size, self.hidden_size], - initializer=get_initializer(self.initializer_range)) + "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) + ) super(TFSharedEmbeddings, self).build(input_shape) def call(self, inputs, mode="embedding"): @@ -455,35 +475,36 @@ class TFSequenceSummary(tf.keras.layers.Layer): summary_first_dropout: Add a dropout before the projection and activation summary_last_dropout: Add a dropout after the projection and activation """ + def __init__(self, config, initializer_range=0.02, **kwargs): super(TFSequenceSummary, self).__init__(**kwargs) - self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last' - if self.summary_type == 'attn': + self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last" + if self.summary_type == "attn": # We should use a standard multi-head attention module with absolute positional embedding for that. # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 # We can probably just use the multi-head attention module of PyTorch >=1.1.0 raise NotImplementedError - self.has_summary = hasattr(config, 'summary_use_proj') and config.summary_use_proj + self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj if self.has_summary: - if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: num_classes = config.num_labels else: num_classes = config.hidden_size - self.summary = tf.keras.layers.Dense(num_classes, - kernel_initializer=get_initializer(initializer_range), - name='summary') + self.summary = tf.keras.layers.Dense( + num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" + ) - self.has_activation = hasattr(config, 'summary_activation') and config.summary_activation == 'tanh' + self.has_activation = hasattr(config, "summary_activation") and config.summary_activation == "tanh" if self.has_activation: self.activation = tf.keras.activations.tanh - self.has_first_dropout = hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0 + self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0 if self.has_first_dropout: self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout) - self.has_last_dropout = hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0 + self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0 if self.has_last_dropout: self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout) @@ -502,29 +523,33 @@ class TFSequenceSummary(tf.keras.layers.Layer): cls_index = inputs[1] if len(inputs) > 1 else None assert len(inputs) <= 2, "Too many inputs." else: - input_ids = inputs.get('input_ids') - cls_index = inputs.get('cls_index', None) + input_ids = inputs.get("input_ids") + cls_index = inputs.get("cls_index", None) - if self.summary_type == 'last': + if self.summary_type == "last": output = hidden_states[:, -1] - elif self.summary_type == 'first': + elif self.summary_type == "first": output = hidden_states[:, 0] - elif self.summary_type == 'mean': + elif self.summary_type == "mean": output = tf.reduce_mean(hidden_states, axis=1) - elif self.summary_type == 'cls_index': + elif self.summary_type == "cls_index": hidden_shape = shape_list(hidden_states) # e.g. [batch, num choices, seq length, hidden dims] if cls_index is None: - cls_index = tf.fill(hidden_shape[:-2], hidden_shape[-2] - 1) # A tensor full of shape [batch] or [batch, num choices] full of sequence length + cls_index = tf.fill( + hidden_shape[:-2], hidden_shape[-2] - 1 + ) # A tensor full of shape [batch] or [batch, num choices] full of sequence length cls_shape = shape_list(cls_index) if len(cls_shape) <= len(hidden_shape) - 2: cls_index = cls_index[..., tf.newaxis] # else: - # cls_index = cls_index[..., tf.newaxis] - # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) + # cls_index = cls_index[..., tf.newaxis] + # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2) - output = tf.squeeze(output, axis=len(hidden_shape) - 2) # shape of output: (batch, num choices, hidden_size) - elif self.summary_type == 'attn': + output = tf.squeeze( + output, axis=len(hidden_shape) - 2 + ) # shape of output: (batch, num choices, hidden_size) + elif self.summary_type == "attn": raise NotImplementedError if self.has_first_dropout: @@ -541,12 +566,14 @@ class TFSequenceSummary(tf.keras.layers.Layer): return output + def shape_list(x): """Deal with dynamic shape in tensorflow cleanly.""" static = x.shape.as_list() dynamic = tf.shape(x) return [dynamic[i] if s is None else s for i, s in enumerate(static)] + def get_initializer(initializer_range=0.02): """Creates a `tf.initializers.truncated_normal` with the given range. Args: diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py index a7cc8ea481..2f443ae2fc 100644 --- a/transformers/modeling_tf_xlm.py +++ b/transformers/modeling_tf_xlm.py @@ -25,30 +25,34 @@ import numpy as np import tensorflow as tf from .configuration_xlm import XLMConfig -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer, DUMMY_INPUTS +from .modeling_tf_utils import ( + TFPreTrainedModel, + TFSharedEmbeddings, + TFSequenceSummary, + shape_list, + get_initializer, + DUMMY_INPUTS, +) from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-tf_model.h5", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-tf_model.h5", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-tf_model.h5", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-tf_model.h5", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-tf_model.h5", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-tf_model.h5", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-tf_model.h5", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-tf_model.h5", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-tf_model.h5", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-tf_model.h5", + "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-tf_model.h5", + "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-tf_model.h5", + "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-tf_model.h5", + "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-tf_model.h5", + "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-tf_model.h5", + "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-tf_model.h5", + "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-tf_model.h5", + "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-tf_model.h5", + "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-tf_model.h5", + "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-tf_model.h5", } def create_sinusoidal_embeddings(n_pos, dim, out): - position_enc = np.array([ - [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] - for pos in range(n_pos) - ]) + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2])) out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2])) @@ -78,8 +82,9 @@ def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32): # attention mask is the same as mask, or triangular inferior attention (causal) if causal: - attn_mask = tf.less_equal(tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), - alen[tf.newaxis, :, tf.newaxis]) + attn_mask = tf.less_equal( + tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), alen[tf.newaxis, :, tf.newaxis] + ) else: attn_mask = mask @@ -106,10 +111,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): self.n_heads = n_heads assert self.dim % self.n_heads == 0 - self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='q_lin') - self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='k_lin') - self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='v_lin') - self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='out_lin') + self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin") + self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin") + self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin") + self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin") self.dropout = tf.keras.layers.Dropout(config.attention_dropout) self.pruned_heads = set() @@ -125,7 +130,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = shape_list(input) if kv is None: - klen = qlen if cache is None else cache['slen'] + qlen + klen = qlen if cache is None else cache["slen"] + qlen else: klen = shape_list(kv)[1] # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) @@ -141,40 +146,40 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): """ compute context """ return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) - q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: - k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv - k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] - k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) - v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) + k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) + v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) - q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) - scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) - mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) scores = scores - 1e30 * (1.0 - mask) - weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) - weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, qlen, dim) + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) outputs = (self.out_lin(context),) if self.output_attentions: @@ -183,11 +188,10 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): class TFTransformerFFN(tf.keras.layers.Layer): - def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): super(TFTransformerFFN, self).__init__(**kwargs) - self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name='lin1') - self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name='lin2') + self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") + self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu self.dropout = tf.keras.layers.Dropout(config.dropout) @@ -226,30 +230,36 @@ class TFXLMMainLayer(tf.keras.layers.Layer): # assert len(self.id2lang) == len(self.lang2id) == self.n_langs # model parameters - self.dim = config.emb_dim # 512 by default + self.dim = config.emb_dim # 512 by default self.hidden_dim = self.dim * 4 # 2048 by default - self.n_heads = config.n_heads # 8 by default + self.n_heads = config.n_heads # 8 by default self.n_layers = config.n_layers - assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads' + assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads" # embeddings self.dropout = tf.keras.layers.Dropout(config.dropout) self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout) - self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, - self.dim, - embeddings_initializer=get_initializer(config.embed_init_std), - name='position_embeddings') + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + self.dim, + embeddings_initializer=get_initializer(config.embed_init_std), + name="position_embeddings", + ) if config.sinusoidal_embeddings: raise NotImplementedError # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) if config.n_langs > 1 and config.use_lang_emb: - self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs, - self.dim, - embeddings_initializer=get_initializer(config.embed_init_std), - name='lang_embeddings') - self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, initializer_range=config.embed_init_std, name='embeddings') # padding_idx=self.pad_index) - self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm_emb') + self.lang_embeddings = tf.keras.layers.Embedding( + self.n_langs, + self.dim, + embeddings_initializer=get_initializer(config.embed_init_std), + name="lang_embeddings", + ) + self.embeddings = TFSharedEmbeddings( + self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings" + ) # padding_idx=self.pad_index) + self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb") # transformer layers self.attentions = [] @@ -261,13 +271,21 @@ class TFXLMMainLayer(tf.keras.layers.Layer): # self.encoder_attn = [] for i in range(self.n_layers): - self.attentions.append(TFMultiHeadAttention(self.n_heads, self.dim, config=config, name='attentions_._{}'.format(i))) - self.layer_norm1.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm1_._{}'.format(i))) + self.attentions.append( + TFMultiHeadAttention(self.n_heads, self.dim, config=config, name="attentions_._{}".format(i)) + ) + self.layer_norm1.append( + tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1_._{}".format(i)) + ) # if self.is_decoder: # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) - self.ffns.append(TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name='ffns_._{}'.format(i))) - self.layer_norm2.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm2_._{}'.format(i))) + self.ffns.append( + TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name="ffns_._{}".format(i)) + ) + self.layer_norm2.append( + tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2_._{}".format(i)) + ) if hasattr(config, "pruned_heads"): pruned_heads = config.pruned_heads.copy().items() @@ -276,7 +294,6 @@ class TFXLMMainLayer(tf.keras.layers.Layer): if self.attentions[int(layer)].n_heads == config.n_heads: self.prune_heads({int(layer): list(map(int, heads))}) - def get_input_embeddings(self): return self.embeddings @@ -290,9 +307,19 @@ class TFXLMMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None, - position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, - training=False): # removed: src_enc=None, src_len=None + def call( + self, + inputs, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): # removed: src_enc=None, src_len=None if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -305,15 +332,15 @@ class TFXLMMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - langs = inputs.get('langs', langs) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - lengths = inputs.get('lengths', lengths) - cache = inputs.get('cache', cache) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + langs = inputs.get("langs", langs) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + lengths = inputs.get("lengths", lengths) + cache = inputs.get("cache", cache) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs @@ -331,7 +358,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): if input_ids is not None: lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) else: - lengths = tf.convert_to_tensor([slen]*bs, tf.int32) + lengths = tf.convert_to_tensor([slen] * bs, tf.int32) # mask = input_ids != self.pad_index # check inputs @@ -375,7 +402,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): # do not recompute cached elements if cache is not None and input_ids is not None: - _slen = slen - cache['slen'] + _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: @@ -430,7 +457,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): # update cache length if cache is not None: - cache['slen'] += tensor.size(1) + cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) @@ -447,6 +474,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XLMConfig pretrained_model_archive_map = TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -460,7 +488,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel): langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) else: langs_list = None - return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list} + return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} XLM_START_DOCSTRING = r""" The XLM model was proposed in @@ -554,8 +582,12 @@ XLM_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.", - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare XLM Model transformer outputing raw hidden-states without any specific head on top.", + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class TFXLMModel(TFXLMPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -581,20 +613,21 @@ class TFXLMModel(TFXLMPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFXLMModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLMMainLayer(config, name='transformer') + self.transformer = TFXLMMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs - class TFXLMPredLayer(tf.keras.layers.Layer): """ Prediction layer (cross_entropy or adaptive_softmax). """ + def __init__(self, config, input_embeddings, **kwargs): super(TFXLMPredLayer, self).__init__(**kwargs) self.asm = config.asm @@ -614,10 +647,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer): def build(self, input_shape): # The output weights are the same as the input embeddings, but there is an output-only bias for each token. - self.bias = self.add_weight(shape=(self.n_words,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias") super(TFXLMPredLayer, self).build(input_shape) def call(self, hidden_states): @@ -626,9 +656,12 @@ class TFXLMPredLayer(tf.keras.layers.Layer): return hidden_states -@add_start_docstrings("""The XLM Model transformer with a language modeling head on top +@add_start_docstrings( + """The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -654,10 +687,11 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLMMainLayer(config, name='transformer') - self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer_._proj') + self.transformer = TFXLMMainLayer(config, name="transformer") + self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj") def get_output_embeddings(self): return self.pred_layer.input_embeddings @@ -672,9 +706,12 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): return outputs -@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class TFXLMForSequenceClassification(TFXLMPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -701,12 +738,13 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXLMMainLayer(config, name='transformer') - self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name='sequence_summary') + self.transformer = TFXLMMainLayer(config, name="transformer") + self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -718,9 +756,12 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): return outputs -@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -748,12 +789,13 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): start_scores, end_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLMMainLayer(config, name='transformer') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.init_std), - name='qa_outputs') + self.transformer = TFXLMMainLayer(config, name="transformer") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" + ) def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -765,6 +807,8 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + outputs = (start_logits, end_logits,) + transformer_outputs[ + 1: + ] # Keep mems, hidden states, attentions if there are in it return outputs # start_logits, end_logits, (hidden_states), (attentions) diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index 2f1fe150c6..c1ed720f96 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -35,8 +35,8 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5", - 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5", + "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5", + "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5", } @@ -45,8 +45,7 @@ def gelu(x): XLNet is using OpenAI GPT's gelu Also see https://arxiv.org/abs/1606.08415 """ - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) return x * cdf @@ -54,9 +53,11 @@ def swish(x): return x * tf.sigmoid(x) -ACT2FN = {"gelu": tf.keras.layers.Activation(gelu), - "relu": tf.keras.activations.relu, - "swish": tf.keras.layers.Activation(swish)} +ACT2FN = { + "gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish), +} class TFXLNetRelativeAttention(tf.keras.layers.Layer): @@ -67,7 +68,8 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): if config.d_model % config.n_head != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.d_model, config.n_head)) + "heads (%d)" % (config.d_model, config.n_head) + ) self.n_head = config.n_head self.d_head = config.d_head @@ -75,38 +77,38 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): self.scale = 1 / (config.d_head ** 0.5) self.initializer_range = config.initializer_range - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout) def build(self, input_shape): initializer = get_initializer(self.initializer_range) - self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='q') - self.k = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='k') - self.v = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='v') - self.o = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='o') - self.r = self.add_weight(shape=(self.d_model, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='r') - self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, name='r_r_bias') - self.r_s_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, name='r_s_bias') - self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head), - initializer='zeros', - trainable=True, name='r_w_bias') - self.seg_embed = self.add_weight(shape=(2, self.n_head, self.d_head), - initializer=initializer, - trainable=True, name='seg_embed') + self.q = self.add_weight( + shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="q" + ) + self.k = self.add_weight( + shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="k" + ) + self.v = self.add_weight( + shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="v" + ) + self.o = self.add_weight( + shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="o" + ) + self.r = self.add_weight( + shape=(self.d_model, self.n_head, self.d_head), initializer=initializer, trainable=True, name="r" + ) + self.r_r_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" + ) + self.r_s_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_s_bias" + ) + self.r_w_bias = self.add_weight( + shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" + ) + self.seg_embed = self.add_weight( + shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed" + ) super(TFXLNetRelativeAttention, self).build(input_shape) def prune_heads(self, heads): @@ -130,18 +132,18 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask = inputs # content based attention score - ac = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_w_bias, k_head_h) + ac = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_w_bias, k_head_h) # position based attention score - bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r) + bd = tf.einsum("ibnd,jbnd->ijbn", q_head + self.r_r_bias, k_head_r) bd = self.rel_shift(bd, klen=shape_list(ac)[1]) # segment based attention score if seg_mat is None: ef = 0 else: - ef = tf.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed) - ef = tf.einsum('ijbs,ibns->ijbn', seg_mat, ef) + ef = tf.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed) + ef = tf.einsum("ijbs,ibns->ijbn", seg_mat, ef) # merge attention scores and perform masking attn_score = (ac + bd + ef) * self.scale @@ -162,7 +164,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): attn_prob = attn_prob * head_mask # attention output - attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h) + attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, v_head_h) if self.output_attentions: return attn_vec, attn_prob @@ -174,7 +176,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): # post-attention projection (back to `d_model`) h, attn_vec = inputs - attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.o) + attn_out = tf.einsum("ibnd,hnd->ibh", attn_vec, self.o) attn_out = self.dropout(attn_out, training=training) @@ -185,8 +187,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): return output def call(self, inputs, training=False): - (h, g, attn_mask_h, attn_mask_g, - r, seg_mat, mems, target_mapping, head_mask) = inputs + (h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems, target_mapping, head_mask) = inputs if g is not None: ###### Two-stream attention with relative positional encoding. @@ -197,22 +198,22 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): cat = h # content-based key head - k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k) + k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k) # content-based value head - v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v) + v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v) # position-based key head - k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r) + k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r) ##### h-stream # content-stream query head - q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q) + q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q) # core attention ops attn_vec_h = self.rel_attn_core( - [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], - training=training) + [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training + ) if self.output_attentions: attn_vec_h, attn_prob_h = attn_vec_h @@ -222,23 +223,23 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): ##### g-stream # query-stream query head - q_head_g = tf.einsum('ibh,hnd->ibnd', g, self.q) + q_head_g = tf.einsum("ibh,hnd->ibnd", g, self.q) # core attention ops if target_mapping is not None: - q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping) + q_head_g = tf.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping) attn_vec_g = self.rel_attn_core( - [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], - training=training) + [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training + ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g - attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping) + attn_vec_g = tf.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping) else: attn_vec_g = self.rel_attn_core( - [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], - training=training) + [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training + ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g @@ -257,17 +258,17 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): cat = h # content heads - q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q) - k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k) - v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v) + q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q) + k_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.k) + v_head_h = tf.einsum("ibh,hnd->ibnd", cat, self.v) # positional heads - k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r) + k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r) # core attention ops attn_vec = self.rel_attn_core( - [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], - training=training) + [q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask], training=training + ) if self.output_attentions: attn_vec, attn_prob = attn_vec @@ -281,19 +282,21 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): outputs = outputs + (attn_prob,) return outputs + class TFXLNetFeedForward(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFXLNetFeedForward, self).__init__(**kwargs) - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm') - self.layer_1 = tf.keras.layers.Dense(config.d_inner, - kernel_initializer=get_initializer(config.initializer_range), - name='layer_1') - self.layer_2 = tf.keras.layers.Dense(config.d_model, - kernel_initializer=get_initializer(config.initializer_range), - name='layer_2') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.layer_1 = tf.keras.layers.Dense( + config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1" + ) + self.layer_2 = tf.keras.layers.Dense( + config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2" + ) self.dropout = tf.keras.layers.Dropout(config.dropout) - if isinstance(config.ff_activation, str) or \ - (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)): + if isinstance(config.ff_activation, str) or ( + sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) + ): self.activation_function = ACT2FN[config.ff_activation] else: self.activation_function = config.ff_activation @@ -308,11 +311,12 @@ class TFXLNetFeedForward(tf.keras.layers.Layer): output = self.layer_norm(output + inp) return output + class TFXLNetLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFXLNetLayer, self).__init__(**kwargs) - self.rel_attn = TFXLNetRelativeAttention(config, name='rel_attn') - self.ff = TFXLNetFeedForward(config, name='ff') + self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn") + self.ff = TFXLNetFeedForward(config, name="ff") self.dropout = tf.keras.layers.Dropout(config.dropout) def call(self, inputs, training=False): @@ -336,10 +340,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer): self.input_embeddings = input_embeddings def build(self, input_shape): - self.bias = self.add_weight(shape=(self.vocab_size,), - initializer='zeros', - trainable=True, - name='bias') + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") super(TFXLNetLMHead, self).build(input_shape) def call(self, hidden_states): @@ -366,8 +367,10 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): self.use_bfloat16 = config.use_bfloat16 self.initializer_range = config.initializer_range - self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding') - self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)] + self.word_embedding = TFSharedEmbeddings( + config.vocab_size, config.d_model, initializer_range=config.initializer_range, name="word_embedding" + ) + self.layer = [TFXLNetLayer(config, name="layer_._{}".format(i)) for i in range(config.n_layer)] self.dropout = tf.keras.layers.Dropout(config.dropout) def get_input_embeddings(self): @@ -375,9 +378,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): def build(self, input_shape): initializer = get_initializer(self.initializer_range) - self.mask_emb = self.add_weight(shape=(1, 1, self.d_model), - initializer=initializer, - trainable=True, name='mask_emb') + self.mask_emb = self.add_weight( + shape=(1, 1, self.d_model), initializer=initializer, trainable=True, name="mask_emb" + ) def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError @@ -417,18 +420,18 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): def cache_mem(self, curr_out, prev_mem): """cache hidden states into memory.""" if self.reuse_len is not None and self.reuse_len > 0: - curr_out = curr_out[:self.reuse_len] + curr_out = curr_out[: self.reuse_len] if prev_mem is None: - new_mem = curr_out[-self.mem_len:] + new_mem = curr_out[-self.mem_len :] else: - new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len:] + new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len :] return tf.stop_gradient(new_mem) @staticmethod def positional_embedding(pos_seq, inv_freq, bsz=None): - sinusoid_inp = tf.einsum('i,d->id', pos_seq, inv_freq) + sinusoid_inp = tf.einsum("i,d->id", pos_seq, inv_freq) pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1) pos_emb = pos_emb[:, None, :] @@ -444,14 +447,14 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): freq_seq = tf.cast(freq_seq, dtype=dtype) inv_freq = 1 / (10000 ** (freq_seq / self.d_model)) - if self.attn_type == 'bi': + if self.attn_type == "bi": # beg, end = klen - 1, -qlen beg, end = klen, -qlen - elif self.attn_type == 'uni': + elif self.attn_type == "uni": # beg, end = klen - 1, -1 beg, end = klen, -1 else: - raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type)) + raise ValueError("Unknown `attn_type` {}.".format(self.attn_type)) if self.bi_data: fwd_pos_seq = tf.range(beg, end, -1.0) @@ -467,9 +470,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): if bsz is not None: # With bi_data, the batch size should be divisible by 2. - assert bsz%2 == 0 - fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2) - bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2) + assert bsz % 2 == 0 + fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2) + bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2) else: fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) @@ -485,8 +488,19 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): return pos_emb - def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, training=False): + def call( + self, + inputs, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -499,15 +513,15 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - mems = inputs.get('mems', mems) - perm_mask = inputs.get('perm_mask', perm_mask) - target_mapping = inputs.get('target_mapping', target_mapping) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - input_mask = inputs.get('input_mask', input_mask) - head_mask = inputs.get('head_mask', head_mask) - inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + mems = inputs.get("mems", mems) + perm_mask = inputs.get("perm_mask", perm_mask) + target_mapping = inputs.get("target_mapping", target_mapping) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + input_mask = inputs.get("input_mask", input_mask) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs @@ -540,17 +554,19 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): ##### Attention mask # causal attention mask - if self.attn_type == 'uni': + if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) attn_mask = attn_mask[:, :, None, None] - elif self.attn_type == 'bi': + elif self.attn_type == "bi": attn_mask = None else: - raise ValueError('Unsupported attention type: {}'.format(self.attn_type)) + raise ValueError("Unsupported attention type: {}".format(self.attn_type)) # data mask: input mask & perm mask - assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " \ + assert input_mask is None or attention_mask is None, ( + "You can only use one of input_mask (uses 1 for padding) " "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one." + ) if input_mask is None and attention_mask is not None: input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float) if input_mask is not None and perm_mask is not None: @@ -564,8 +580,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): if data_mask is not None: # all mems can be attended to - mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz], - dtype=dtype_float) + mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz], dtype=dtype_float) data_mask = tf.concat([mems_mask, data_mask], axis=1) if attn_mask is None: attn_mask = data_mask[:, :, :, None] @@ -590,9 +605,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): output_h = self.dropout(word_emb_k, training=training) if target_mapping is not None: word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1]) - # else: # We removed the inp_q input which was same as target mapping - # inp_q_ext = inp_q[:, :, None] - # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k + # else: # We removed the inp_q input which was same as target mapping + # inp_q_ext = inp_q[:, :, None] + # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k output_g = self.dropout(word_emb_q, training=training) else: output_g = None @@ -604,9 +619,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): cat_ids = tf.concat([mem_pad, token_type_ids], 0) # `1` indicates not in the same segment [qlen x klen x bsz] - seg_mat = tf.cast( - tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])), - tf.int32) + seg_mat = tf.cast(tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])), tf.int32) seg_mat = tf.one_hot(seg_mat, 2, dtype=dtype_float) else: seg_mat = None @@ -626,7 +639,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layer @@ -643,9 +658,10 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) - outputs = layer_module([output_h, output_g, non_tgt_mask, attn_mask, - pos_emb, seg_mat, mems[i], target_mapping, - head_mask[i]], training=training) + outputs = layer_module( + [output_h, output_g, non_tgt_mask, attn_mask, pos_emb, seg_mat, mems[i], target_mapping, head_mask[i]], + training=training, + ) output_h, output_g = outputs[:2] if self.output_attentions: attentions.append(outputs[2]) @@ -679,6 +695,7 @@ class TFXLNetPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XLNetConfig pretrained_model_archive_map = TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" @@ -784,8 +801,12 @@ XLNET_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.", - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.", + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class TFXLNetModel(TFXLNetPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -816,18 +837,22 @@ class TFXLNetModel(TFXLNetPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config, *inputs, **kwargs): super(TFXLNetModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLNetMainLayer(config, name='transformer') + self.transformer = TFXLNetMainLayer(config, name="transformer") def call(self, inputs, **kwargs): outputs = self.transformer(inputs, **kwargs) return outputs -@add_start_docstrings("""XLNet Model with a language modeling head on top +@add_start_docstrings( + """XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings). """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -865,10 +890,11 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLNetMainLayer(config, name='transformer') - self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss') + self.transformer = TFXLNetMainLayer(config, name="transformer") + self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss") def get_output_embeddings(self): return self.lm_loss.input_embeddings @@ -883,9 +909,12 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): return outputs # return logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -916,15 +945,18 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): logits = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXLNetMainLayer(config, name='transformer') - self.sequence_summary = TFSequenceSummary(config, initializer_range=config.initializer_range, name='sequence_summary') - self.logits_proj = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='logits_proj') + self.transformer = TFXLNetMainLayer(config, name="transformer") + self.sequence_summary = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="sequence_summary" + ) + self.logits_proj = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" + ) def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -938,9 +970,12 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): return outputs # return logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -971,14 +1006,15 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): scores = outputs[0] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels - self.transformer = TFXLNetMainLayer(config, name='transformer') - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') + self.transformer = TFXLNetMainLayer(config, name="transformer") + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -1027,12 +1063,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): start_scores, end_scores = outputs[:2] """ + def __init__(self, config, *inputs, **kwargs): super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXLNetMainLayer(config, name='transformer') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='qa_outputs') + self.transformer = TFXLNetMainLayer(config, name="transformer") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) def call(self, inputs, **kwargs): transformer_outputs = self.transformer(inputs, **kwargs) @@ -1044,10 +1081,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + outputs = (start_logits, end_logits,) + transformer_outputs[ + 1: + ] # Keep mems, hidden states, attentions if there are in it return outputs # start_logits, end_logits, (mems), (hidden_states), (attentions) + # @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of # the hidden-states output to compute `span start logits` and `span end logits`). """, # XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index 70ef4aea3e..cee61ed37a 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -42,65 +42,62 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin", + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin", } + def build_tf_to_pytorch_map(model, config): """ A map of modules from TF to PyTorch. This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible. """ tf_to_pt_map = {} - if hasattr(model, 'transformer'): + if hasattr(model, "transformer"): # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax - tf_to_pt_map.update({ - "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, - "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias}) - for i, (out_l, proj_l, tie_proj) in enumerate(zip( - model.crit.out_layers, - model.crit.out_projs, - config.tie_projs)): + tf_to_pt_map.update( + { + "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, + "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias, + } + ) + for i, (out_l, proj_l, tie_proj) in enumerate( + zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs) + ): layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i if config.tie_weight: - tf_to_pt_map.update({ - layer_str + 'b': out_l.bias}) + tf_to_pt_map.update({layer_str + "b": out_l.bias}) else: raise NotImplementedError # I don't think this is implemented in the TF code - tf_to_pt_map.update({ - layer_str + 'lookup_table': out_l.weight, - layer_str + 'b': out_l.bias}) + tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias}) if not tie_proj: - tf_to_pt_map.update({ - layer_str + 'proj': proj_l - }) + tf_to_pt_map.update({layer_str + "proj": proj_l}) # Now load the rest of the transformer model = model.transformer # Embeddings for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)): layer_str = "transformer/adaptive_embed/cutoff_%d/" % i - tf_to_pt_map.update({ - layer_str + 'lookup_table': embed_l.weight, - layer_str + 'proj_W': proj_l - }) + tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l}) # Transformer blocks for i, b in enumerate(model.layers): layer_str = "transformer/layer_%d/" % i - tf_to_pt_map.update({ - layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, - layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, - layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, - layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, - layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, - layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, - layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, - layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, - layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, - layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, - layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, - }) + tf_to_pt_map.update( + { + layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, + layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, + layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, + layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, + layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, + layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, + layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, + layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, + layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, + layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, + layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, + } + ) # Relative positioning biases if config.untie_r: @@ -112,11 +109,10 @@ def build_tf_to_pytorch_map(model, config): else: r_r_list = [model.r_r_bias] r_w_list = [model.r_w_bias] - tf_to_pt_map.update({ - 'transformer/r_r_bias': r_r_list, - 'transformer/r_w_bias': r_w_list}) + tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list}) return tf_to_pt_map + def load_tf_weights_in_transfo_xl(model, config, tf_path): """ Load tf checkpoints in a pytorch model """ @@ -124,8 +120,10 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) raise # Build TF to PyTorch weights loading map tf_to_pt_map = build_tf_to_pytorch_map(model, config) @@ -143,9 +141,9 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): array = tf_weights[name] # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model - if 'kernel' in name or 'proj' in name: + if "kernel" in name or "proj" in name: array = np.transpose(array) - if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1: + if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1: # Here we will split the TF weigths assert len(pointer) == array.shape[0] for i, p_i in enumerate(pointer): @@ -166,10 +164,10 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) tf_weights.pop(name, None) - tf_weights.pop(name + '/Adam', None) - tf_weights.pop(name + '/Adam_1', None) + tf_weights.pop(name + "/Adam", None) + tf_weights.pop(name + "/Adam_1", None) - logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) return model @@ -180,17 +178,16 @@ class PositionalEmbedding(nn.Module): self.demb = demb inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) - self.register_buffer('inv_freq', inv_freq) + self.register_buffer("inv_freq", inv_freq) def forward(self, pos_seq, bsz=None): sinusoid_inp = torch.ger(pos_seq, self.inv_freq) pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) if bsz is not None: - return pos_emb[:,None,:].expand(-1, bsz, -1) + return pos_emb[:, None, :].expand(-1, bsz, -1) else: - return pos_emb[:,None,:] - + return pos_emb[:, None, :] class PositionwiseFF(nn.Module): @@ -202,7 +199,8 @@ class PositionwiseFF(nn.Module): self.dropout = dropout self.CoreNet = nn.Sequential( - nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Linear(d_model, d_inner), + nn.ReLU(inplace=True), nn.Dropout(dropout), nn.Linear(d_inner, d_model), nn.Dropout(dropout), @@ -230,10 +228,22 @@ class PositionwiseFF(nn.Module): class RelPartialLearnableMultiHeadAttn(nn.Module): - def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, - tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, - r_r_bias=None, r_w_bias=None, output_attentions=False, - layer_norm_epsilon=1e-5): + def __init__( + self, + n_head, + d_model, + d_head, + dropout, + dropatt=0, + tgt_len=None, + ext_len=None, + mem_len=None, + pre_lnorm=False, + r_r_bias=None, + r_w_bias=None, + output_attentions=False, + layer_norm_epsilon=1e-5, + ): super(RelPartialLearnableMultiHeadAttn, self).__init__() self.output_attentions = output_attentions @@ -254,7 +264,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): self.pre_lnorm = pre_lnorm - if r_r_bias is None or r_w_bias is None: # Biases are not shared + if r_r_bias is None or r_w_bias is None: # Biases are not shared self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) else: @@ -299,18 +309,18 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): klen = w_head_k.size(0) - w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head - w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head - w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head - r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head #### compute attention score - rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head - AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head rr_head_q = w_head_q + self.r_r_bias - BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = torch.einsum("ibnd,jnd->ijbn", (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head BD = self._rel_shift(BD) # [qlen x klen x bsz x n_head] @@ -319,21 +329,19 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): #### compute attention probability if attn_mask is not None and torch.sum(attn_mask).item(): - attn_mask = (attn_mask == 1) # Switch to bool + attn_mask = attn_mask == 1 # Switch to bool if attn_mask.dim() == 2: if next(self.parameters()).dtype == torch.float16: - attn_score = attn_score.float().masked_fill( - attn_mask[None,:,:,None], -65000).type_as(attn_score) + attn_score = ( + attn_score.float().masked_fill(attn_mask[None, :, :, None], -65000).type_as(attn_score) + ) else: - attn_score = attn_score.float().masked_fill( - attn_mask[None,:,:,None], -1e30).type_as(attn_score) + attn_score = attn_score.float().masked_fill(attn_mask[None, :, :, None], -1e30).type_as(attn_score) elif attn_mask.dim() == 3: if next(self.parameters()).dtype == torch.float16: - attn_score = attn_score.float().masked_fill( - attn_mask[:,:,:,None], -65000).type_as(attn_score) + attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -65000).type_as(attn_score) else: - attn_score = attn_score.float().masked_fill( - attn_mask[:,:,:,None], -1e30).type_as(attn_score) + attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -1e30).type_as(attn_score) # [qlen x klen x bsz x n_head] attn_prob = F.softmax(attn_score, dim=1) @@ -344,11 +352,10 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): attn_prob = attn_prob * head_mask #### compute attention vector - attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v)) # [qlen x bsz x n_head x d_head] - attn_vec = attn_vec.contiguous().view( - attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) ##### linear projection attn_out = self.o_net(attn_vec) @@ -368,21 +375,19 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): class RelPartialLearnableDecoderLayer(nn.Module): - def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, - **kwargs): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs): super(RelPartialLearnableDecoderLayer, self).__init__() - self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, - d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs) - self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, - pre_lnorm=kwargs.get('pre_lnorm'), - layer_norm_epsilon=layer_norm_epsilon) + self.dec_attn = RelPartialLearnableMultiHeadAttn( + n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs + ) + self.pos_ff = PositionwiseFF( + d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon + ) def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None): - attn_outputs = self.dec_attn(dec_inp, r, - attn_mask=dec_attn_mask, - mems=mems, head_mask=head_mask) + attn_outputs = self.dec_attn(dec_inp, r, attn_mask=dec_attn_mask, mems=mems, head_mask=head_mask) ff_output = self.pos_ff(attn_outputs[0]) outputs = [ff_output] + attn_outputs[1:] @@ -391,8 +396,7 @@ class RelPartialLearnableDecoderLayer(nn.Module): class AdaptiveEmbedding(nn.Module): - def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, - sample_softmax=False): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False): super(AdaptiveEmbedding, self).__init__() self.n_token = n_token @@ -409,28 +413,25 @@ class AdaptiveEmbedding(nn.Module): self.emb_layers = nn.ModuleList() self.emb_projs = nn.ParameterList() if div_val == 1: - self.emb_layers.append( - nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) - ) + self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0)) if d_proj != d_embed: self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) else: for i in range(len(self.cutoffs)): - l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) - self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i)) self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) def forward(self, inp): if self.div_val == 1: embed = self.emb_layers[0](inp) if self.d_proj != self.d_embed: - embed = F.linear(embed, self.emb_projs[0]) + embed = F.linear(embed, self.emb_projs[0]) else: param = next(self.parameters()) inp_flat = inp.view(-1) - emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], - dtype=param.dtype, device=param.device) + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device) for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] @@ -458,15 +459,16 @@ class TransfoXLPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = TransfoXLConfig pretrained_model_archive_map = TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_transfo_xl base_model_prefix = "transformer" def _init_weight(self, weight): - if self.config.init == 'uniform': + if self.config.init == "uniform": nn.init.uniform_(weight, -self.config.init_range, self.config.init_range) - elif self.config.init == 'normal': + elif self.config.init == "normal": nn.init.normal_(weight, 0.0, self.config.init_std) def _init_bias(self, bias): @@ -476,41 +478,41 @@ class TransfoXLPreTrainedModel(PreTrainedModel): """ Initialize the weights. """ classname = m.__class__.__name__ - if classname.find('Linear') != -1: - if hasattr(m, 'weight') and m.weight is not None: + if classname.find("Linear") != -1: + if hasattr(m, "weight") and m.weight is not None: self._init_weight(m.weight) - if hasattr(m, 'bias') and m.bias is not None: + if hasattr(m, "bias") and m.bias is not None: self._init_bias(m.bias) - elif classname.find('AdaptiveEmbedding') != -1: - if hasattr(m, 'emb_projs'): + elif classname.find("AdaptiveEmbedding") != -1: + if hasattr(m, "emb_projs"): for i in range(len(m.emb_projs)): if m.emb_projs[i] is not None: nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std) - elif classname.find('Embedding') != -1: - if hasattr(m, 'weight'): + elif classname.find("Embedding") != -1: + if hasattr(m, "weight"): self._init_weight(m.weight) - elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: - if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + elif classname.find("ProjectedAdaptiveLogSoftmax") != -1: + if hasattr(m, "cluster_weight") and m.cluster_weight is not None: self._init_weight(m.cluster_weight) - if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + if hasattr(m, "cluster_bias") and m.cluster_bias is not None: self._init_bias(m.cluster_bias) - if hasattr(m, 'out_projs'): + if hasattr(m, "out_projs"): for i in range(len(m.out_projs)): if m.out_projs[i] is not None: nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std) - elif classname.find('LayerNorm') != -1: - if hasattr(m, 'weight'): + elif classname.find("LayerNorm") != -1: + if hasattr(m, "weight"): nn.init.normal_(m.weight, 1.0, self.config.init_std) - if hasattr(m, 'bias') and m.bias is not None: + if hasattr(m, "bias") and m.bias is not None: self._init_bias(m.bias) else: - if hasattr(m, 'r_emb'): + if hasattr(m, "r_emb"): self._init_weight(m.r_emb) - if hasattr(m, 'r_w_bias'): + if hasattr(m, "r_w_bias"): self._init_weight(m.r_w_bias) - if hasattr(m, 'r_r_bias'): + if hasattr(m, "r_r_bias"): self._init_weight(m.r_r_bias) - if hasattr(m, 'r_bias'): + if hasattr(m, "r_bias"): self._init_bias(m.r_bias) @@ -559,8 +561,12 @@ TRANSFO_XL_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", - TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + TRANSFO_XL_START_DOCSTRING, + TRANSFO_XL_INPUTS_DOCSTRING, +) class TransfoXLModel(TransfoXLPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -587,6 +593,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): last_hidden_states, mems = outputs[:2] """ + def __init__(self, config): super(TransfoXLModel, self).__init__(config) self.output_attentions = config.output_attentions @@ -599,8 +606,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel): self.n_head = config.n_head self.d_head = config.d_head - self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, - div_val=config.div_val) + self.word_emb = AdaptiveEmbedding( + config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val + ) self.drop = nn.Dropout(config.dropout) @@ -618,27 +626,35 @@ class TransfoXLModel(TransfoXLPreTrainedModel): self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) self.layers = nn.ModuleList() - if config.attn_type == 0: # the default attention + if config.attn_type == 0: # the default attention for i in range(config.n_layer): self.layers.append( RelPartialLearnableDecoderLayer( - config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout, - tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len, - dropatt=config.dropatt, pre_lnorm=config.pre_lnorm, + config.n_head, + config.d_model, + config.d_head, + config.d_inner, + config.dropout, + tgt_len=config.tgt_len, + ext_len=config.ext_len, + mem_len=config.mem_len, + dropatt=config.dropatt, + pre_lnorm=config.pre_lnorm, r_w_bias=None if config.untie_r else self.r_w_bias, r_r_bias=None if config.untie_r else self.r_r_bias, output_attentions=self.output_attentions, - layer_norm_epsilon=config.layer_norm_epsilon) + layer_norm_epsilon=config.layer_norm_epsilon, + ) ) - else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints + else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints raise NotImplementedError # Removed them to avoid maintaining dead code self.same_length = config.same_length self.clamp_len = config.clamp_len - if self.attn_type == 0: # default attention + if self.attn_type == 0: # default attention self.pos_emb = PositionalEmbedding(self.d_model) - else: # learnable embeddings and absolute embeddings + else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint self.init_weights() @@ -666,8 +682,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): mems = [] param = next(self.parameters()) for i in range(self.n_layer): - empty = torch.zeros(self.mem_len, bsz, self.config.d_model, - dtype=param.dtype, device=param.device) + empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device) mems.append(empty) return mems @@ -676,10 +691,11 @@ class TransfoXLModel(TransfoXLPreTrainedModel): def _update_mems(self, hids, mems, qlen, mlen): # does not deal with None - if mems is None: return None + if mems is None: + return None # mems is not None - assert len(hids) == len(mems), 'len(hids) != len(mems)' + assert len(hids) == len(mems), "len(hids) != len(mems)" # There are `mlen + qlen` steps that can be cached into mems # For the next step, the last `ext_len` of the `qlen` tokens @@ -725,7 +741,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel): head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layer @@ -743,17 +761,16 @@ class TransfoXLModel(TransfoXLPreTrainedModel): mask_shift_len = qlen - mask_len else: mask_shift_len = qlen - dec_attn_mask = (torch.triu(all_ones, 1+mlen) - + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 + dec_attn_mask = (torch.triu(all_ones, 1 + mlen) + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 else: - dec_attn_mask = torch.triu( - word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] + dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1 + mlen)[ + :, :, None + ] hids = [] attentions = [] - if self.attn_type == 0: # default - pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, - dtype=word_emb.dtype) + if self.attn_type == 0: # default + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype) if self.clamp_len > 0: pos_seq.clamp_(max=self.clamp_len) pos_emb = self.pos_emb(pos_seq) @@ -764,12 +781,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel): for i, layer in enumerate(self.layers): hids.append(core_out) mems_i = None if mems is None else mems[i] - layer_outputs = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, - mems=mems_i, head_mask=head_mask[i]) + layer_outputs = layer( + core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i, head_mask=head_mask[i] + ) core_out = layer_outputs[0] if self.output_attentions: attentions.append(layer_outputs[1]) - else: # learnable embeddings and absolute embeddings + else: # learnable embeddings and absolute embeddings raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint core_out = self.drop(core_out) @@ -791,9 +809,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel): return outputs # last hidden state, new_mems, (all hidden states), (all attentions) -@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top +@add_start_docstrings( + """The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive input embeddings)""", - TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) + TRANSFO_XL_START_DOCSTRING, + TRANSFO_XL_INPUTS_DOCSTRING, +) class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -830,6 +851,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): prediction_scores, mems = outputs[:2] """ + def __init__(self, config): super(TransfoXLLMHeadModel, self).__init__(config) self.transformer = TransfoXLModel(config) @@ -840,8 +862,9 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax) # use adaptive softmax (including standard softmax) else: - self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model, - config.cutoffs, div_val=config.div_val) + self.crit = ProjectedAdaptiveLogSoftmax( + config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val + ) self.init_weights() def tie_weights(self): @@ -856,8 +879,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): else: if self.config.tie_weight: for i in range(len(self.crit.out_layers)): - self._tie_or_clone_weights(self.crit.out_layers[i], - self.transformer.word_emb.emb_layers[i]) + self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i]) if self.config.tie_projs: for i, tie_proj in enumerate(self.config.tie_projs): if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: diff --git a/transformers/modeling_transfo_xl_utilities.py b/transformers/modeling_transfo_xl_utilities.py index 0773d0d5fc..89451bb558 100644 --- a/transformers/modeling_transfo_xl_utilities.py +++ b/transformers/modeling_transfo_xl_utilities.py @@ -28,9 +28,9 @@ import torch.nn.functional as F # CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) # CUDA_MINOR = int(torch.version.cuda.split('.')[1]) + class ProjectedAdaptiveLogSoftmax(nn.Module): - def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, - keep_order=False): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False): super(ProjectedAdaptiveLogSoftmax, self).__init__() self.n_token = n_token @@ -55,23 +55,19 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): if div_val == 1: for i in range(len(self.cutoffs)): if d_proj != d_embed: - self.out_projs.append( - nn.Parameter(torch.FloatTensor(d_proj, d_embed)) - ) + self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) else: self.out_projs.append(None) self.out_layers.append(nn.Linear(d_embed, n_token)) else: for i in range(len(self.cutoffs)): - l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] d_emb_i = d_embed // (div_val ** i) - self.out_projs.append( - nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)) - ) + self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) - self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) + self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx)) self.keep_order = keep_order @@ -90,7 +86,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): return logit def forward(self, hidden, labels=None, keep_order=False): - ''' + """ Params: hidden :: [len*bsz x d_proj] labels :: [len*bsz] @@ -102,20 +98,17 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): We could replace this implementation by the native PyTorch one if their's had an option to set bias on all clusters in the native one. here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138 - ''' + """ if labels is not None: labels = labels.view(-1) if hidden.size(0) != labels.size(0): - raise RuntimeError('Input and labels should have the same size ' - 'in the batch dimension.') + raise RuntimeError("Input and labels should have the same size " "in the batch dimension.") if self.n_clusters == 0: - logit = self._compute_logit(hidden, self.out_layers[0].weight, - self.out_layers[0].bias, self.out_projs[0]) + logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) if labels is not None: - out = -F.log_softmax(logit, dim=-1) \ - .gather(1, labels.unsqueeze(1)).squeeze(1) + out = -F.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1) else: out = F.log_softmax(logit, dim=-1) else: @@ -131,10 +124,8 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): bias_i = self.out_layers[i].bias if i == 0: - weight_i = torch.cat( - [weight_i, self.cluster_weight], dim=0) - bias_i = torch.cat( - [bias_i, self.cluster_bias], dim=0) + weight_i = torch.cat([weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat([bias_i, self.cluster_bias], dim=0) weights.append(weight_i) biases.append(bias_i) @@ -171,7 +162,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): if labels is not None: logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1) else: - out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] + out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]] else: weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] @@ -179,22 +170,22 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster if labels is not None: - logprob_i = head_logprob_i[:, cluster_prob_idx] \ - + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1) + logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather( + 1, target_i[:, None] + ).squeeze(1) else: logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i out[:, l_idx:r_idx] = logprob_i if labels is not None: - if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + if (hasattr(self, "keep_order") and self.keep_order) or keep_order: out.index_copy_(0, indices_i, -logprob_i) else: - out[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + out[offset : offset + logprob_i.size(0)].copy_(-logprob_i) offset += logprob_i.size(0) return out - def log_prob(self, hidden): r""" Computes log probabilities for all :math:`n\_classes` From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py @@ -209,8 +200,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): - Output: :math:`(N, n\_classes)` """ if self.n_clusters == 0: - logit = self._compute_logit(hidden, self.out_layers[0].weight, - self.out_layers[0].bias, self.out_projs[0]) + logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) return F.log_softmax(logit, dim=-1) else: # construct weights and biases @@ -225,10 +215,8 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): bias_i = self.out_layers[i].bias if i == 0: - weight_i = torch.cat( - [weight_i, self.cluster_weight], dim=0) - bias_i = torch.cat( - [bias_i, self.cluster_bias], dim=0) + weight_i = torch.cat([weight_i, self.cluster_weight], dim=0) + bias_i = torch.cat([bias_i, self.cluster_bias], dim=0) weights.append(weight_i) biases.append(bias_i) @@ -244,7 +232,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1] if i == 0: - out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]] + out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]] else: weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] @@ -270,10 +258,10 @@ class LogUniformSampler(object): """ with torch.no_grad(): self.range_max = range_max - log_indices = torch.arange(1., range_max+2., 1.).log_() + log_indices = torch.arange(1.0, range_max + 2.0, 1.0).log_() self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1] - self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() + self.log_q = (-(-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float() self.n_sample = n_sample @@ -298,6 +286,7 @@ class LogUniformSampler(object): samp_log_probs = self.log_q[neg_samples].to(device) return true_log_probs, samp_log_probs, neg_samples + def sample_logits(embedding, bias, labels, inputs, sampler): """ embedding: an nn.Embedding layer @@ -313,19 +302,17 @@ def sample_logits(embedding, bias, labels, inputs, sampler): b1, b2 = labels.size(0), labels.size(1) all_ids = torch.cat([labels.view(-1), neg_samples]) all_w = embedding(all_ids) - true_w = all_w[: -n_sample].view(b1, b2, -1) - sample_w = all_w[- n_sample:].view(n_sample, -1) + true_w = all_w[:-n_sample].view(b1, b2, -1) + sample_w = all_w[-n_sample:].view(n_sample, -1) all_b = bias[all_ids] - true_b = all_b[: -n_sample].view(b1, b2) - sample_b = all_b[- n_sample:] + true_b = all_b[:-n_sample].view(b1, b2) + sample_b = all_b[-n_sample:] hit = (labels[:, :, None] == neg_samples).detach() - true_logits = torch.einsum('ijk,ijk->ij', - [true_w, inputs]) + true_b - true_log_probs - sample_logits = torch.einsum('lk,ijk->ijl', - [sample_w, inputs]) + sample_b - samp_log_probs + true_logits = torch.einsum("ijk,ijk->ij", [true_w, inputs]) + true_b - true_log_probs + sample_logits = torch.einsum("lk,ijk->ijl", [sample_w, inputs]) + sample_b - samp_log_probs sample_logits.masked_fill_(hit, -1e30) logits = torch.cat([true_logits[:, :, None], sample_logits], -1) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 05e5ed3573..e934b90528 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -15,8 +15,7 @@ # limitations under the License. """PyTorch BERT model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import copy import json @@ -31,8 +30,15 @@ from torch.nn import CrossEntropyLoss from torch.nn import functional as F from .configuration_utils import PretrainedConfig -from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS, - cached_path, hf_bucket_url, is_remote_url) +from .file_utils import ( + TF2_WEIGHTS_NAME, + TF_WEIGHTS_NAME, + WEIGHTS_NAME, + DUMMY_INPUTS, + cached_path, + hf_bucket_url, + is_remote_url, +) logger = logging.getLogger(__name__) @@ -43,12 +49,14 @@ except ImportError: class Identity(nn.Module): r"""A placeholder identity operator that is argument-insensitive. """ + def __init__(self, *args, **kwargs): super(Identity, self).__init__() def forward(self, input): return input + class PreTrainedModel(nn.Module): r""" Base class for all models. @@ -78,7 +86,7 @@ class PreTrainedModel(nn.Module): Returns: torch.Tensor with dummy inputs """ - return {'input_ids': torch.tensor(DUMMY_INPUTS)} + return {"input_ids": torch.tensor(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): super(PreTrainedModel, self).__init__() @@ -88,7 +96,8 @@ class PreTrainedModel(nn.Module): "To create a model from a pretrained model use " "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( self.__class__.__name__, self.__class__.__name__ - )) + ) + ) # Save config in model self.config = config @@ -136,14 +145,14 @@ class PreTrainedModel(nn.Module): else: output_embeddings.weight = input_embeddings.weight - if hasattr(output_embeddings, 'bias') and output_embeddings.bias is not None: + if hasattr(output_embeddings, "bias") and output_embeddings.bias is not None: output_embeddings.bias.data = torch.nn.functional.pad( output_embeddings.bias.data, (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0]), - 'constant', - 0 + "constant", + 0, ) - if hasattr(output_embeddings, 'out_features') and hasattr(input_embeddings, 'num_embeddings'): + if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): output_embeddings.out_features = input_embeddings.num_embeddings def resize_token_embeddings(self, new_num_tokens=None): @@ -244,10 +253,12 @@ class PreTrainedModel(nn.Module): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. """ - assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved" + assert os.path.isdir( + save_directory + ), "Saving path should be a directory where the model and configuration can be saved" # Only save the model itself if we are using distributed training - model_to_save = self.module if hasattr(self, 'module') else self + model_to_save = self.module if hasattr(self, "module") else self # Save configuration file model_to_save.config.save_pretrained(save_directory) @@ -329,21 +340,23 @@ class PreTrainedModel(nn.Module): model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - config = kwargs.pop('config', None) - state_dict = kwargs.pop('state_dict', None) - cache_dir = kwargs.pop('cache_dir', None) - from_tf = kwargs.pop('from_tf', False) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) - proxies = kwargs.pop('proxies', None) - output_loading_info = kwargs.pop('output_loading_info', False) + config = kwargs.pop("config", None) + state_dict = kwargs.pop("state_dict", None) + cache_dir = kwargs.pop("cache_dir", None) + from_tf = kwargs.pop("from_tf", False) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) # Load config if we don't provide a configuration if not isinstance(config, PretrainedConfig): config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( - config_path, *model_args, - cache_dir=cache_dir, return_unused_kwargs=True, + config_path, + *model_args, + cache_dir=cache_dir, + return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, proxies=proxies, @@ -367,43 +380,56 @@ class PreTrainedModel(nn.Module): # Load from a PyTorch checkpoint archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) else: - raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format( - [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], - pretrained_model_name_or_path)) + raise EnvironmentError( + "Error no file named {} found in directory {} or `from_tf` set to False".format( + [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], pretrained_model_name_or_path + ) + ) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): - assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( - pretrained_model_name_or_path + ".index") + assert ( + from_tf + ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( + pretrained_model_name_or_path + ".index" + ) archive_file = pretrained_model_name_or_path + ".index" else: archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME) if from_tf: - raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.") + raise EnvironmentError( + "Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name." + ) # redirect to the cache, if necessary try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, - proxies=proxies, resume_download=resume_download) + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + ) except EnvironmentError: if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - msg = "Couldn't reach server at '{}' to download pretrained weights.".format( - archive_file) + msg = "Couldn't reach server at '{}' to download pretrained weights.".format(archive_file) else: - msg = "Model name '{}' was not found in model name list ({}). " \ - "We assumed '{}' was a path or url to model weight files named one of {} but " \ + msg = ( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url to model weight files named one of {} but " "couldn't find any such file at this path or url.".format( pretrained_model_name_or_path, - ', '.join(cls.pretrained_model_archive_map.keys()), + ", ".join(cls.pretrained_model_archive_map.keys()), archive_file, - [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME]) + [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME], + ) + ) raise EnvironmentError(msg) if resolved_archive_file == archive_file: logger.info("loading weights file {}".format(archive_file)) else: - logger.info("loading weights file {} from cache at {}".format( - archive_file, resolved_archive_file)) + logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) else: resolved_archive_file = None @@ -412,27 +438,32 @@ class PreTrainedModel(nn.Module): if state_dict is None and not from_tf: try: - state_dict = torch.load(resolved_archive_file, map_location='cpu') + state_dict = torch.load(resolved_archive_file, map_location="cpu") except: - raise OSError("Unable to load weights from pytorch checkpoint file. " - "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ") + raise OSError( + "Unable to load weights from pytorch checkpoint file. " + "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. " + ) missing_keys = [] unexpected_keys = [] error_msgs = [] if from_tf: - if resolved_archive_file.endswith('.index'): + if resolved_archive_file.endswith(".index"): # Load from a TensorFlow 1.X checkpoint - provided by original authors model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' else: # Load from our TensorFlow 2.0 checkpoints try: from transformers import load_tf2_checkpoint_in_pytorch_model + model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True) except ImportError as e: - logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) raise e else: # Convert old format to new format if needed from a PyTorch state_dict @@ -440,10 +471,10 @@ class PreTrainedModel(nn.Module): new_keys = [] for key in state_dict.keys(): new_key = None - if 'gamma' in key: - new_key = key.replace('gamma', 'weight') - if 'beta' in key: - new_key = key.replace('beta', 'bias') + if "gamma" in key: + new_key = key.replace("gamma", "weight") + if "beta" in key: + new_key = key.replace("beta", "bias") if new_key: old_keys.append(key) new_keys.append(new_key) @@ -451,39 +482,53 @@ class PreTrainedModel(nn.Module): state_dict[new_key] = state_dict.pop(old_key) # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, '_metadata', None) + metadata = getattr(state_dict, "_metadata", None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants # so we need to apply the function recursively. - def load(module, prefix=''): + def load(module, prefix=""): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs + ) for name, child in module._modules.items(): if child is not None: - load(child, prefix + name + '.') + load(child, prefix + name + ".") # Make sure we are able to load base models as well as derived models (with heads) - start_prefix = '' + start_prefix = "" model_to_load = model - if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): - start_prefix = cls.base_model_prefix + '.' - if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()): + if not hasattr(model, cls.base_model_prefix) and any( + s.startswith(cls.base_model_prefix) for s in state_dict.keys() + ): + start_prefix = cls.base_model_prefix + "." + if hasattr(model, cls.base_model_prefix) and not any( + s.startswith(cls.base_model_prefix) for s in state_dict.keys() + ): model_to_load = getattr(model, cls.base_model_prefix) load(model_to_load, prefix=start_prefix) if len(missing_keys) > 0: - logger.info("Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys)) + logger.info( + "Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys + ) + ) if len(unexpected_keys) > 0: - logger.info("Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys)) + logger.info( + "Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys + ) + ) if len(error_msgs) > 0: - raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, "\n\t".join(error_msgs))) + raise RuntimeError( + "Error(s) in loading state_dict for {}:\n\t{}".format( + model.__class__.__name__, "\n\t".join(error_msgs) + ) + ) model.tie_weights() # make sure word embedding weights are still tied if needed @@ -500,10 +545,22 @@ class PreTrainedModel(nn.Module): return {"input_ids": input_ids} @torch.no_grad() - def generate(self, input_ids=None, max_length=None, do_sample=None, num_beams=None, - temperature=None, top_k=None, top_p=None, repetition_penalty=None, - bos_token_id=None, pad_token_id=None, eos_token_ids=None, - length_penalty=None, num_return_sequences=None): + def generate( + self, + input_ids=None, + max_length=None, + do_sample=None, + num_beams=None, + temperature=None, + top_k=None, + top_p=None, + repetition_penalty=None, + bos_token_id=None, + pad_token_id=None, + eos_token_ids=None, + length_penalty=None, + num_return_sequences=None, + ): """ Sequence generator for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling @@ -543,8 +600,10 @@ class PreTrainedModel(nn.Module): # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: - raise AttributeError("You tried to generate sequences with a model that does not have a LM Head." - "Please use another model class (e.g. `OpenAIGPTLMHeadModel`)") + raise AttributeError( + "You tried to generate sequences with a model that does not have a LM Head." + "Please use another model class (e.g. `OpenAIGPTLMHeadModel`)" + ) max_length = max_length if max_length is not None else self.config.max_length do_sample = do_sample if do_sample is not None else self.config.do_sample @@ -557,7 +616,9 @@ class PreTrainedModel(nn.Module): pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.eos_token_ids length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty - num_return_sequences = num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) if input_ids is not None: batch_size = input_ids.shape[0] # overriden by the input batch_size @@ -575,13 +636,18 @@ class PreTrainedModel(nn.Module): assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." assert isinstance(bos_token_id, int) and bos_token_id >= 0, "`bos_token_id` should be a positive integer." assert isinstance(pad_token_id, int) and pad_token_id >= 0, "`pad_token_id` should be a positive integer." - assert isinstance(eos_token_ids, (list, tuple)) and (e >= 0 for e in eos_token_ids), \ - "`eos_token_ids` should be a positive integer or a list/tuple of positive integers." + assert isinstance(eos_token_ids, (list, tuple)) and ( + e >= 0 for e in eos_token_ids + ), "`eos_token_ids` should be a positive integer or a list/tuple of positive integers." assert length_penalty > 0, "`length_penalty` should be strictely positive." - assert isinstance(num_return_sequences, int) and num_return_sequences > 0, "`num_return_sequences` should be a strictely positive integer." + assert ( + isinstance(num_return_sequences, int) and num_return_sequences > 0 + ), "`num_return_sequences` should be a strictely positive integer." if input_ids is None: - input_ids = torch.full((batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device) + input_ids = torch.full( + (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device + ) else: assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." @@ -592,28 +658,63 @@ class PreTrainedModel(nn.Module): if num_return_sequences != 1: # Expand input to num return sequences input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len) - input_ids = input_ids.contiguous().view(batch_size * num_return_sequences, cur_len) # (batch_size * num_return_sequences, cur_len) + input_ids = input_ids.contiguous().view( + batch_size * num_return_sequences, cur_len + ) # (batch_size * num_return_sequences, cur_len) effective_batch_size = batch_size * num_return_sequences else: effective_batch_size = batch_size if num_beams > 1: - output = self._generate_beam_search(input_ids, cur_len, max_length, do_sample, - temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, effective_batch_size, - length_penalty, num_beams, vocab_size) + output = self._generate_beam_search( + input_ids, + cur_len, + max_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + pad_token_id, + eos_token_ids, + effective_batch_size, + length_penalty, + num_beams, + vocab_size, + ) else: - output = self._generate_no_beam_search(input_ids, cur_len, max_length, do_sample, - temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, effective_batch_size) + output = self._generate_no_beam_search( + input_ids, + cur_len, + max_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + pad_token_id, + eos_token_ids, + effective_batch_size, + ) if num_return_sequences != 1: output = output.view(batch_size, num_return_sequences, -1) return output - def _generate_no_beam_search(self, input_ids, cur_len, max_length, do_sample, - temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size): + def _generate_no_beam_search( + self, + input_ids, + cur_len, + max_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + pad_token_id, + eos_token_ids, + batch_size, + ): """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. """ @@ -663,23 +764,38 @@ class PreTrainedModel(nn.Module): return input_ids - def _generate_beam_search(self, input_ids, cur_len, max_length, do_sample, - temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size, - length_penalty, num_beams, vocab_size): + def _generate_beam_search( + self, + input_ids, + cur_len, + max_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + pad_token_id, + eos_token_ids, + batch_size, + length_penalty, + num_beams, + vocab_size, + ): """ Generate sequences for each example with beam search. """ # Expand input to num beams input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len) - input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len) # (batch_size * num_beams, cur_len) + input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len) # (batch_size * num_beams, cur_len) # generated hypotheses - generated_hyps = [BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size)] + generated_hyps = [ + BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size) + ] # scores for each sentence in the beam beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) + beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) # cache compute states pasts = None # self.prepare_pasts() @@ -689,8 +805,8 @@ class PreTrainedModel(nn.Module): while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts) - scores = self(**model_inputs)[0] # (batch_size * num_beams, cur_len, vocab_size) - scores = scores[:, -1, :] # (batch_size * num_beams, vocab_size) + scores = self(**model_inputs)[0] # (batch_size * num_beams, cur_len, vocab_size) + scores = scores[:, -1, :] # (batch_size * num_beams, vocab_size) # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: @@ -703,25 +819,27 @@ class PreTrainedModel(nn.Module): if temperature > 0 and temperature != 1.0: scores = scores / temperature # Top-p/top-k filtering - scores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2) # (batch_size * num_beams, vocab_size) + scores = top_k_top_p_filtering( + scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 + ) # (batch_size * num_beams, vocab_size) # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search) - next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2) # (batch_size * num_beams, 2) + next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2) # (batch_size * num_beams, 2) # Compute next scores - _scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) - _scores = torch.gather(_scores, -1, next_words) # (batch_size * num_beams, 2) - next_scores = _scores + beam_scores[:, None].expand_as(_scores) # (batch_size * num_beams, 2) + _scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) + _scores = torch.gather(_scores, -1, next_words) # (batch_size * num_beams, 2) + next_scores = _scores + beam_scores[:, None].expand_as(_scores) # (batch_size * num_beams, 2) # Match shape of greedy beam search - next_words = next_words.view(batch_size, 2 * num_beams) # (batch_size, 2 * num_beams) - next_scores = next_scores.view(batch_size, 2 * num_beams) # (batch_size, 2 * num_beams) + next_words = next_words.view(batch_size, 2 * num_beams) # (batch_size, 2 * num_beams) + next_scores = next_scores.view(batch_size, 2 * num_beams) # (batch_size, 2 * num_beams) else: # do greedy beam search - scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) + scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) assert scores.size() == (batch_size * num_beams, vocab_size) # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) - _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) + _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) # re-organize to group the beam together (we are keeping top hypothesis accross beams) - _scores = _scores.view(batch_size, num_beams * vocab_size) # (batch_size, num_beams * vocab_size) - next_scores, next_words = torch.topk(_scores, 2*num_beams, dim=1, largest=True, sorted=True) + _scores = _scores.view(batch_size, num_beams * vocab_size) # (batch_size, num_beams * vocab_size) + next_scores, next_words = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True) assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams) @@ -750,7 +868,9 @@ class PreTrainedModel(nn.Module): # end of sentence, or next word if word_id.item() in eos_token_ids or cur_len + 1 == max_length: - generated_hyps[batch_ex].add(input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item()) + generated_hyps[batch_ex].add( + input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item() + ) else: next_sent_beam.append((score, word_id, batch_ex * num_beams + beam_id)) @@ -807,13 +927,13 @@ class PreTrainedModel(nn.Module): # generate target batch decoded = input_ids.new(batch_size, tgt_len.max().item()).fill_(pad_token_id) for i, hypo in enumerate(best): - decoded[i, :tgt_len[i] - 1] = hypo + decoded[i, : tgt_len[i] - 1] = hypo decoded[i, tgt_len[i] - 1] = eos_token_ids[0] return decoded -def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf'), min_tokens_to_keep=1): +def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: logits: logits distribution shape (batch size, vocabulary size) @@ -849,7 +969,6 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf') class BeamHypotheses(object): - def __init__(self, n_hyp, max_length, length_penalty, early_stopping): """ Initialize n-best list of hypotheses. @@ -915,6 +1034,7 @@ class Conv1D(nn.Module): class PoolerStartLogits(nn.Module): """ Compute SQuAD start_logits from sequence hidden states. """ + def __init__(self, config): super(PoolerStartLogits, self).__init__() self.dense = nn.Linear(config.hidden_size, 1) @@ -939,6 +1059,7 @@ class PoolerStartLogits(nn.Module): class PoolerEndLogits(nn.Module): """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. """ + def __init__(self, config): super(PoolerEndLogits, self).__init__() self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) @@ -959,12 +1080,14 @@ class PoolerEndLogits(nn.Module): Mask of invalid position such as query and special symbols (PAD, SEP, CLS) 1.0 means token should be masked. """ - assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" + assert ( + start_states is not None or start_positions is not None + ), "One of start_states, start_positions should be not None" if start_positions is not None: slen, hsz = hidden_states.shape[-2:] - start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) - start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) + start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) x = self.activation(x) @@ -982,6 +1105,7 @@ class PoolerEndLogits(nn.Module): class PoolerAnswerClass(nn.Module): """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ + def __init__(self, config): super(PoolerAnswerClass, self).__init__() self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) @@ -1006,16 +1130,18 @@ class PoolerAnswerClass(nn.Module): for each sample """ hsz = hidden_states.shape[-1] - assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" + assert ( + start_states is not None or start_positions is not None + ), "One of start_states, start_positions should be not None" if start_positions is not None: - start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) if cls_index is not None: - cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) + cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) else: - cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) + cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) x = self.activation(x) @@ -1064,6 +1190,7 @@ class SQuADHead(nn.Module): ``torch.FloatTensor`` of shape ``(batch_size,)`` Log probabilities for the ``is_impossible`` label of the answers. """ + def __init__(self, config): super(SQuADHead, self).__init__() self.start_n_top = config.start_n_top @@ -1073,8 +1200,9 @@ class SQuADHead(nn.Module): self.end_logits = PoolerEndLogits(config) self.answer_class = PoolerAnswerClass(config) - def forward(self, hidden_states, start_positions=None, end_positions=None, - cls_index=None, is_impossible=None, p_mask=None): + def forward( + self, hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None + ): outputs = () start_logits = self.start_logits(hidden_states, p_mask=p_mask) @@ -1107,19 +1235,25 @@ class SQuADHead(nn.Module): else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() - start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) + start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) - start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) - start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) - start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) - start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + start_top_log_probs, start_top_index = torch.topk( + start_log_probs, self.start_n_top, dim=-1 + ) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) - hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( + start_states + ) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) - end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) - end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top) + end_top_log_probs, end_top_index = torch.topk( + end_log_probs, self.end_n_top, dim=1 + ) # shape (bsz, end_n_top, start_n_top) end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) @@ -1148,34 +1282,35 @@ class SequenceSummary(nn.Module): summary_first_dropout: Add a dropout before the projection and activation summary_last_dropout: Add a dropout after the projection and activation """ + def __init__(self, config): super(SequenceSummary, self).__init__() - self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last' - if self.summary_type == 'attn': + self.summary_type = config.summary_type if hasattr(config, "summary_type") else "last" + if self.summary_type == "attn": # We should use a standard multi-head attention module with absolute positional embedding for that. # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 # We can probably just use the multi-head attention module of PyTorch >=1.1.0 raise NotImplementedError self.summary = Identity() - if hasattr(config, 'summary_use_proj') and config.summary_use_proj: - if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0: + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: num_classes = config.num_labels else: num_classes = config.hidden_size self.summary = nn.Linear(config.hidden_size, num_classes) self.activation = Identity() - if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh': + if hasattr(config, "summary_activation") and config.summary_activation == "tanh": self.activation = nn.Tanh() self.first_dropout = Identity() - if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0: + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: self.first_dropout = nn.Dropout(config.summary_first_dropout) self.last_dropout = Identity() - if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0: + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: self.last_dropout = nn.Dropout(config.summary_last_dropout) def forward(self, hidden_states, cls_index=None): @@ -1185,21 +1320,21 @@ class SequenceSummary(nn.Module): if summary_type == 'cls_index' and cls_index is None: we take the last token of the sequence as classification token """ - if self.summary_type == 'last': + if self.summary_type == "last": output = hidden_states[:, -1] - elif self.summary_type == 'first': + elif self.summary_type == "first": output = hidden_states[:, 0] - elif self.summary_type == 'mean': + elif self.summary_type == "mean": output = hidden_states.mean(dim=1) - elif self.summary_type == 'cls_index': + elif self.summary_type == "cls_index": if cls_index is None: - cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long) + cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long) else: cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) - cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states - output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) - elif self.summary_type == 'attn': + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": raise NotImplementedError output = self.first_dropout(output) diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py index 5135f1e884..cd758a0433 100644 --- a/transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -34,24 +34,21 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) XLM_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin", + "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin", + "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin", + "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin", + "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin", + "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin", + "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin", + "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin", + "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin", + "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", + "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin", } def create_sinusoidal_embeddings(n_pos, dim, out): - position_enc = np.array([ - [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] - for pos in range(n_pos) - ]) + position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]) out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) out.detach_() @@ -142,7 +139,7 @@ class MultiHeadAttention(nn.Module): # Mask is (bs, klen) (non-causal) or (bs, klen, klen) bs, qlen, dim = input.size() if kv is None: - klen = qlen if cache is None else cache['slen'] + qlen + klen = qlen if cache is None else cache["slen"] + qlen else: klen = kv.size(1) # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) @@ -158,39 +155,39 @@ class MultiHeadAttention(nn.Module): """ compute context """ return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) - q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: - k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) elif cache is None or self.layer_id not in cache: k = v = kv - k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) - v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) if cache is not None: if self.layer_id in cache: if kv is None: k_, v_ = cache[self.layer_id] - k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) - v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) + k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) + v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) else: k, v = cache[self.layer_id] cache[self.layer_id] = (k, v) - q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) - scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) - mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) - scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) + mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) + scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, qlen, klen) - weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) + weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: weights = weights * head_mask - context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) - context = unshape(context) # (bs, qlen, dim) + context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) outputs = (self.out_lin(context),) if self.output_attentions: @@ -199,7 +196,6 @@ class MultiHeadAttention(nn.Module): class TransformerFFN(nn.Module): - def __init__(self, in_dim, dim_hidden, out_dim, config): super(TransformerFFN, self).__init__() self.dropout = config.dropout @@ -219,6 +215,7 @@ class XLMPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XLMConfig pretrained_model_archive_map = XLM_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = None @@ -235,7 +232,7 @@ class XLMPreTrainedModel(PreTrainedModel): langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) else: langs_list = None - return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list} + return {"input_ids": inputs_list, "attention_mask": attns_list, "langs": langs_list} def _init_weights(self, module): """ Initialize the weights. """ @@ -245,8 +242,8 @@ class XLMPreTrainedModel(PreTrainedModel): if isinstance(module, nn.Linear): if self.config is not None and self.config.init_std is not None: nn.init.normal_(module.weight, mean=0, std=self.config.init_std) - if hasattr(module, 'bias') and module.bias is not None: - nn.init.constant_(module.bias, 0.) + if hasattr(module, "bias") and module.bias is not None: + nn.init.constant_(module.bias, 0.0) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) @@ -327,8 +324,12 @@ XLM_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.", - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare XLM Model transformer outputting raw hidden-states without any specific head on top.", + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class XLMModel(XLMPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -351,7 +352,8 @@ class XLMModel(XLMPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ - def __init__(self, config): #, dico, is_encoder, with_output): + + def __init__(self, config): # , dico, is_encoder, with_output): super(XLMModel, self).__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -377,13 +379,13 @@ class XLMModel(XLMPreTrainedModel): # assert len(self.id2lang) == len(self.lang2id) == self.n_langs # model parameters - self.dim = config.emb_dim # 512 by default + self.dim = config.emb_dim # 512 by default self.hidden_dim = self.dim * 4 # 2048 by default - self.n_heads = config.n_heads # 8 by default + self.n_heads = config.n_heads # 8 by default self.n_layers = config.n_layers self.dropout = config.dropout self.attention_dropout = config.attention_dropout - assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads' + assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads" # embeddings self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim) @@ -435,8 +437,18 @@ class XLMModel(XLMPreTrainedModel): for layer, heads in heads_to_prune.items(): self.attentions[layer].prune_heads(heads) - def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, - lengths=None, cache=None, head_mask=None, inputs_embeds=None): # removed: src_enc=None, src_len=None + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + ): # removed: src_enc=None, src_len=None if input_ids is not None: bs, slen = input_ids.size() else: @@ -446,7 +458,7 @@ class XLMModel(XLMPreTrainedModel): if input_ids is not None: lengths = (input_ids != self.pad_index).sum(dim=1).long() else: - lengths = torch.LongTensor([slen]*bs) + lengths = torch.LongTensor([slen] * bs) # mask = input_ids != self.pad_index # check inputs @@ -488,14 +500,18 @@ class XLMModel(XLMPreTrainedModel): head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) head_mask = head_mask.expand(self.n_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layers # do not recompute cached elements if cache is not None and input_ids is not None: - _slen = slen - cache['slen'] + _slen = slen - cache["slen"] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] if langs is not None: @@ -550,7 +566,7 @@ class XLMModel(XLMPreTrainedModel): # update cache length if cache is not None: - cache['slen'] += tensor.size(1) + cache["slen"] += tensor.size(1) # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) @@ -567,6 +583,7 @@ class XLMPredLayer(nn.Module): """ Prediction layer (cross_entropy or adaptive_softmax). """ + def __init__(self, config): super(XLMPredLayer, self).__init__() self.asm = config.asm @@ -593,7 +610,7 @@ class XLMPredLayer(nn.Module): scores = self.proj(x) outputs = (scores,) + outputs if y is not None: - loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction='elementwise_mean') + loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean") outputs = (loss,) + outputs else: scores = self.proj.log_prob(x) @@ -605,9 +622,12 @@ class XLMPredLayer(nn.Module): return outputs -@add_start_docstrings("""The XLM Model transformer with a language modeling head on top +@add_start_docstrings( + """The XLM Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class XLMWithLMHeadModel(XLMPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -639,6 +659,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(XLMWithLMHeadModel, self).__init__(config) self.transformer = XLMModel(config) @@ -661,17 +682,30 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): langs = None return {"input_ids": input_ids, "langs": langs} - def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, - lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - langs=langs, - token_type_ids=token_type_ids, - position_ids=position_ids, - lengths=lengths, - cache=cache, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) output = transformer_outputs[0] outputs = self.pred_layer(output, labels) @@ -680,9 +714,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): return outputs -@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class XLMForSequenceClassification(XLMPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -714,6 +751,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(XLMForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -723,17 +761,30 @@ class XLMForSequenceClassification(XLMPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, - lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - langs=langs, - token_type_ids=token_type_ids, - position_ids=position_ids, - lengths=lengths, - cache=cache, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) output = transformer_outputs[0] logits = self.sequence_summary(output) @@ -753,9 +804,12 @@ class XLMForSequenceClassification(XLMPreTrainedModel): return outputs -@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -799,6 +853,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): loss, start_scores, end_scores = outputs[:2] """ + def __init__(self, config): super(XLMForQuestionAnsweringSimple, self).__init__(config) @@ -807,17 +862,31 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, - lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - langs=langs, - token_type_ids=token_type_ids, - position_ids=position_ids, - lengths=lengths, - cache=cache, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = transformer_outputs[0] @@ -826,7 +895,10 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) - outputs = (start_logits, end_logits,) + outputs = ( + start_logits, + end_logits, + ) if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: @@ -849,9 +921,12 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): return outputs -@add_start_docstrings("""XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) + XLM_START_DOCSTRING, + XLM_INPUTS_DOCSTRING, +) class XLMForQuestionAnswering(XLMPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -895,6 +970,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): loss, start_scores, end_scores = outputs[:2] """ + def __init__(self, config): super(XLMForQuestionAnswering, self).__init__(config) @@ -903,23 +979,45 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, - lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, - is_impossible=None, cls_index=None, p_mask=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - langs=langs, - token_type_ids=token_type_ids, - position_ids=position_ids, - lengths=lengths, - cache=cache, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + is_impossible=None, + cls_index=None, + p_mask=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) output = transformer_outputs[0] - outputs = self.qa_outputs(output, start_positions=start_positions, end_positions=end_positions, - cls_index=cls_index, is_impossible=is_impossible, p_mask=p_mask) + outputs = self.qa_outputs( + output, + start_positions=start_positions, + end_positions=end_positions, + cls_index=cls_index, + is_impossible=is_impossible, + p_mask=p_mask, + ) outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py index 0bdce941a5..8f1ed6ec67 100644 --- a/transformers/modeling_xlm_roberta.py +++ b/transformers/modeling_xlm_roberta.py @@ -15,24 +15,29 @@ # limitations under the License. """PyTorch XLM-RoBERTa model. """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging -from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification +from .modeling_roberta import ( + RobertaModel, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaForMultipleChoice, + RobertaForTokenClassification, +) from .configuration_xlm_roberta import XLMRobertaConfig from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin", - 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin", - 'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin", - 'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin", - 'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin", - 'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin", + "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin", + "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin", + "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin", + "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin", + "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin", + "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin", } @@ -105,8 +110,12 @@ XLM_ROBERTA_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", - XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", + XLM_ROBERTA_START_DOCSTRING, + XLM_ROBERTA_INPUTS_DOCSTRING, +) class XLMRobertaModel(RobertaModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -154,8 +163,11 @@ class XLMRobertaModel(RobertaModel): pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""XLM-RoBERTa Model with a `language modeling` head on top. """, - XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) +@add_start_docstrings( + """XLM-RoBERTa Model with a `language modeling` head on top. """, + XLM_ROBERTA_START_DOCSTRING, + XLM_ROBERTA_INPUTS_DOCSTRING, +) class XLMRobertaForMaskedLM(RobertaForMaskedLM): r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -190,9 +202,12 @@ class XLMRobertaForMaskedLM(RobertaForMaskedLM): pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer +@add_start_docstrings( + """XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) + XLM_ROBERTA_START_DOCSTRING, + XLM_ROBERTA_INPUTS_DOCSTRING, +) class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -228,9 +243,12 @@ class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) + XLM_ROBERTA_START_DOCSTRING, + XLM_ROBERTA_INPUTS_DOCSTRING, +) class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -262,9 +280,12 @@ class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -@add_start_docstrings("""XLM-RoBERTa Model with a token classification head on top (a linear layer on top of +@add_start_docstrings( + """XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) + XLM_ROBERTA_START_DOCSTRING, + XLM_ROBERTA_INPUTS_DOCSTRING, +) class XLMRobertaForTokenClassification(RobertaForTokenClassification): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index 3109fd8cdf..2a210502d9 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -29,7 +29,14 @@ from torch import nn from torch.nn import functional as F from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits +from .modeling_utils import ( + PreTrainedModel, + prune_linear_layer, + SequenceSummary, + PoolerAnswerClass, + PoolerEndLogits, + PoolerStartLogits, +) from .configuration_xlnet import XLNetConfig from .file_utils import add_start_docstrings @@ -37,8 +44,8 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin", - 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin", + "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin", + "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin", } @@ -50,44 +57,53 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): tf_to_pt_map = {} - if hasattr(model, 'transformer'): - if hasattr(model, 'lm_loss'): + if hasattr(model, "transformer"): + if hasattr(model, "lm_loss"): # We will load also the output bias - tf_to_pt_map['model/lm_loss/bias'] = model.lm_loss.bias - if hasattr(model, 'sequence_summary') and 'model/sequnece_summary/summary/kernel' in tf_weights: + tf_to_pt_map["model/lm_loss/bias"] = model.lm_loss.bias + if hasattr(model, "sequence_summary") and "model/sequnece_summary/summary/kernel" in tf_weights: # We will load also the sequence summary - tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight - tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias - if hasattr(model, 'logits_proj') and config.finetuning_task is not None \ - and 'model/regression_{}/logit/kernel'.format(config.finetuning_task) in tf_weights: - tf_to_pt_map['model/regression_{}/logit/kernel'.format(config.finetuning_task)] = model.logits_proj.weight - tf_to_pt_map['model/regression_{}/logit/bias'.format(config.finetuning_task)] = model.logits_proj.bias + tf_to_pt_map["model/sequnece_summary/summary/kernel"] = model.sequence_summary.summary.weight + tf_to_pt_map["model/sequnece_summary/summary/bias"] = model.sequence_summary.summary.bias + if ( + hasattr(model, "logits_proj") + and config.finetuning_task is not None + and "model/regression_{}/logit/kernel".format(config.finetuning_task) in tf_weights + ): + tf_to_pt_map["model/regression_{}/logit/kernel".format(config.finetuning_task)] = model.logits_proj.weight + tf_to_pt_map["model/regression_{}/logit/bias".format(config.finetuning_task)] = model.logits_proj.bias # Now load the rest of the transformer model = model.transformer # Embeddings and output - tf_to_pt_map.update({'model/transformer/word_embedding/lookup_table': model.word_embedding.weight, - 'model/transformer/mask_emb/mask_emb': model.mask_emb}) + tf_to_pt_map.update( + { + "model/transformer/word_embedding/lookup_table": model.word_embedding.weight, + "model/transformer/mask_emb/mask_emb": model.mask_emb, + } + ) # Transformer blocks for i, b in enumerate(model.layer): layer_str = "model/transformer/layer_%d/" % i - tf_to_pt_map.update({ - layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight, - layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias, - layer_str + "rel_attn/o/kernel": b.rel_attn.o, - layer_str + "rel_attn/q/kernel": b.rel_attn.q, - layer_str + "rel_attn/k/kernel": b.rel_attn.k, - layer_str + "rel_attn/r/kernel": b.rel_attn.r, - layer_str + "rel_attn/v/kernel": b.rel_attn.v, - layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight, - layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias, - layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight, - layer_str + "ff/layer_1/bias": b.ff.layer_1.bias, - layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight, - layer_str + "ff/layer_2/bias": b.ff.layer_2.bias, - }) + tf_to_pt_map.update( + { + layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight, + layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias, + layer_str + "rel_attn/o/kernel": b.rel_attn.o, + layer_str + "rel_attn/q/kernel": b.rel_attn.q, + layer_str + "rel_attn/k/kernel": b.rel_attn.k, + layer_str + "rel_attn/r/kernel": b.rel_attn.r, + layer_str + "rel_attn/v/kernel": b.rel_attn.v, + layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight, + layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias, + layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight, + layer_str + "ff/layer_1/bias": b.ff.layer_1.bias, + layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight, + layer_str + "ff/layer_2/bias": b.ff.layer_2.bias, + } + ) # Relative positioning biases if config.untie_r: @@ -105,13 +121,17 @@ def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None): r_w_list = [model.r_w_bias] r_s_list = [model.r_s_bias] seg_embed_list = [model.seg_embed] - tf_to_pt_map.update({ - 'model/transformer/r_r_bias': r_r_list, - 'model/transformer/r_w_bias': r_w_list, - 'model/transformer/r_s_bias': r_s_list, - 'model/transformer/seg_embed': seg_embed_list}) + tf_to_pt_map.update( + { + "model/transformer/r_r_bias": r_r_list, + "model/transformer/r_w_bias": r_w_list, + "model/transformer/r_s_bias": r_s_list, + "model/transformer/seg_embed": seg_embed_list, + } + ) return tf_to_pt_map + def load_tf_weights_in_xlnet(model, config, tf_path): """ Load tf checkpoints in a pytorch model """ @@ -119,8 +139,10 @@ def load_tf_weights_in_xlnet(model, config, tf_path): import numpy as np import tensorflow as tf except ImportError: - logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + logger.error( + "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) raise # Load weights from TF model init_vars = tf.train.list_variables(tf_path) @@ -141,7 +163,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path): array = tf_weights[name] # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model - if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name): + if "kernel" in name and ("ff" in name or "summary" in name or "logit" in name): logger.info("Transposing") array = np.transpose(array) if isinstance(pointer, list): @@ -165,10 +187,10 @@ def load_tf_weights_in_xlnet(model, config, tf_path): logger.info("Initialize PyTorch weight {}".format(name)) pointer.data = torch.from_numpy(array) tf_weights.pop(name, None) - tf_weights.pop(name + '/Adam', None) - tf_weights.pop(name + '/Adam_1', None) + tf_weights.pop(name + "/Adam", None) + tf_weights.pop(name + "/Adam_1", None) - logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + logger.info("Weights not copied to PyTorch model: {}".format(", ".join(tf_weights.keys()))) return model @@ -199,7 +221,8 @@ class XLNetRelativeAttention(nn.Module): if config.d_model % config.n_head != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.d_model, config.n_head)) + "heads (%d)" % (config.d_model, config.n_head) + ) self.n_head = config.n_head self.d_head = config.d_head @@ -242,7 +265,7 @@ class XLNetRelativeAttention(nn.Module): x = x.reshape(x_size[0], x_size[1], x_size[3], x_size[2]) x = x[:, :, 1:, :] - x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3]-1) + x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3] - 1) # Note: the tensor-slice form was faster in my testing than torch.index_select # However, tracing doesn't like the nature of the slice, and if klen changes # during the run then it'll fail, whereas index_select will be fine. @@ -255,27 +278,27 @@ class XLNetRelativeAttention(nn.Module): """Core relative positional attention operations.""" # content based attention score - ac = torch.einsum('ibnd,jbnd->bnij', q_head + self.r_w_bias, k_head_h) + ac = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_w_bias, k_head_h) # position based attention score - bd = torch.einsum('ibnd,jbnd->bnij', q_head + self.r_r_bias, k_head_r) + bd = torch.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r) bd = self.rel_shift_bnij(bd, klen=ac.shape[3]) # segment based attention score if seg_mat is None: ef = 0 else: - ef = torch.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed) - ef = torch.einsum('ijbs,ibns->bnij', seg_mat, ef) + ef = torch.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed) + ef = torch.einsum("ijbs,ibns->bnij", seg_mat, ef) # merge attention scores and perform masking attn_score = (ac + bd + ef) * self.scale if attn_mask is not None: # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask if attn_mask.dtype == torch.float16: - attn_score = attn_score - 65500 * torch.einsum('ijbn->bnij', attn_mask) + attn_score = attn_score - 65500 * torch.einsum("ijbn->bnij", attn_mask) else: - attn_score = attn_score - 1e30 * torch.einsum('ijbn->bnij', attn_mask) + attn_score = attn_score - 1e30 * torch.einsum("ijbn->bnij", attn_mask) # attention probability attn_prob = F.softmax(attn_score, dim=3) @@ -283,20 +306,20 @@ class XLNetRelativeAttention(nn.Module): # Mask heads if we want to if head_mask is not None: - attn_prob = attn_prob * torch.einsum('ijbn->bnij', head_mask) + attn_prob = attn_prob * torch.einsum("ijbn->bnij", head_mask) # attention output - attn_vec = torch.einsum('bnij,jbnd->ibnd', attn_prob, v_head_h) + attn_vec = torch.einsum("bnij,jbnd->ibnd", attn_prob, v_head_h) if self.output_attentions: - return attn_vec, torch.einsum('bnij->ijbn', attn_prob) + return attn_vec, torch.einsum("bnij->ijbn", attn_prob) return attn_vec def post_attention(self, h, attn_vec, residual=True): """Post-attention processing.""" # post-attention projection (back to `d_model`) - attn_out = torch.einsum('ibnd,hnd->ibh', attn_vec, self.o) + attn_out = torch.einsum("ibnd,hnd->ibh", attn_vec, self.o) attn_out = self.dropout(attn_out) if residual: @@ -305,10 +328,7 @@ class XLNetRelativeAttention(nn.Module): return output - def forward(self, h, g, - attn_mask_h, attn_mask_g, - r, seg_mat, - mems=None, target_mapping=None, head_mask=None): + def forward(self, h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None): if g is not None: ###### Two-stream attention with relative positional encoding. # content based attention score @@ -318,21 +338,22 @@ class XLNetRelativeAttention(nn.Module): cat = h # content-based key head - k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k) + k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k) # content-based value head - v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v) + v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v) # position-based key head - k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r) + k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r) ##### h-stream # content-stream query head - q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q) + q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q) # core attention ops attn_vec_h = self.rel_attn_core( - q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask) + q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask + ) if self.output_attentions: attn_vec_h, attn_prob_h = attn_vec_h @@ -342,21 +363,23 @@ class XLNetRelativeAttention(nn.Module): ##### g-stream # query-stream query head - q_head_g = torch.einsum('ibh,hnd->ibnd', g, self.q) + q_head_g = torch.einsum("ibh,hnd->ibnd", g, self.q) # core attention ops if target_mapping is not None: - q_head_g = torch.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping) + q_head_g = torch.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping) attn_vec_g = self.rel_attn_core( - q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask) + q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask + ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g - attn_vec_g = torch.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping) + attn_vec_g = torch.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping) else: attn_vec_g = self.rel_attn_core( - q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask) + q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask + ) if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g @@ -375,16 +398,17 @@ class XLNetRelativeAttention(nn.Module): cat = h # content heads - q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q) - k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k) - v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v) + q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q) + k_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.k) + v_head_h = torch.einsum("ibh,hnd->ibnd", cat, self.v) # positional heads - k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r) + k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r) # core attention ops attn_vec = self.rel_attn_core( - q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask) + q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask + ) if self.output_attentions: attn_vec, attn_prob = attn_vec @@ -398,6 +422,7 @@ class XLNetRelativeAttention(nn.Module): outputs = outputs + (attn_prob,) return outputs + class XLNetFeedForward(nn.Module): def __init__(self, config): super(XLNetFeedForward, self).__init__() @@ -405,8 +430,9 @@ class XLNetFeedForward(nn.Module): self.layer_1 = nn.Linear(config.d_model, config.d_inner) self.layer_2 = nn.Linear(config.d_inner, config.d_model) self.dropout = nn.Dropout(config.dropout) - if isinstance(config.ff_activation, str) or \ - (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)): + if isinstance(config.ff_activation, str) or ( + sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) + ): self.activation_function = ACT2FN[config.ff_activation] else: self.activation_function = config.ff_activation @@ -421,6 +447,7 @@ class XLNetFeedForward(nn.Module): output = self.layer_norm(output + inp) return output + class XLNetLayer(nn.Module): def __init__(self, config): super(XLNetLayer, self).__init__() @@ -428,12 +455,20 @@ class XLNetLayer(nn.Module): self.ff = XLNetFeedForward(config) self.dropout = nn.Dropout(config.dropout) - def forward(self, output_h, output_g, - attn_mask_h, attn_mask_g, - r, seg_mat, mems=None, target_mapping=None, head_mask=None): - outputs = self.rel_attn(output_h, output_g, attn_mask_h, attn_mask_g, - r, seg_mat, mems=mems, target_mapping=target_mapping, - head_mask=head_mask) + def forward( + self, output_h, output_g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None + ): + outputs = self.rel_attn( + output_h, + output_g, + attn_mask_h, + attn_mask_g, + r, + seg_mat, + mems=mems, + target_mapping=target_mapping, + head_mask=head_mask, + ) output_h, output_g = outputs[:2] if output_g is not None: @@ -448,6 +483,7 @@ class XLNetPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. """ + config_class = XLNetConfig pretrained_model_archive_map = XLNET_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_xlnet @@ -466,12 +502,20 @@ class XLNetPreTrainedModel(PreTrainedModel): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, XLNetRelativeAttention): - for param in [module.q, module.k, module.v, module.o, module.r, - module.r_r_bias, module.r_s_bias, module.r_w_bias, - module.seg_embed]: + for param in [ + module.q, + module.k, + module.v, + module.o, + module.r, + module.r_r_bias, + module.r_s_bias, + module.r_w_bias, + module.seg_embed, + ]: param.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, XLNetModel): - module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range) + module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range) XLNET_START_DOCSTRING = r""" The XLNet model was proposed in @@ -564,8 +608,12 @@ XLNET_INPUTS_DOCSTRING = r""" than the model's internal embedding lookup matrix. """ -@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.", - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + +@add_start_docstrings( + "The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.", + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetModel(XLNetPreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -594,6 +642,7 @@ class XLNetModel(XLNetPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ + def __init__(self, config): super(XLNetModel, self).__init__(config) self.output_attentions = config.output_attentions @@ -658,18 +707,18 @@ class XLNetModel(XLNetPreTrainedModel): def cache_mem(self, curr_out, prev_mem): """cache hidden states into memory.""" if self.reuse_len is not None and self.reuse_len > 0: - curr_out = curr_out[:self.reuse_len] + curr_out = curr_out[: self.reuse_len] if prev_mem is None: - new_mem = curr_out[-self.mem_len:] + new_mem = curr_out[-self.mem_len :] else: - new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len:] + new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len :] return new_mem.detach() @staticmethod def positional_embedding(pos_seq, inv_freq, bsz=None): - sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq) + sinusoid_inp = torch.einsum("i,d->id", pos_seq, inv_freq) pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1) pos_emb = pos_emb[:, None, :] @@ -683,14 +732,14 @@ class XLNetModel(XLNetPreTrainedModel): freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float) inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model)) - if self.attn_type == 'bi': + if self.attn_type == "bi": # beg, end = klen - 1, -qlen beg, end = klen, -qlen - elif self.attn_type == 'uni': + elif self.attn_type == "uni": # beg, end = klen - 1, -1 beg, end = klen, -1 else: - raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type)) + raise ValueError("Unknown `attn_type` {}.".format(self.attn_type)) if self.bi_data: fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float) @@ -701,8 +750,8 @@ class XLNetModel(XLNetPreTrainedModel): bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) if bsz is not None: - fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2) - bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2) + fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2) + bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2) else: fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) @@ -717,8 +766,18 @@ class XLNetModel(XLNetPreTrainedModel): pos_emb = pos_emb.to(next(self.parameters())) return pos_emb - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + ): # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end # but we want a unified interface in the library with the batch size on the first dimension # so we move here the first dimension (batch) to the end @@ -739,7 +798,6 @@ class XLNetModel(XLNetPreTrainedModel): perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None - mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0 klen = mlen + qlen @@ -748,13 +806,13 @@ class XLNetModel(XLNetPreTrainedModel): ##### Attention mask # causal attention mask - if self.attn_type == 'uni': + if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) attn_mask = attn_mask[:, :, None, None] - elif self.attn_type == 'bi': + elif self.attn_type == "bi": attn_mask = None else: - raise ValueError('Unsupported attention type: {}'.format(self.attn_type)) + raise ValueError("Unsupported attention type: {}".format(self.attn_type)) # data mask: input mask & perm mask assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " @@ -799,9 +857,9 @@ class XLNetModel(XLNetPreTrainedModel): output_h = self.dropout(word_emb_k) if target_mapping is not None: word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1) - # else: # We removed the inp_q input which was same as target mapping - # inp_q_ext = inp_q[:, :, None] - # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k + # else: # We removed the inp_q input which was same as target mapping + # inp_q_ext = inp_q[:, :, None] + # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k output_g = self.dropout(word_emb_q) else: output_g = None @@ -836,7 +894,9 @@ class XLNetModel(XLNetPreTrainedModel): head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility else: head_mask = [None] * self.n_layer @@ -853,9 +913,17 @@ class XLNetModel(XLNetPreTrainedModel): if self.output_hidden_states: hidden_states.append((output_h, output_g) if output_g is not None else output_h) - outputs = layer_module(output_h, output_g, attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask, - r=pos_emb, seg_mat=seg_mat, mems=mems[i], target_mapping=target_mapping, - head_mask=head_mask[i]) + outputs = layer_module( + output_h, + output_g, + attn_mask_h=non_tgt_mask, + attn_mask_g=attn_mask, + r=pos_emb, + seg_mat=seg_mat, + mems=mems[i], + target_mapping=target_mapping, + head_mask=head_mask[i], + ) output_h, output_g = outputs[:2] if self.output_attentions: attentions.append(outputs[2]) @@ -881,7 +949,9 @@ class XLNetModel(XLNetPreTrainedModel): if self.output_attentions: if target_mapping is not None: # when target_mapping is provided, there are 2-tuple of attentions - attentions = tuple(tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions) + attentions = tuple( + tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions + ) else: attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions) outputs = outputs + (attentions,) @@ -889,9 +959,12 @@ class XLNetModel(XLNetPreTrainedModel): return outputs # outputs, (new_mems), (hidden_states), (attentions) -@add_start_docstrings("""XLNet Model with a language modeling head on top +@add_start_docstrings( + """XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings). """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetLMHeadModel(XLNetPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: @@ -934,6 +1007,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ + def __init__(self, config): super(XLNetLMHeadModel, self).__init__(config) self.attn_type = config.attn_type @@ -954,34 +1028,42 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): # Build permutation mask so that previous tokens don't see last token perm_mask = torch.zeros( - (input_ids.shape[0], input_ids.shape[1], input_ids.shape[1]), - dtype=torch.float, device=input_ids.device + (input_ids.shape[0], input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=input_ids.device ) perm_mask[:, :, -1] = 1.0 # We'll only predict the last token target_mapping = torch.zeros( - (input_ids.shape[0], 1, input_ids.shape[1]), - dtype=torch.float, device=input_ids.device + (input_ids.shape[0], 1, input_ids.shape[1]), dtype=torch.float, device=input_ids.device ) target_mapping[0, 0, -1] = 1.0 - return {"input_ids": input_ids, - "perm_mask": perm_mask, - "target_mapping": target_mapping - } + return {"input_ids": input_ids, "perm_mask": perm_mask, "target_mapping": target_mapping} - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - mems=mems, - perm_mask=perm_mask, - target_mapping=target_mapping, - token_type_ids=token_type_ids, - input_mask=input_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) logits = self.lm_loss(transformer_outputs[0]) @@ -990,16 +1072,18 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): if labels is not None: # Flatten the tokens loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, logits.size(-1)), - labels.view(-1)) + loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) outputs = (loss,) + outputs return outputs # return (loss), logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of +@add_start_docstrings( + """XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetForSequenceClassification(XLNetPreTrainedModel): r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1037,6 +1121,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): loss, logits = outputs[:2] """ + def __init__(self, config): super(XLNetForSequenceClassification, self).__init__(config) self.num_labels = config.num_labels @@ -1047,17 +1132,30 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - mems=mems, - perm_mask=perm_mask, - target_mapping=target_mapping, - token_type_ids=token_type_ids, - input_mask=input_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) output = transformer_outputs[0] output = self.sequence_summary(output) @@ -1077,10 +1175,13 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): return outputs # return (loss), logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of + +@add_start_docstrings( + """XLNet Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - XLNET_START_DOCSTRING, - XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetForTokenClassification(XLNetPreTrainedModel): r""" Inputs: @@ -1135,6 +1236,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): scores = outputs[0] """ + def __init__(self, config): super(XLNetForTokenClassification, self).__init__(config) self.num_labels = config.num_labels @@ -1144,18 +1246,31 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None): + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - mems=mems, - perm_mask=perm_mask, - target_mapping=target_mapping, - token_type_ids=token_type_ids, - input_mask=input_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] @@ -1177,9 +1292,12 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): return outputs # return (loss), logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of +@add_start_docstrings( + """XLNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RACE/SWAG tasks. """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetForMultipleChoice(XLNetPreTrainedModel): r""" Inputs: @@ -1239,6 +1357,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): loss, classification_scores = outputs[:2] """ + def __init__(self, config): super(XLNetForMultipleChoice, self).__init__(config) @@ -1248,9 +1367,19 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, token_type_ids=None, input_mask=None, attention_mask=None, - mems=None, perm_mask=None, target_mapping=None, - labels=None, head_mask=None, inputs_embeds=None): + def forward( + self, + input_ids=None, + token_type_ids=None, + input_mask=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + labels=None, + head_mask=None, + inputs_embeds=None, + ): num_choices = input_ids.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) @@ -1258,18 +1387,26 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None flat_input_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None - transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids, - input_mask=flat_input_mask, attention_mask=flat_attention_mask, - mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, - head_mask=head_mask, inputs_embeds=inputs_embeds) - + transformer_outputs = self.transformer( + flat_input_ids, + token_type_ids=flat_token_type_ids, + input_mask=flat_input_mask, + attention_mask=flat_attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) output = transformer_outputs[0] output = self.sequence_summary(output) logits = self.logits_proj(output) reshaped_logits = logits.view(-1, num_choices) - outputs = (reshaped_logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + outputs = (reshaped_logits,) + transformer_outputs[ + 1: + ] # Keep mems, hidden states, attentions if there are in it if labels is not None: loss_fct = CrossEntropyLoss() @@ -1279,9 +1416,12 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): return outputs # return (loss), logits, (mems), (hidden states), (attentions) -@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1325,6 +1465,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): loss, start_scores, end_scores = outputs[:2] """ + def __init__(self, config): super(XLNetForQuestionAnsweringSimple, self).__init__(config) self.num_labels = config.num_labels @@ -1334,19 +1475,32 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, - start_positions=None, end_positions=None): + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + ): - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - mems=mems, - perm_mask=perm_mask, - target_mapping=target_mapping, - token_type_ids=token_type_ids, - input_mask=input_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) sequence_output = outputs[0] @@ -1376,9 +1530,12 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): return outputs # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions) -@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +@add_start_docstrings( + """XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) + XLNET_START_DOCSTRING, + XLNET_INPUTS_DOCSTRING, +) class XLNetForQuestionAnswering(XLNetPreTrainedModel): r""" **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: @@ -1440,6 +1597,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): loss, start_scores, end_scores = outputs[:2] """ + def __init__(self, config): super(XLNetForQuestionAnswering, self).__init__(config) self.start_n_top = config.start_n_top @@ -1452,18 +1610,34 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): self.init_weights() - def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, - start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,): - transformer_outputs = self.transformer(input_ids, - attention_mask=attention_mask, - mems=mems, - perm_mask=perm_mask, - target_mapping=target_mapping, - token_type_ids=token_type_ids, - input_mask=input_mask, - head_mask=head_mask, - inputs_embeds=inputs_embeds) + def forward( + self, + input_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + is_impossible=None, + cls_index=None, + p_mask=None, + ): + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + ) hidden_states = transformer_outputs[0] start_logits = self.start_logits(hidden_states, p_mask=p_mask) @@ -1497,24 +1671,34 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() - start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) + start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) - start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) - start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) - start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) - start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + start_top_log_probs, start_top_index = torch.topk( + start_log_probs, self.start_n_top, dim=-1 + ) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) - hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( + start_states + ) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) - end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) - end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top) + end_top_log_probs, end_top_index = torch.topk( + end_log_probs, self.end_n_top, dim=1 + ) # shape (bsz, end_n_top, start_n_top) end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) - start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) # get the representation of START as weighted sum of hidden states - cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) # Shape (batch size,): one single `cls_logits` for each sample + start_states = torch.einsum( + "blh,bl->bh", hidden_states, start_log_probs + ) # get the representation of START as weighted sum of hidden states + cls_logits = self.answer_class( + hidden_states, start_states=start_states, cls_index=cls_index + ) # Shape (batch size,): one single `cls_logits` for each sample outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs diff --git a/transformers/optimization.py b/transformers/optimization.py index 99e6cc75e4..0cd57078ba 100644 --- a/transformers/optimization.py +++ b/transformers/optimization.py @@ -34,10 +34,11 @@ def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1 """ Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate increases linearly between 0 and 1. """ + def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1.0, num_warmup_steps)) - return 1. + return 1.0 return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) @@ -46,40 +47,47 @@ def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_st """ Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period. """ + def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) - return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) + return max( + 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) + ) return LambdaLR(optimizer, lr_lambda, last_epoch) -def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1): +def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=0.5, last_epoch=-1): """ Create a schedule with a learning rate that decreases following the values of the cosine function between 0 and `pi * cycles` after a warmup period during which it increases linearly between 0 and 1. """ + def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) - return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress))) + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) return LambdaLR(optimizer, lr_lambda, last_epoch) -def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1): +def get_cosine_with_hard_restarts_schedule_with_warmup( + optimizer, num_warmup_steps, num_training_steps, num_cycles=1.0, last_epoch=-1 +): """ Create a schedule with a learning rate that decreases following the values of the cosine function with several hard restarts, after a warmup period during which it increases linearly between 0 and 1. """ + def lr_lambda(current_step): if current_step < num_warmup_steps: return float(current_step) / float(max(1, num_warmup_steps)) progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) - if progress >= 1.: - return 0. - return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.)))) + if progress >= 1.0: + return 0.0 + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) return LambdaLR(optimizer, lr_lambda, last_epoch) @@ -94,17 +102,17 @@ class AdamW(Optimizer): weight_decay (float): Weight decay. Default: 0.0 correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. """ + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): if lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: + if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, - correct_bias=correct_bias) + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias) super(AdamW, self).__init__(params, defaults) def step(self, closure=None): @@ -119,38 +127,38 @@ class AdamW(Optimizer): loss = closure() for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") state = self.state[p] # State initialization if len(state) == 0: - state['step'] = 0 + state["step"] = 0 # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) + state["exp_avg"] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p.data) + state["exp_avg_sq"] = torch.zeros_like(p.data) - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] - state['step'] += 1 + state["step"] += 1 # Decay the first and second moment running average coefficient # In-place operations to update the averages at the same time exp_avg.mul_(beta1).add_(1.0 - beta1, grad) exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) - denom = exp_avg_sq.sqrt().add_(group['eps']) + denom = exp_avg_sq.sqrt().add_(group["eps"]) - step_size = group['lr'] - if group['correct_bias']: # No bias correction for Bert - bias_correction1 = 1.0 - beta1 ** state['step'] - bias_correction2 = 1.0 - beta2 ** state['step'] + step_size = group["lr"] + if group["correct_bias"]: # No bias correction for Bert + bias_correction1 = 1.0 - beta1 ** state["step"] + bias_correction2 = 1.0 - beta2 ** state["step"] step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 p.data.addcdiv_(-step_size, exp_avg, denom) @@ -163,7 +171,7 @@ class AdamW(Optimizer): # with the m/v parameters. This is equivalent to adding the square # of the weights to the loss with plain (non-momentum) SGD. # Add weight decay at the end (fixed version) - if group['weight_decay'] > 0.0: - p.data.add_(-group['lr'] * group['weight_decay'], p.data) + if group["weight_decay"] > 0.0: + p.data.add_(-group["lr"] * group["weight_decay"], p.data) return loss diff --git a/transformers/optimization_tf.py b/transformers/optimization_tf.py index c5fa248083..bdcbd323ce 100644 --- a/transformers/optimization_tf.py +++ b/transformers/optimization_tf.py @@ -24,70 +24,64 @@ import tensorflow as tf class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): - """Applys a warmup schedule on a given learning rate decay schedule.""" + """Applys a warmup schedule on a given learning rate decay schedule.""" - def __init__( - self, - initial_learning_rate, - decay_schedule_fn, - warmup_steps, - power=1.0, - name=None): - super(WarmUp, self).__init__() - self.initial_learning_rate = initial_learning_rate - self.warmup_steps = warmup_steps - self.power = power - self.decay_schedule_fn = decay_schedule_fn - self.name = name + def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None): + super(WarmUp, self).__init__() + self.initial_learning_rate = initial_learning_rate + self.warmup_steps = warmup_steps + self.power = power + self.decay_schedule_fn = decay_schedule_fn + self.name = name - def __call__(self, step): - with tf.name_scope(self.name or 'WarmUp') as name: - # Implements polynomial warmup. i.e., if global_step < warmup_steps, the - # learning rate will be `global_step/num_warmup_steps * init_lr`. - global_step_float = tf.cast(step, tf.float32) - warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) - warmup_percent_done = global_step_float / warmup_steps_float - warmup_learning_rate = ( - self.initial_learning_rate * - tf.math.pow(warmup_percent_done, self.power)) - return tf.cond(global_step_float < warmup_steps_float, - lambda: warmup_learning_rate, - lambda: self.decay_schedule_fn(step), - name=name) + def __call__(self, step): + with tf.name_scope(self.name or "WarmUp") as name: + # Implements polynomial warmup. i.e., if global_step < warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + global_step_float = tf.cast(step, tf.float32) + warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) + warmup_percent_done = global_step_float / warmup_steps_float + warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power) + return tf.cond( + global_step_float < warmup_steps_float, + lambda: warmup_learning_rate, + lambda: self.decay_schedule_fn(step), + name=name, + ) - def get_config(self): - return { - 'initial_learning_rate': self.initial_learning_rate, - 'decay_schedule_fn': self.decay_schedule_fn, - 'warmup_steps': self.warmup_steps, - 'power': self.power, - 'name': self.name - } + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_schedule_fn": self.decay_schedule_fn, + "warmup_steps": self.warmup_steps, + "power": self.power, + "name": self.name, + } def create_optimizer(init_lr, num_train_steps, num_warmup_steps): - """Creates an optimizer with learning rate schedule.""" - # Implements linear decay of the learning rate. - learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( - initial_learning_rate=init_lr, - decay_steps=num_train_steps, - end_learning_rate=0.0) - if num_warmup_steps: - learning_rate_fn = WarmUp(initial_learning_rate=init_lr, - decay_schedule_fn=learning_rate_fn, - warmup_steps=num_warmup_steps) - optimizer = AdamWeightDecay( - learning_rate=learning_rate_fn, - weight_decay_rate=0.01, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=['layer_norm', 'bias']) - return optimizer + """Creates an optimizer with learning rate schedule.""" + # Implements linear decay of the learning rate. + learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( + initial_learning_rate=init_lr, decay_steps=num_train_steps, end_learning_rate=0.0 + ) + if num_warmup_steps: + learning_rate_fn = WarmUp( + initial_learning_rate=init_lr, decay_schedule_fn=learning_rate_fn, warmup_steps=num_warmup_steps + ) + optimizer = AdamWeightDecay( + learning_rate=learning_rate_fn, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["layer_norm", "bias"], + ) + return optimizer class AdamWeightDecay(tf.keras.optimizers.Adam): - """Adam enables L2 weight decay and clip_by_global_norm on gradients. + """Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will @@ -98,99 +92,94 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): the loss with plain (non-momentum) SGD. """ - def __init__(self, - learning_rate=0.001, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-7, - amsgrad=False, - weight_decay_rate=0.0, - include_in_weight_decay=None, - exclude_from_weight_decay=None, - name='AdamWeightDecay', - **kwargs): - super(AdamWeightDecay, self).__init__( - learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) - self.weight_decay_rate = weight_decay_rate - self._include_in_weight_decay = include_in_weight_decay - self._exclude_from_weight_decay = exclude_from_weight_decay + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + amsgrad=False, + weight_decay_rate=0.0, + include_in_weight_decay=None, + exclude_from_weight_decay=None, + name="AdamWeightDecay", + **kwargs + ): + super(AdamWeightDecay, self).__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) + self.weight_decay_rate = weight_decay_rate + self._include_in_weight_decay = include_in_weight_decay + self._exclude_from_weight_decay = exclude_from_weight_decay - @classmethod - def from_config(cls, config): - """Creates an optimizer from its config with WarmUp custom object.""" - custom_objects = {'WarmUp': WarmUp} - return super(AdamWeightDecay, cls).from_config( - config, custom_objects=custom_objects) + @classmethod + def from_config(cls, config): + """Creates an optimizer from its config with WarmUp custom object.""" + custom_objects = {"WarmUp": WarmUp} + return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects) - def _prepare_local(self, var_device, var_dtype, apply_state): - super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, - apply_state) - apply_state['weight_decay_rate'] = tf.constant( - self.weight_decay_rate, name='adam_weight_decay_rate') + def _prepare_local(self, var_device, var_dtype, apply_state): + super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state) + apply_state["weight_decay_rate"] = tf.constant(self.weight_decay_rate, name="adam_weight_decay_rate") - def _decay_weights_op(self, var, learning_rate, apply_state): - do_decay = self._do_use_weight_decay(var.name) - if do_decay: - return var.assign_sub( - learning_rate * var * - apply_state['weight_decay_rate'], - use_locking=self._use_locking) - return tf.no_op() + def _decay_weights_op(self, var, learning_rate, apply_state): + do_decay = self._do_use_weight_decay(var.name) + if do_decay: + return var.assign_sub( + learning_rate * var * apply_state["weight_decay_rate"], use_locking=self._use_locking + ) + return tf.no_op() - def apply_gradients(self, grads_and_vars, clip_norm, name=None): - grads, tvars = list(zip(*grads_and_vars)) - (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) - return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars)) + def apply_gradients(self, grads_and_vars, clip_norm, name=None): + grads, tvars = list(zip(*grads_and_vars)) + (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) + return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars)) - def _get_lr(self, var_device, var_dtype, apply_state): - """Retrieves the learning rate with the given state.""" - if apply_state is None: - return self._decayed_lr_t[var_dtype], {} + def _get_lr(self, var_device, var_dtype, apply_state): + """Retrieves the learning rate with the given state.""" + if apply_state is None: + return self._decayed_lr_t[var_dtype], {} - apply_state = apply_state or {} - coefficients = apply_state.get((var_device, var_dtype)) - if coefficients is None: - coefficients = self._fallback_apply_state(var_device, var_dtype) - apply_state[(var_device, var_dtype)] = coefficients + apply_state = apply_state or {} + coefficients = apply_state.get((var_device, var_dtype)) + if coefficients is None: + coefficients = self._fallback_apply_state(var_device, var_dtype) + apply_state[(var_device, var_dtype)] = coefficients - return coefficients['lr_t'], dict(apply_state=apply_state) + return coefficients["lr_t"], dict(apply_state=apply_state) - def _resource_apply_dense(self, grad, var, apply_state=None): - lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) - decay = self._decay_weights_op(var, lr_t, apply_state) - with tf.control_dependencies([decay]): - return super(AdamWeightDecay, self)._resource_apply_dense( - grad, var, **kwargs) + def _resource_apply_dense(self, grad, var, apply_state=None): + lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs) - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) - decay = self._decay_weights_op(var, lr_t, apply_state) - with tf.control_dependencies([decay]): - return super(AdamWeightDecay, self)._resource_apply_sparse( - grad, var, indices, **kwargs) + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs) - def get_config(self): - config = super(AdamWeightDecay, self).get_config() - config.update({ - 'weight_decay_rate': self.weight_decay_rate, - }) - return config + def get_config(self): + config = super(AdamWeightDecay, self).get_config() + config.update( + {"weight_decay_rate": self.weight_decay_rate,} + ) + return config - def _do_use_weight_decay(self, param_name): - """Whether to use L2 weight decay for `param_name`.""" - if self.weight_decay_rate == 0: - return False + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if self.weight_decay_rate == 0: + return False - if self._include_in_weight_decay: - for r in self._include_in_weight_decay: - if re.search(r, param_name) is not None: - return True + if self._include_in_weight_decay: + for r in self._include_in_weight_decay: + if re.search(r, param_name) is not None: + return True - if self._exclude_from_weight_decay: - for r in self._exclude_from_weight_decay: - if re.search(r, param_name) is not None: - return False - return True + if self._exclude_from_weight_decay: + for r in self._exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True ## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py @@ -201,10 +190,8 @@ class GradientAccumulator(object): """Initializes the accumulator.""" self._gradients = [] self._accum_steps = tf.Variable( - initial_value=0, - dtype=tf.int64, - trainable=False, - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) + initial_value=0, dtype=tf.int64, trainable=False, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA + ) @property def step(self): @@ -214,12 +201,19 @@ class GradientAccumulator(object): @property def gradients(self): """The accumulated gradients.""" - return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients()) + return list( + gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients() + ) def __call__(self, gradients): """Accumulates :obj:`gradients`.""" if not self._gradients: - self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients]) + self._gradients.extend( + [ + tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient + for gradient in gradients + ] + ) if len(gradients) != len(self._gradients): raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients))) @@ -249,6 +243,9 @@ class GradientAccumulator(object): if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1: return self._gradients - return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients) + return ( + gradient.device_map.select_for_current_replica(gradient.values, replica_context) + for gradient in self._gradients + ) else: return self._gradients diff --git a/transformers/pipelines.py b/transformers/pipelines.py index f4bf3da685..4149c2e475 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -30,25 +30,42 @@ from typing import Union, Optional, Tuple, List, Dict import numpy as np -from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer, - PretrainedConfig, ModelCard, SquadExample, - squad_convert_examples_to_features, is_tf_available, - is_torch_available, BasicTokenizer, - ALL_PRETRAINED_CONFIG_ARCHIVE_MAP) +from transformers import ( + AutoConfig, + AutoTokenizer, + PreTrainedTokenizer, + PretrainedConfig, + ModelCard, + SquadExample, + squad_convert_examples_to_features, + is_tf_available, + is_torch_available, + BasicTokenizer, + ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, +) if is_tf_available(): import tensorflow as tf - from transformers import TFAutoModel, TFAutoModelForSequenceClassification, \ - TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification + from transformers import ( + TFAutoModel, + TFAutoModelForSequenceClassification, + TFAutoModelForQuestionAnswering, + TFAutoModelForTokenClassification, + ) if is_torch_available(): import torch - from transformers import AutoModel, AutoModelForSequenceClassification, \ - AutoModelForQuestionAnswering, AutoModelForTokenClassification + from transformers import ( + AutoModel, + AutoModelForSequenceClassification, + AutoModelForQuestionAnswering, + AutoModelForTokenClassification, + ) logger = logging.getLogger(__name__) + def get_framework(model=None): """ Select framework (TensorFlow/PyTorch) to use. If both frameworks are installed and no specific model is provided, defaults to using PyTorch. @@ -56,20 +73,24 @@ def get_framework(model=None): if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str): # Both framework are available but the use supplied a model class instance. # Try to guess which framework to use from the model classname - framework = 'tf' if model.__class__.__name__.startswith('TF') else 'pt' + framework = "tf" if model.__class__.__name__.startswith("TF") else "pt" elif not is_tf_available() and not is_torch_available(): - raise ImportError("At least one of TensorFlow 2.0 or PyTorch should be installed. " - "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " - "To install PyTorch, read the instructions at https://pytorch.org/.") + raise ImportError( + "At least one of TensorFlow 2.0 or PyTorch should be installed. " + "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " + "To install PyTorch, read the instructions at https://pytorch.org/." + ) else: # framework = 'tf' if is_tf_available() else 'pt' - framework = 'pt' if is_torch_available() else 'tf' + framework = "pt" if is_torch_available() else "tf" return framework + class ArgumentHandler(ABC): """ Base interface for handling varargs for each Pipeline """ + @abstractmethod def __call__(self, *args, **kwargs): raise NotImplementedError() @@ -79,11 +100,12 @@ class DefaultArgumentHandler(ArgumentHandler): """ Default varargs argument parser handling parameters for each Pipeline """ + def __call__(self, *args, **kwargs): - if 'X' in kwargs: - return kwargs['X'] - elif 'data' in kwargs: - return kwargs['data'] + if "X" in kwargs: + return kwargs["X"] + elif "data" in kwargs: + return kwargs["data"] elif len(args) == 1: if isinstance(args[0], list): return args[0] @@ -91,7 +113,7 @@ class DefaultArgumentHandler(ArgumentHandler): return [args[0]] elif len(args) > 1: return list(args) - raise ValueError('Unable to infer the format of the provided data (X=, data=, ...)') + raise ValueError("Unable to infer the format of the provided data (X=, data=, ...)") class PipelineDataFormat: @@ -105,24 +127,25 @@ class PipelineDataFormat: PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format. """ - SUPPORTED_FORMATS = ['json', 'csv', 'pipe'] + + SUPPORTED_FORMATS = ["json", "csv", "pipe"] def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False): self.output_path = output_path self.input_path = input_path - self.column = column.split(',') if column is not None else [''] + self.column = column.split(",") if column is not None else [""] self.is_multi_columns = len(self.column) > 1 if self.is_multi_columns: - self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column] + self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column] if output_path is not None and not overwrite: if exists(abspath(self.output_path)): - raise OSError('{} already exists on disk'.format(self.output_path)) + raise OSError("{} already exists on disk".format(self.output_path)) if input_path is not None: if not exists(abspath(self.input_path)): - raise OSError('{} doesnt exist on disk'.format(self.input_path)) + raise OSError("{} doesnt exist on disk".format(self.input_path)) @abstractmethod def __iter__(self): @@ -144,23 +167,25 @@ class PipelineDataFormat: :return: (str) Path where the data has been saved """ path, _ = os.path.splitext(self.output_path) - binary_path = os.path.extsep.join((path, 'pickle')) + binary_path = os.path.extsep.join((path, "pickle")) - with open(binary_path, 'wb+') as f_output: + with open(binary_path, "wb+") as f_output: pickle.dump(data, f_output) return binary_path @staticmethod - def from_str(format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False): - if format == 'json': + def from_str( + format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False + ): + if format == "json": return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) - elif format == 'csv': + elif format == "csv": return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) - elif format == 'pipe': + elif format == "pipe": return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) else: - raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(format)) + raise KeyError("Unknown reader {} (Available reader are json/csv/pipe)".format(format)) class CsvPipelineDataFormat(PipelineDataFormat): @@ -168,7 +193,7 @@ class CsvPipelineDataFormat(PipelineDataFormat): super().__init__(output_path, input_path, column, overwrite=overwrite) def __iter__(self): - with open(self.input_path, 'r') as f: + with open(self.input_path, "r") as f: reader = csv.DictReader(f) for row in reader: if self.is_multi_columns: @@ -177,7 +202,7 @@ class CsvPipelineDataFormat(PipelineDataFormat): yield row[self.column[0]] def save(self, data: List[dict]): - with open(self.output_path, 'w') as f: + with open(self.output_path, "w") as f: if len(data) > 0: writer = csv.DictWriter(f, list(data[0].keys())) writer.writeheader() @@ -188,7 +213,7 @@ class JsonPipelineDataFormat(PipelineDataFormat): def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False): super().__init__(output_path, input_path, column, overwrite=overwrite) - with open(input_path, 'r') as f: + with open(input_path, "r") as f: self._entries = json.load(f) def __iter__(self): @@ -199,7 +224,7 @@ class JsonPipelineDataFormat(PipelineDataFormat): yield entry[self.column[0]] def save(self, data: dict): - with open(self.output_path, 'w') as f: + with open(self.output_path, "w") as f: json.dump(data, f) @@ -210,12 +235,13 @@ class PipedPipelineDataFormat(PipelineDataFormat): If columns are provided, then the output will be a dictionary with {column_x: value_x} """ + def __iter__(self): for line in sys.stdin: # Split for multi-columns - if '\t' in line: + if "\t" in line: - line = line.split('\t') + line = line.split("\t") if self.column: # Dictionary to map arguments yield {kwargs: l for (kwargs, _), l in zip(self.column, line)} @@ -232,8 +258,8 @@ class PipedPipelineDataFormat(PipelineDataFormat): def save_binary(self, data: Union[dict, List[dict]]) -> str: if self.output_path is None: raise KeyError( - 'When using piped input on pipeline outputting large object requires an output file path. ' - 'Please provide such output path through --output argument.' + "When using piped input on pipeline outputting large object requires an output file path. " + "Please provide such output path through --output argument." ) return super().save_binary(data) @@ -298,10 +324,16 @@ class Pipeline(_ScikitCompat): default_input_names = None - def __init__(self, model, tokenizer: PreTrainedTokenizer = None, - modelcard: ModelCard = None, framework: Optional[str] = None, - args_parser: ArgumentHandler = None, device: int = -1, - binary_output: bool = False): + def __init__( + self, + model, + tokenizer: PreTrainedTokenizer = None, + modelcard: ModelCard = None, + framework: Optional[str] = None, + args_parser: ArgumentHandler = None, + device: int = -1, + binary_output: bool = False, + ): if framework is None: framework = get_framework() @@ -315,8 +347,8 @@ class Pipeline(_ScikitCompat): self._args_parser = args_parser or DefaultArgumentHandler() # Special handling - if self.device >= 0 and self.framework == 'pt': - self.model = self.model.to('cuda:{}'.format(self.device)) + if self.device >= 0 and self.framework == "pt": + self.model = self.model.to("cuda:{}".format(self.device)) def save_pretrained(self, save_directory): """ @@ -356,8 +388,8 @@ class Pipeline(_ScikitCompat): Returns: Context manager """ - if self.framework == 'tf': - with tf.device('/CPU:0' if self.device == -1 else '/device:GPU:{}'.format(self.device)): + if self.framework == "tf": + with tf.device("/CPU:0" if self.device == -1 else "/device:GPU:{}".format(self.device)): yield else: if self.device >= 0: @@ -372,11 +404,11 @@ class Pipeline(_ScikitCompat): Returns: dict holding all the required parameters for model's forward """ - args = ['input_ids', 'attention_mask'] + args = ["input_ids", "attention_mask"] model_type = type(self.model).__name__.lower() - if 'distilbert' not in model_type and 'xlm' not in model_type: - args += ['token_type_ids'] + if "distilbert" not in model_type and "xlm" not in model_type: + args += ["token_type_ids"] # PR #1548 (CLI) There is an issue with attention_mask # if 'xlnet' in model_type or 'xlm' in model_type: @@ -394,9 +426,7 @@ class Pipeline(_ScikitCompat): # Encode for forward with self.device_placement(): inputs = self.tokenizer.batch_encode_plus( - inputs, add_special_tokens=True, - return_tensors=self.framework, - max_length=self.tokenizer.max_len + inputs, add_special_tokens=True, return_tensors=self.framework, max_length=self.tokenizer.max_len ) # Filter out features not available on specific models @@ -411,7 +441,7 @@ class Pipeline(_ScikitCompat): Returns: Numpy array """ - if self.framework == 'tf': + if self.framework == "tf": # TODO trace model predictions = self.model(inputs, training=False)[0] else: @@ -426,19 +456,24 @@ class FeatureExtractionPipeline(Pipeline): Feature extraction pipeline using Model head. """ - def __init__(self, model, - tokenizer: PreTrainedTokenizer = None, - modelcard: ModelCard = None, - framework: Optional[str] = None, - args_parser: ArgumentHandler = None, - device: int = -1): - super().__init__(model=model, - tokenizer=tokenizer, - modelcard=modelcard, - framework=framework, - args_parser=args_parser, - device=device, - binary_output=True) + def __init__( + self, + model, + tokenizer: PreTrainedTokenizer = None, + modelcard: ModelCard = None, + framework: Optional[str] = None, + args_parser: ArgumentHandler = None, + device: int = -1, + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=args_parser, + device=device, + binary_output=True, + ) def __call__(self, *args, **kwargs): return super().__call__(*args, **kwargs).tolist() @@ -452,7 +487,7 @@ class TextClassificationPipeline(Pipeline): def __call__(self, *args, **kwargs): outputs = super().__call__(*args, **kwargs) scores = np.exp(outputs) / np.exp(outputs).sum(-1) - return [{'label': self.model.config.id2label[item.argmax()], 'score': item.max()} for item in scores] + return [{"label": self.model.config.id2label[item.argmax()], "score": item.max()} for item in scores] class NerPipeline(Pipeline): @@ -460,19 +495,28 @@ class NerPipeline(Pipeline): Named Entity Recognition pipeline using ModelForTokenClassification head. """ - default_input_names = 'sequences' + default_input_names = "sequences" - def __init__(self, model, tokenizer: PreTrainedTokenizer = None, - modelcard: ModelCard = None, framework: Optional[str] = None, - args_parser: ArgumentHandler = None, device: int = -1, - binary_output: bool = False, ignore_labels=['O']): - super().__init__(model=model, - tokenizer=tokenizer, - modelcard=modelcard, - framework=framework, - args_parser=args_parser, - device=device, - binary_output=binary_output) + def __init__( + self, + model, + tokenizer: PreTrainedTokenizer = None, + modelcard: ModelCard = None, + framework: Optional[str] = None, + args_parser: ArgumentHandler = None, + device: int = -1, + binary_output: bool = False, + ignore_labels=["O"], + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=args_parser, + device=device, + binary_output=binary_output, + ) self._basic_tokenizer = BasicTokenizer(do_lower_case=False) self.ignore_labels = ignore_labels @@ -485,19 +529,20 @@ class NerPipeline(Pipeline): with self.device_placement(): tokens = self.tokenizer.encode_plus( - sentence, return_attention_mask=False, + sentence, + return_attention_mask=False, return_tensors=self.framework, - max_length=self.tokenizer.max_len + max_length=self.tokenizer.max_len, ) # Forward - if self.framework == 'tf': + if self.framework == "tf": entities = self.model(tokens)[0][0].numpy() - input_ids = tokens['input_ids'].numpy()[0] + input_ids = tokens["input_ids"].numpy()[0] else: with torch.no_grad(): entities = self.model(**tokens)[0][0].cpu().numpy() - input_ids = tokens['input_ids'].cpu().numpy()[0] + input_ids = tokens["input_ids"].cpu().numpy()[0] score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True) labels_idx = score.argmax(axis=-1) @@ -505,11 +550,13 @@ class NerPipeline(Pipeline): answer = [] for idx, label_idx in enumerate(labels_idx): if self.model.config.id2label[label_idx] not in self.ignore_labels: - answer += [{ - 'word': self.tokenizer.decode([int(input_ids[idx])]), - 'score': score[idx][label_idx].item(), - 'entity': self.model.config.id2label[label_idx] - }] + answer += [ + { + "word": self.tokenizer.decode([int(input_ids[idx])]), + "score": score[idx][label_idx].item(), + "entity": self.model.config.id2label[label_idx], + } + ] # Append answers += [answer] @@ -526,18 +573,19 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler): QuestionAnsweringArgumentHandler manages all the possible to create SquadExample from the command-line supplied arguments. """ + def __call__(self, *args, **kwargs): # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating if args is not None and len(args) > 0: if len(args) == 1: - kwargs['X'] = args[0] + kwargs["X"] = args[0] else: - kwargs['X'] = list(args) + kwargs["X"] = list(args) # Generic compatibility with sklearn and Keras # Batched data - if 'X' in kwargs or 'data' in kwargs: - inputs = kwargs['X'] if 'X' in kwargs else kwargs['data'] + if "X" in kwargs or "data" in kwargs: + inputs = kwargs["X"] if "X" in kwargs else kwargs["data"] if isinstance(inputs, dict): inputs = [inputs] @@ -547,28 +595,31 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler): for i, item in enumerate(inputs): if isinstance(item, dict): - if any(k not in item for k in ['question', 'context']): - raise KeyError('You need to provide a dictionary with keys {question:..., context:...}') + if any(k not in item for k in ["question", "context"]): + raise KeyError("You need to provide a dictionary with keys {question:..., context:...}") inputs[i] = QuestionAnsweringPipeline.create_sample(**item) elif not isinstance(item, SquadExample): raise ValueError( - '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)' - .format('X' if 'X' in kwargs else 'data') + "{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)".format( + "X" if "X" in kwargs else "data" + ) ) # Tabular input - elif 'question' in kwargs and 'context' in kwargs: - if isinstance(kwargs['question'], str): - kwargs['question'] = [kwargs['question']] + elif "question" in kwargs and "context" in kwargs: + if isinstance(kwargs["question"], str): + kwargs["question"] = [kwargs["question"]] - if isinstance(kwargs['context'], str): - kwargs['context'] = [kwargs['context']] + if isinstance(kwargs["context"], str): + kwargs["context"] = [kwargs["context"]] - inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])] + inputs = [ + QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs["question"], kwargs["context"]) + ] else: - raise ValueError('Unknown arguments {}'.format(kwargs)) + raise ValueError("Unknown arguments {}".format(kwargs)) if not isinstance(inputs, list): inputs = [inputs] @@ -581,22 +632,31 @@ class QuestionAnsweringPipeline(Pipeline): Question Answering pipeline using ModelForQuestionAnswering head. """ - default_input_names = 'question,context' + default_input_names = "question,context" - def __init__(self, model, - tokenizer: Optional[PreTrainedTokenizer], - modelcard: Optional[ModelCard], - framework: Optional[str] = None, - device: int = -1, **kwargs): - super().__init__(model=model, - tokenizer=tokenizer, - modelcard=modelcard, - framework=framework, - args_parser=QuestionAnsweringArgumentHandler(), - device=device, **kwargs) + def __init__( + self, + model, + tokenizer: Optional[PreTrainedTokenizer], + modelcard: Optional[ModelCard], + framework: Optional[str] = None, + device: int = -1, + **kwargs + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=QuestionAnsweringArgumentHandler(), + device=device, + **kwargs + ) @staticmethod - def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: + def create_sample( + question: Union[str, List[str]], context: Union[str, List[str]] + ) -> Union[SquadExample, List[SquadExample]]: """ QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally. This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s). @@ -629,26 +689,28 @@ class QuestionAnsweringPipeline(Pipeline): end: the character index in the original string corresponding to the ending of the answer' span """ # Set defaults values - kwargs.setdefault('topk', 1) - kwargs.setdefault('doc_stride', 128) - kwargs.setdefault('max_answer_len', 15) - kwargs.setdefault('max_seq_len', 384) - kwargs.setdefault('max_question_len', 64) + kwargs.setdefault("topk", 1) + kwargs.setdefault("doc_stride", 128) + kwargs.setdefault("max_answer_len", 15) + kwargs.setdefault("max_seq_len", 384) + kwargs.setdefault("max_question_len", 64) - if kwargs['topk'] < 1: - raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk'])) + if kwargs["topk"] < 1: + raise ValueError("topk parameter should be >= 1 (got {})".format(kwargs["topk"])) - if kwargs['max_answer_len'] < 1: - raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len'])) + if kwargs["max_answer_len"] < 1: + raise ValueError("max_answer_len parameter should be >= 1 (got {})".format(kwargs["max_answer_len"])) # Convert inputs to features examples = self._args_parser(*texts, **kwargs) - features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) + features = squad_convert_examples_to_features( + examples, self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False + ) fw_args = self.inputs_for_model([f.__dict__ for f in features]) # Manage tensor allocation on correct device with self.device_placement(): - if self.framework == 'tf': + if self.framework == "tf": fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} start, end = self.model(fw_args) start, end = start.numpy(), end.numpy() @@ -672,16 +734,18 @@ class QuestionAnsweringPipeline(Pipeline): # Mask CLS start_[0] = end_[0] = 0 - starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len']) + starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"]) char_to_word = np.array(example.char_to_word_offset) # Convert the answer (tokens) back to the original text answers += [ { - 'score': score.item(), - 'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), - 'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), - 'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]:feature.token_to_orig_map[e] + 1]) + "score": score.item(), + "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), + "answer": " ".join( + example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1] + ), } for s, e, score in zip(starts, ends, scores) ] @@ -767,71 +831,71 @@ class QuestionAnsweringPipeline(Pipeline): chars_idx += len(word) + 1 # Join text with spaces - return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)} + return {"answer": " ".join(words), "start": max(0, char_start_idx), "end": min(len(text), char_end_idx)} # Register all the supported task here SUPPORTED_TASKS = { - 'feature-extraction': { - 'impl': FeatureExtractionPipeline, - 'tf': TFAutoModel if is_tf_available() else None, - 'pt': AutoModel if is_torch_available() else None, - 'default': { - 'model': { - 'pt': 'distilbert-base-uncased', - 'tf': 'distilbert-base-uncased', - }, - 'config': None, - 'tokenizer': 'distilbert-base-uncased' - } + "feature-extraction": { + "impl": FeatureExtractionPipeline, + "tf": TFAutoModel if is_tf_available() else None, + "pt": AutoModel if is_torch_available() else None, + "default": { + "model": {"pt": "distilbert-base-uncased", "tf": "distilbert-base-uncased",}, + "config": None, + "tokenizer": "distilbert-base-uncased", + }, }, - 'sentiment-analysis': { - 'impl': TextClassificationPipeline, - 'tf': TFAutoModelForSequenceClassification if is_tf_available() else None, - 'pt': AutoModelForSequenceClassification if is_torch_available() else None, - 'default': { - 'model': { - 'pt': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin', - 'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5', + "sentiment-analysis": { + "impl": TextClassificationPipeline, + "tf": TFAutoModelForSequenceClassification if is_tf_available() else None, + "pt": AutoModelForSequenceClassification if is_torch_available() else None, + "default": { + "model": { + "pt": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin", + "tf": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5", }, - 'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json', - 'tokenizer': 'distilbert-base-uncased' - } + "config": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", + "tokenizer": "distilbert-base-uncased", + }, }, - 'ner': { - 'impl': NerPipeline, - 'tf': TFAutoModelForTokenClassification if is_tf_available() else None, - 'pt': AutoModelForTokenClassification if is_torch_available() else None, - 'default': { - 'model': { - 'pt':'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin', - 'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5', + "ner": { + "impl": NerPipeline, + "tf": TFAutoModelForTokenClassification if is_tf_available() else None, + "pt": AutoModelForTokenClassification if is_torch_available() else None, + "default": { + "model": { + "pt": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin", + "tf": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5", }, - 'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json', - 'tokenizer': 'bert-large-cased' - } + "config": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json", + "tokenizer": "bert-large-cased", + }, }, - 'question-answering': { - 'impl': QuestionAnsweringPipeline, - 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, - 'pt': AutoModelForQuestionAnswering if is_torch_available() else None, - 'default': { - 'model': { - 'pt': 'distilbert-base-uncased-distilled-squad', - 'tf': 'distilbert-base-uncased-distilled-squad', + "question-answering": { + "impl": QuestionAnsweringPipeline, + "tf": TFAutoModelForQuestionAnswering if is_tf_available() else None, + "pt": AutoModelForQuestionAnswering if is_torch_available() else None, + "default": { + "model": { + "pt": "distilbert-base-uncased-distilled-squad", + "tf": "distilbert-base-uncased-distilled-squad", }, - 'config': None, - 'tokenizer': 'distilbert-base-uncased' - } - } + "config": None, + "tokenizer": "distilbert-base-uncased", + }, + }, } -def pipeline(task: str, model: Optional = None, - config: Optional[Union[str, PretrainedConfig]] = None, - tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, - modelcard: Optional[Union[str, ModelCard]] = None, - **kwargs) -> Pipeline: +def pipeline( + task: str, + model: Optional = None, + config: Optional[Union[str, PretrainedConfig]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + modelcard: Optional[Union[str, ModelCard]] = None, + **kwargs +) -> Pipeline: """ Utility factory method to build a pipeline. Pipeline are made of: @@ -852,11 +916,11 @@ def pipeline(task: str, model: Optional = None, framework = get_framework(model) targeted_task = SUPPORTED_TASKS[task] - task, model_class = targeted_task['impl'], targeted_task[framework] + task, model_class = targeted_task["impl"], targeted_task[framework] # Use default model/config/tokenizer for the task if no model is provided if model is None: - models, config, tokenizer = tuple(targeted_task['default'].values()) + models, config, tokenizer = tuple(targeted_task["default"].values()) model = models[framework] # Try to infer tokenizer from model or config name (if provided as str) @@ -867,8 +931,10 @@ def pipeline(task: str, model: Optional = None, tokenizer = config else: # Impossible to guest what is the right tokenizer here - raise Exception("Impossible to guess which tokenizer to use. " - "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer.") + raise Exception( + "Impossible to guess which tokenizer to use. " + "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer." + ) # Try to infer modelcard from model or config name (if provided as str) if modelcard is None: @@ -894,14 +960,18 @@ def pipeline(task: str, model: Optional = None, if isinstance(model, str): # Handle transparent TF/PT model conversion model_kwargs = {} - if framework == 'pt' and model.endswith('.h5'): - model_kwargs['from_tf'] = True - logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. ' - 'Trying to load the model with PyTorch.') - elif framework == 'tf' and model.endswith('.bin'): - model_kwargs['from_pt'] = True - logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. ' - 'Trying to load the model with Tensorflow.') + if framework == "pt" and model.endswith(".h5"): + model_kwargs["from_tf"] = True + logger.warning( + "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. " + "Trying to load the model with PyTorch." + ) + elif framework == "tf" and model.endswith(".bin"): + model_kwargs["from_pt"] = True + logger.warning( + "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. " + "Trying to load the model with Tensorflow." + ) model = model_class.from_pretrained(model, config=config, **model_kwargs) return task(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs) diff --git a/transformers/tests/configuration_common_test.py b/transformers/tests/configuration_common_test.py index 376d110d3c..d109a655f8 100644 --- a/transformers/tests/configuration_common_test.py +++ b/transformers/tests/configuration_common_test.py @@ -32,10 +32,10 @@ class ConfigTester(object): def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, 'vocab_size')) - self.parent.assertTrue(hasattr(config, 'hidden_size')) - self.parent.assertTrue(hasattr(config, 'num_attention_heads')) - self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) + self.parent.assertTrue(hasattr(config, "vocab_size")) + self.parent.assertTrue(hasattr(config, "hidden_size")) + self.parent.assertTrue(hasattr(config, "num_attention_heads")) + self.parent.assertTrue(hasattr(config, "num_hidden_layers")) def create_and_test_config_to_json_string(self): config = self.config_class(**self.inputs_dict) @@ -68,5 +68,6 @@ class ConfigTester(object): self.create_and_test_config_to_json_file() self.create_and_test_config_from_and_save_pretrained() + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/transformers/tests/hf_api_test.py b/transformers/tests/hf_api_test.py index b45f5aceed..71963df107 100644 --- a/transformers/tests/hf_api_test.py +++ b/transformers/tests/hf_api_test.py @@ -28,20 +28,15 @@ PASS = "__DUMMY_TRANSFORMERS_PASS__" FILES = [ ( "Test-{}.txt".format(int(time.time())), - os.path.join( - os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt" - ) + os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"), ), ( - "yoyo {}.txt".format(int(time.time())), # space is intentional - os.path.join( - os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt" - ) + "yoyo {}.txt".format(int(time.time())), # space is intentional + os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"), ), ] - class HfApiCommonTest(unittest.TestCase): _api = HfApi(endpoint="https://moon-staging.huggingface.co") @@ -76,11 +71,9 @@ class HfApiEndpointsTest(HfApiCommonTest): def test_presign_and_upload(self): for FILE_KEY, FILE_PATH in FILES: - access_url = self._api.presign_and_upload( - token=self._token, filename=FILE_KEY, filepath=FILE_PATH - ) + access_url = self._api.presign_and_upload(token=self._token, filename=FILE_KEY, filepath=FILE_PATH) self.assertIsInstance(access_url, six.string_types) - with open(FILE_PATH, 'r') as f: + with open(FILE_PATH, "r") as f: body = f.read() r = requests.get(access_url) self.assertEqual(r.text, body) @@ -93,7 +86,6 @@ class HfApiEndpointsTest(HfApiCommonTest): self.assertIsInstance(o, S3Obj) - class HfFolderTest(unittest.TestCase): def test_token_workflow(self): """ @@ -102,18 +94,12 @@ class HfFolderTest(unittest.TestCase): """ token = "token-{}".format(int(time.time())) HfFolder.save_token(token) - self.assertEqual( - HfFolder.get_token(), - token - ) + self.assertEqual(HfFolder.get_token(), token) HfFolder.delete_token() HfFolder.delete_token() # ^^ not an error, we test that the # second call does not fail. - self.assertEqual( - HfFolder.get_token(), - None - ) + self.assertEqual(HfFolder.get_token(), None) if __name__ == "__main__": diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py index b293b5726a..30fe33a904 100644 --- a/transformers/tests/model_card_test.py +++ b/transformers/tests/model_card_test.py @@ -21,44 +21,39 @@ import unittest from transformers.modelcard import ModelCard from .tokenization_tests_commons import TemporaryDirectory -class ModelCardTester(unittest.TestCase): +class ModelCardTester(unittest.TestCase): def setUp(self): - self.inputs_dict = {'model_details': { - 'Organization': 'testing', - 'Model date': 'today', - 'Model version': 'v2.1, Developed by Test Corp in 2019.', - 'Architecture': 'Convolutional Neural Network.', - }, - 'metrics': 'BLEU and ROUGE-1', - 'evaluation_data':{ - 'Datasets':{ - 'BLEU': 'My-great-dataset-v1', - 'ROUGE-1': 'My-short-dataset-v2.1', - }, - 'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf' - }, - 'training_data':{ - 'Dataset': 'English Wikipedia dump dated 2018-12-01', - 'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf' - }, - 'quantitative_analyses': { - 'BLEU': 55.1, - 'ROUGE-1': 76, - }, - } + self.inputs_dict = { + "model_details": { + "Organization": "testing", + "Model date": "today", + "Model version": "v2.1, Developed by Test Corp in 2019.", + "Architecture": "Convolutional Neural Network.", + }, + "metrics": "BLEU and ROUGE-1", + "evaluation_data": { + "Datasets": {"BLEU": "My-great-dataset-v1", "ROUGE-1": "My-short-dataset-v2.1",}, + "Preprocessing": "See details on https://arxiv.org/pdf/1810.03993.pdf", + }, + "training_data": { + "Dataset": "English Wikipedia dump dated 2018-12-01", + "Preprocessing": "Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf", + }, + "quantitative_analyses": {"BLEU": 55.1, "ROUGE-1": 76,}, + } def test_model_card_common_properties(self): modelcard = ModelCard.from_dict(self.inputs_dict) - self.assertTrue(hasattr(modelcard, 'model_details')) - self.assertTrue(hasattr(modelcard, 'intended_use')) - self.assertTrue(hasattr(modelcard, 'factors')) - self.assertTrue(hasattr(modelcard, 'metrics')) - self.assertTrue(hasattr(modelcard, 'evaluation_data')) - self.assertTrue(hasattr(modelcard, 'training_data')) - self.assertTrue(hasattr(modelcard, 'quantitative_analyses')) - self.assertTrue(hasattr(modelcard, 'ethical_considerations')) - self.assertTrue(hasattr(modelcard, 'caveats_and_recommendations')) + self.assertTrue(hasattr(modelcard, "model_details")) + self.assertTrue(hasattr(modelcard, "intended_use")) + self.assertTrue(hasattr(modelcard, "factors")) + self.assertTrue(hasattr(modelcard, "metrics")) + self.assertTrue(hasattr(modelcard, "evaluation_data")) + self.assertTrue(hasattr(modelcard, "training_data")) + self.assertTrue(hasattr(modelcard, "quantitative_analyses")) + self.assertTrue(hasattr(modelcard, "ethical_considerations")) + self.assertTrue(hasattr(modelcard, "caveats_and_recommendations")) def test_model_card_to_json_string(self): modelcard = ModelCard.from_dict(self.inputs_dict) @@ -70,7 +65,7 @@ class ModelCardTester(unittest.TestCase): model_card_first = ModelCard.from_dict(self.inputs_dict) with TemporaryDirectory() as tmpdirname: - filename = os.path.join(tmpdirname, u"modelcard.json") + filename = os.path.join(tmpdirname, "modelcard.json") model_card_first.to_json_file(filename) model_card_second = ModelCard.from_json_file(filename) @@ -85,5 +80,6 @@ class ModelCardTester(unittest.TestCase): self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict()) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_albert_test.py b/transformers/tests/modeling_albert_test.py index b726fd9278..f798af95bc 100644 --- a/transformers/tests/modeling_albert_test.py +++ b/transformers/tests/modeling_albert_test.py @@ -20,14 +20,18 @@ import unittest from transformers import is_torch_available -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): - from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM, - AlbertForSequenceClassification, AlbertForQuestionAnswering, - ) + from transformers import ( + AlbertConfig, + AlbertModel, + AlbertForMaskedLM, + AlbertForSequenceClassification, + AlbertForQuestionAnswering, + ) from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -37,33 +41,33 @@ class AlbertModelTest(CommonTestCases.CommonModelTester): all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else () class AlbertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - embedding_size=16, - hidden_size=36, - num_hidden_layers=6, - num_hidden_groups=6, - num_attention_heads=6, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + embedding_size=16, + hidden_size=36, + num_hidden_layers=6, + num_hidden_groups=6, + num_attention_heads=6, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -120,16 +124,17 @@ class AlbertModelTest(CommonTestCases.CommonModelTester): max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, - num_hidden_groups=self.num_hidden_groups) + num_hidden_groups=self.num_hidden_groups, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = AlbertModel(config=config) model.to(torch_device) model.eval() @@ -142,66 +147,79 @@ class AlbertModelTest(CommonTestCases.CommonModelTester): "pooled_output": pooled_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - - def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = AlbertForMaskedLM(config=config) model.to(torch_device) model.eval() - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels) + loss, prediction_scores = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels + ) result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = AlbertForQuestionAnswering(config=config) model.to(torch_device) model.eval() - loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - start_positions=sequence_labels, end_positions=sequence_labels) + loss, start_logits, end_logits = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) result = { "loss": loss, "start_logits": start_logits, "end_logits": end_logits, } - self.parent.assertListEqual( - list(result["start_logits"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].size()), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) - - def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = AlbertForSequenceClassification(config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels + ) result = { "loss": loss, "logits": logits, } - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels]) self.check_loss_output(result) - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -233,5 +251,6 @@ class AlbertModelTest(CommonTestCases.CommonModelTester): model = AlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py index 871a262fe8..3bdaa8a378 100644 --- a/transformers/tests/modeling_auto_test.py +++ b/transformers/tests/modeling_auto_test.py @@ -25,14 +25,21 @@ from transformers import is_torch_available from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER if is_torch_available(): - from transformers import (AutoConfig, BertConfig, - AutoModel, BertModel, - AutoModelWithLMHead, BertForMaskedLM, - AutoModelForSequenceClassification, BertForSequenceClassification, - AutoModelForQuestionAnswering, BertForQuestionAnswering) + from transformers import ( + AutoConfig, + BertConfig, + AutoModel, + BertModel, + AutoModelWithLMHead, + BertForMaskedLM, + AutoModelForSequenceClassification, + BertForSequenceClassification, + AutoModelForQuestionAnswering, + BertForQuestionAnswering, + ) from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP - from .modeling_common_test import (CommonTestCases, ids_tensor) + from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester @@ -75,7 +82,9 @@ class AutoModelTest(unittest.TestCase): self.assertIsInstance(config, BertConfig) model = AutoModelForSequenceClassification.from_pretrained(model_name) - model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) + model, loading_info = AutoModelForSequenceClassification.from_pretrained( + model_name, output_loading_info=True + ) self.assertIsNotNone(model) self.assertIsInstance(model, BertForSequenceClassification) diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py index a5adff8f68..6711aded69 100644 --- a/transformers/tests/modeling_bert_test.py +++ b/transformers/tests/modeling_bert_test.py @@ -20,51 +20,68 @@ import unittest from transformers import is_torch_available -from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor, floats_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): - from transformers import (BertConfig, BertModel, BertForMaskedLM, - BertForNextSentencePrediction, BertForPreTraining, - BertForQuestionAnswering, BertForSequenceClassification, - BertForTokenClassification, BertForMultipleChoice) + from transformers import ( + BertConfig, + BertModel, + BertForMaskedLM, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + BertForMultipleChoice, + ) from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP @require_torch class BertModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction, - BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, - BertForTokenClassification) if is_torch_available() else () + all_model_classes = ( + ( + BertModel, + BertForMaskedLM, + BertForNextSentencePrediction, + BertForPreTraining, + BertForQuestionAnswering, + BertForSequenceClassification, + BertForTokenClassification, + ) + if is_torch_available() + else () + ) class BertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -119,25 +136,44 @@ class BertModelTest(CommonTestCases.CommonModelTester): max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, is_decoder=False, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def prepare_config_and_inputs_for_decoder(self): - config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() config.is_decoder = True encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = BertModel(config=config) model.to(torch_device) model.eval() @@ -150,16 +186,38 @@ class BertModelTest(CommonTestCases.CommonModelTester): "pooled_output": pooled_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask): + def create_and_check_bert_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): model = BertModel(config) model.to(torch_device) model.eval() - sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask) - sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states) + sequence_output, pooled_output = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + sequence_output, pooled_output = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) result = { @@ -167,122 +225,171 @@ class BertModelTest(CommonTestCases.CommonModelTester): "pooled_output": pooled_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = BertForMaskedLM(config=config) model.to(torch_device) model.eval() - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels) + loss, prediction_scores = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels + ) result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask): + def create_and_check_bert_model_for_masked_lm_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): model = BertForMaskedLM(config=config) model.to(torch_device) model.eval() - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask) - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states) + loss, prediction_scores = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + masked_lm_labels=token_labels, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + loss, prediction_scores = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + masked_lm_labels=token_labels, + encoder_hidden_states=encoder_hidden_states, + ) result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = BertForNextSentencePrediction(config=config) model.to(torch_device) model.eval() - loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels) + loss, seq_relationship_score = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + next_sentence_label=sequence_labels, + ) result = { "loss": loss, "seq_relationship_score": seq_relationship_score, } - self.parent.assertListEqual( - list(result["seq_relationship_score"].size()), - [self.batch_size, 2]) + self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2]) self.check_loss_output(result) - def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = BertForPreTraining(config=config) model.to(torch_device) model.eval() - loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - masked_lm_labels=token_labels, next_sentence_label=sequence_labels) + loss, prediction_scores, seq_relationship_score = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + masked_lm_labels=token_labels, + next_sentence_label=sequence_labels, + ) result = { "loss": loss, "prediction_scores": prediction_scores, "seq_relationship_score": seq_relationship_score, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) - self.parent.assertListEqual( - list(result["seq_relationship_score"].size()), - [self.batch_size, 2]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) + self.parent.assertListEqual(list(result["seq_relationship_score"].size()), [self.batch_size, 2]) self.check_loss_output(result) - def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = BertForQuestionAnswering(config=config) model.to(torch_device) model.eval() - loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - start_positions=sequence_labels, end_positions=sequence_labels) + loss, start_logits, end_logits = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) result = { "loss": loss, "start_logits": start_logits, "end_logits": end_logits, } - self.parent.assertListEqual( - list(result["start_logits"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].size()), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) - def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = BertForSequenceClassification(config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels + ) result = { "loss": loss, "logits": logits, } - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels]) self.check_loss_output(result) - def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = BertForTokenClassification(config=config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels + ) result = { "loss": loss, "logits": logits, } self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels] + ) self.check_loss_output(result) - def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_choices = self.num_choices model = BertForMultipleChoice(config=config) model.to(torch_device) @@ -290,24 +397,31 @@ class BertModelTest(CommonTestCases.CommonModelTester): multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - loss, logits = model(multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels) + loss, logits = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) result = { "loss": loss, "logits": logits, } - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.num_choices]) + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices]) self.check_loss_output(result) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 2116651f4a..6834c78d15 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -36,34 +36,48 @@ if is_torch_available(): import torch import numpy as np - from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel, - BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers import ( + AdaptiveEmbedding, + PretrainedConfig, + PreTrainedModel, + BertModel, + BertConfig, + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, + GPT2Config, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + ) if sys.version_info[0] == 2: import cPickle as pickle class TemporaryDirectory(object): """Context manager for tempfile.mkdtemp() so it's usable with "with" statement.""" + def __enter__(self): self.name = tempfile.mkdtemp() return self.name + def __exit__(self, exc_type, exc_value, traceback): shutil.rmtree(self.name) + + else: import pickle + TemporaryDirectory = tempfile.TemporaryDirectory unicode = str + def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if '_range' in key or '_std' in key or 'initializer_factor' in key: + if "_range" in key or "_std" in key or "initializer_factor" in key: setattr(configs_no_init, key, 0.0) return configs_no_init -class CommonTestCases: +class CommonTestCases: @require_torch class CommonModelTester(unittest.TestCase): @@ -108,8 +122,11 @@ class CommonTestCases: model = model_class(config=configs_no_init) for name, param in model.named_parameters(): if param.requires_grad: - self.assertIn(param.data.mean().item(), [0.0, 1.0], - msg="Parameter {} of model {} seems not properly initialized".format(name, model_class)) + self.assertIn( + param.data.mean().item(), + [0.0, 1.0], + msg="Parameter {} of model {} seems not properly initialized".format(name, model_class), + ) def test_determinism(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -131,10 +148,22 @@ class CommonTestCases: def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length - encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length - decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length - encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length + decoder_seq_length = ( + self.model_tester.decoder_seq_length + if hasattr(self.model_tester, "decoder_seq_length") + else self.model_tester.seq_length + ) + encoder_seq_length = ( + self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "encoder_seq_length") + else self.model_tester.seq_length + ) + decoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length + ) + encoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length + ) for model_class in self.all_model_classes: config.output_attentions = True @@ -150,23 +179,20 @@ class CommonTestCases: self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - encoder_seq_length , - encoder_key_length]) + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) out_len = len(outputs) if self.is_encoder_decoder: self.assertEqual(out_len % 2, 0) - decoder_attentions = outputs[(out_len // 2)-1] + decoder_attentions = outputs[(out_len // 2) - 1] self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, False) self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - decoder_seq_length, - decoder_key_length - ]) + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) # Check attention is always last and order is fine config.output_attentions = True @@ -184,9 +210,8 @@ class CommonTestCases: self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length]) + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) def test_torchscript(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -215,7 +240,7 @@ class CommonTestCases: model = model_class(config=configs_no_init) model.to(torch_device) model.eval() - inputs = inputs_dict['input_ids'] # Let's keep only input_ids + inputs = inputs_dict["input_ids"] # Let's keep only input_ids try: traced_gpt2 = torch.jit.trace(model, inputs) @@ -269,12 +294,14 @@ class CommonTestCases: # Prepare head_mask # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior) - head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device) + head_mask = torch.ones( + self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device + ) head_mask[0, 0] = 0 head_mask[-1, :-1] = 0 head_mask.requires_grad_(requires_grad=True) inputs = inputs_dict.copy() - inputs['head_mask'] = head_mask + inputs["head_mask"] = head_mask outputs = model(**inputs) @@ -289,21 +316,20 @@ class CommonTestCases: # Remove Nan for t in attentions: - self.assertLess(torch.sum(torch.isnan(t)), t.numel() / 4) # Check we don't have more than 25% nans (arbitrary) - attentions = [t.masked_fill(torch.isnan(t), 0.0) for t in attentions] # remove them (the test is less complete) + self.assertLess( + torch.sum(torch.isnan(t)), t.numel() / 4 + ) # Check we don't have more than 25% nans (arbitrary) + attentions = [ + t.masked_fill(torch.isnan(t), 0.0) for t in attentions + ] # remove them (the test is less complete) self.assertIsNotNone(multihead_outputs) self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers) - self.assertAlmostEqual( - attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual( - attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual( - attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) - self.assertAlmostEqual( - attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) - self.assertNotEqual( - attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual(attentions[0][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[0][..., -1, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[1][..., 0, :, :].flatten().sum().item(), 0.0) + self.assertAlmostEqual(attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0) + self.assertNotEqual(attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) def test_head_pruning(self): if not self.test_pruning: @@ -320,20 +346,16 @@ class CommonTestCases: model = model_class(config=config) model.to(torch_device) model.eval() - heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0]} + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} model.prune_heads(heads_to_prune) with torch.no_grad(): outputs = model(**inputs_dict) attentions = outputs[-1] - self.assertEqual( - attentions[0].shape[-3], 1) - self.assertEqual( - attentions[1].shape[-3], self.model_tester.num_attention_heads) - self.assertEqual( - attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) + self.assertEqual(attentions[0].shape[-3], 1) + self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) + self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) def test_head_pruning_save_load_from_pretrained(self): if not self.test_pruning: @@ -350,8 +372,7 @@ class CommonTestCases: model = model_class(config=config) model.to(torch_device) model.eval() - heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0]} + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} model.prune_heads(heads_to_prune) with TemporaryDirectory() as temp_dir_name: @@ -366,7 +387,6 @@ class CommonTestCases: self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) - def test_head_pruning_save_load_from_config_init(self): if not self.test_pruning: return @@ -380,8 +400,7 @@ class CommonTestCases: config.output_attentions = True config.output_hidden_states = False - heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), - -1: [0]} + heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} config.pruned_heads = heads_to_prune model = model_class(config=config) @@ -446,7 +465,7 @@ class CommonTestCases: outputs = model(**inputs_dict) attentions = outputs[-1] - self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1) + self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2) self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2) self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) @@ -470,8 +489,13 @@ class CommonTestCases: self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) self.assertListEqual( list(hidden_states[0].shape[-2:]), - [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length, - self.model_tester.hidden_size]) + [ + self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "encoder_seq_length") + else self.model_tester.seq_length, + self.model_tester.hidden_size, + ], + ) def test_resize_tokens_embeddings(self): original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -512,15 +536,10 @@ class CommonTestCases: for model_class in self.all_model_classes: model = model_class(config) - self.assertIsInstance( - model.get_input_embeddings(), - (torch.nn.Embedding, AdaptiveEmbedding) - ) + self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding)) model.set_input_embeddings(torch.nn.Embedding(10, 10)) x = model.get_output_embeddings() - self.assertTrue( - x is None or isinstance(x, torch.nn.Linear) - ) + self.assertTrue(x is None or isinstance(x, torch.nn.Linear)) def test_tie_model_weights(self): if not self.test_torchscript: @@ -602,30 +621,30 @@ class CommonTestCases: outputs = model(**inputs_dict) class GPTModelTester(CommonModelTester): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_position_ids=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - n_positions=33, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - n_choices=3, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - scope=None, - config_class=None, - base_model_class=None, - lm_head_model_class=None, - double_head_model_class=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_position_ids=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + n_positions=33, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + n_choices=3, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + scope=None, + config_class=None, + base_model_class=None, + lm_head_model_class=None, + double_head_model_class=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -676,13 +695,14 @@ class CommonTestCases: n_embd=self.hidden_size, n_layer=self.num_hidden_layers, n_head=self.num_attention_heads, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) - return (config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids) + return (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) - def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): + def create_and_check_base_model( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): model = self.base_model_class(config) model.to(torch_device) model.eval() @@ -694,12 +714,12 @@ class CommonTestCases: hidden_state = outputs[0] self.parent.assertListEqual( - list(hidden_state.size()), - [self.batch_size, self.n_choices, self.seq_length, self.hidden_size]) + list(hidden_state.size()), [self.batch_size, self.n_choices, self.seq_length, self.hidden_size] + ) - - def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): + def create_and_check_lm_head( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): model = self.lm_head_model_class(config) model.to(torch_device) model.eval() @@ -709,14 +729,13 @@ class CommonTestCases: total_voc = self.vocab_size self.parent.assertListEqual( - list(lm_logits.size()), - [self.batch_size, self.n_choices, self.seq_length, total_voc]) - self.parent.assertListEqual( - list(loss.size()), - []) + list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc] + ) + self.parent.assertListEqual(list(loss.size()), []) - def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): + def create_and_check_presents( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): for model_class in self.all_model_classes: model = model_class(config) model.to(torch_device) @@ -727,30 +746,39 @@ class CommonTestCases: self.parent.assertEqual(self.num_hidden_layers, len(presents)) self.parent.assertListEqual( list(presents[0].size()), - [2, self.batch_size * self.n_choices, self.num_attention_heads, - self.seq_length, self.hidden_size // self.num_attention_heads]) + [ + 2, + self.batch_size * self.n_choices, + self.num_attention_heads, + self.seq_length, + self.hidden_size // self.num_attention_heads, + ], + ) - def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids): + def create_and_check_double_heads( + self, config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids + ): model = self.double_head_model_class(config) model.to(torch_device) model.eval() with torch.no_grad(): - outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels, - token_type_ids=token_type_ids, position_ids=position_ids) + outputs = model( + input_ids, + mc_token_ids, + lm_labels=lm_labels, + mc_labels=mc_labels, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4] loss = [lm_loss, mc_loss] total_voc = self.vocab_size self.parent.assertListEqual( - list(lm_logits.size()), - [self.batch_size, self.n_choices, self.seq_length, total_voc]) - self.parent.assertListEqual( - list(mc_logits.size()), - [self.batch_size, self.n_choices]) - self.parent.assertListEqual( - [list(l.size()) for l in loss], - [[], []]) + list(lm_logits.size()), [self.batch_size, self.n_choices, self.seq_length, total_voc] + ) + self.parent.assertListEqual(list(mc_logits.size()), [self.batch_size, self.n_choices]) + self.parent.assertListEqual([list(l.size()) for l in loss], [[], []]) def create_and_check_model_from_pretrained(self): for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]: @@ -759,9 +787,8 @@ class CommonTestCases: def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, position_ids, - mc_labels, lm_labels, mc_token_ids) = config_and_inputs - inputs_dict = {'input_ids': input_ids} + (config, input_ids, token_type_ids, position_ids, mc_labels, lm_labels, mc_token_ids) = config_and_inputs + inputs_dict = {"input_ids": input_ids} return config, inputs_dict def run_common_tests(self, test_presents=False): @@ -791,10 +818,10 @@ class ConfigTester(object): def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) - self.parent.assertTrue(hasattr(config, 'vocab_size')) - self.parent.assertTrue(hasattr(config, 'hidden_size')) - self.parent.assertTrue(hasattr(config, 'num_attention_heads')) - self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) + self.parent.assertTrue(hasattr(config, "vocab_size")) + self.parent.assertTrue(hasattr(config, "hidden_size")) + self.parent.assertTrue(hasattr(config, "num_attention_heads")) + self.parent.assertTrue(hasattr(config, "num_hidden_layers")) def create_and_test_config_to_json_string(self): config = self.config_class(**self.inputs_dict) diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py index ed0d62d1e6..9b71b1dd50 100644 --- a/transformers/tests/modeling_ctrl_test.py +++ b/transformers/tests/modeling_ctrl_test.py @@ -21,10 +21,9 @@ import pdb from transformers import is_torch_available if is_torch_available(): - from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, - CTRLLMHeadModel) + from transformers import CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -39,32 +38,32 @@ class CTRLModelTest(CommonTestCases.CommonModelTester): test_head_masking = False class CTRLModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -129,12 +128,20 @@ class CTRLModelTest(CommonTestCases.CommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = CTRLModel(config=config) @@ -150,8 +157,8 @@ class CTRLModelTest(CommonTestCases.CommonModelTester): "presents": presents, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertEqual(len(result["presents"]), config.n_layer) def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): @@ -161,29 +168,28 @@ class CTRLModelTest(CommonTestCases.CommonModelTester): loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - result = { - "loss": loss, - "lm_logits": lm_logits - } + result = {"loss": loss, "lm_logits": lm_logits} + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["lm_logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) - + list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, head_mask, token_type_ids, - mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs - inputs_dict = { - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'head_mask': head_mask - } + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} return config, inputs_dict diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py index ac6f5d248e..5b4f4683de 100644 --- a/transformers/tests/modeling_distilbert_test.py +++ b/transformers/tests/modeling_distilbert_test.py @@ -21,11 +21,16 @@ import unittest from transformers import is_torch_available if is_torch_available(): - from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM, - DistilBertForTokenClassification, - DistilBertForQuestionAnswering, DistilBertForSequenceClassification) + from transformers import ( + DistilBertConfig, + DistilBertModel, + DistilBertForMaskedLM, + DistilBertForTokenClassification, + DistilBertForQuestionAnswering, + DistilBertForSequenceClassification, + ) -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -33,39 +38,42 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch class DistilBertModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, - DistilBertForSequenceClassification) if is_torch_available() else None + all_model_classes = ( + (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification) + if is_torch_available() + else None + ) test_pruning = True test_torchscript = True test_resize_embeddings = True test_head_masking = True class DistilBertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -114,16 +122,17 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): dropout=self.hidden_dropout_prob, attention_dropout=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = DistilBertModel(config=config) model.to(torch_device) model.eval() @@ -134,10 +143,12 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): "sequence_output": sequence_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) - def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = DistilBertForMaskedLM(config=config) model.to(torch_device) model.eval() @@ -147,29 +158,31 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = DistilBertForQuestionAnswering(config=config) model.to(torch_device) model.eval() - loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels) + loss, start_logits, end_logits = model( + input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels + ) result = { "loss": loss, "start_logits": start_logits, "end_logits": end_logits, } - self.parent.assertListEqual( - list(result["start_logits"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].size()), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) - def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = DistilBertForSequenceClassification(config) model.to(torch_device) @@ -179,12 +192,12 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): "loss": loss, "logits": logits, } - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels]) self.check_loss_output(result) - def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = DistilBertForTokenClassification(config=config) model.to(torch_device) @@ -196,14 +209,14 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): "logits": logits, } self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels] + ) self.check_loss_output(result) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask} + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -239,5 +252,6 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): # model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) # self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_encoder_decoder_test.py b/transformers/tests/modeling_encoder_decoder_test.py index 64e86df8f5..491c502bac 100644 --- a/transformers/tests/modeling_encoder_decoder_test.py +++ b/transformers/tests/modeling_encoder_decoder_test.py @@ -39,13 +39,13 @@ class EncoderDecoderModelTest(unittest.TestCase): def test_model2model_from_pretrained_not_bert(self): logging.basicConfig(level=logging.INFO) with self.assertRaises(ValueError): - _ = Model2Model.from_pretrained('roberta') + _ = Model2Model.from_pretrained("roberta") with self.assertRaises(ValueError): - _ = Model2Model.from_pretrained('distilbert') + _ = Model2Model.from_pretrained("distilbert") with self.assertRaises(ValueError): - _ = Model2Model.from_pretrained('does-not-exist') + _ = Model2Model.from_pretrained("does-not-exist") if __name__ == "__main__": diff --git a/transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py index ad2ec1fd91..2706166b33 100644 --- a/transformers/tests/modeling_gpt2_test.py +++ b/transformers/tests/modeling_gpt2_test.py @@ -21,10 +21,15 @@ import unittest from transformers import is_torch_available if is_torch_available(): - from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, - GPT2LMHeadModel, GPT2DoubleHeadsModel) + from transformers import ( + GPT2Config, + GPT2Model, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2LMHeadModel, + GPT2DoubleHeadsModel, + ) -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -35,32 +40,32 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else () class GPT2ModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -125,12 +130,20 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = GPT2Model(config=config) @@ -146,8 +159,8 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): "presents": presents, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertEqual(len(result["presents"]), config.n_layer) def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): @@ -157,63 +170,58 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - result = { - "loss": loss, - "lm_logits": lm_logits - } + result = {"loss": loss, "lm_logits": lm_logits} + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["lm_logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): + def create_and_check_double_lm_head_model( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): model = GPT2DoubleHeadsModel(config) model.to(torch_device) model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - inputs = {'input_ids': multiple_choice_inputs_ids, - 'mc_token_ids': mc_token_ids, - 'attention_mask': multiple_choice_input_mask, - 'token_type_ids': multiple_choice_token_type_ids, - 'lm_labels': multiple_choice_inputs_ids} + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + "lm_labels": multiple_choice_inputs_ids, + } loss, lm_logits, mc_logits, _ = model(**inputs) - result = { - "loss": loss, - "lm_logits": lm_logits, - "mc_logits": mc_logits - } + result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits} + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["lm_logits"].size()), - [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]) - self.parent.assertListEqual( - list(result["mc_logits"].size()), - [self.batch_size, self.num_choices]) + list(result["lm_logits"].size()), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size] + ) + self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, head_mask, token_type_ids, - mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs - inputs_dict = { - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'head_mask': head_mask - } + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} return config, inputs_dict diff --git a/transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py index 1880febcae..f22a0b760c 100644 --- a/transformers/tests/modeling_openai_test.py +++ b/transformers/tests/modeling_openai_test.py @@ -21,10 +21,15 @@ import unittest from transformers import is_torch_available if is_torch_available(): - from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, - OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) + from transformers import ( + OpenAIGPTConfig, + OpenAIGPTModel, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + OpenAIGPTLMHeadModel, + OpenAIGPTDoubleHeadsModel, + ) -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -32,33 +37,35 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else () + all_model_classes = ( + (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else () + ) class OpenAIGPTModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -116,9 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args): model = OpenAIGPTModel(config=config) @@ -129,12 +134,10 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): model(input_ids, token_type_ids=token_type_ids) (sequence_output,) = model(input_ids) - result = { - "sequence_output": sequence_output - } + result = {"sequence_output": sequence_output} self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): model = OpenAIGPTLMHeadModel(config) @@ -143,17 +146,12 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids) - result = { - "loss": loss, - "lm_logits": lm_logits - } + result = {"loss": loss, "lm_logits": lm_logits} + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["lm_logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args): model = OpenAIGPTDoubleHeadsModel(config) @@ -162,26 +160,25 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids) - result = { - "loss": loss, - "lm_logits": lm_logits - } + result = {"loss": loss, "lm_logits": lm_logits} + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["lm_logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = { - 'input_ids': input_ids, - 'token_type_ids': token_type_ids, - 'head_mask': head_mask - } + ( + config, + input_ids, + head_mask, + token_type_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "head_mask": head_mask} return config, inputs_dict diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py index 732e589cdf..451dafe08e 100644 --- a/transformers/tests/modeling_roberta_test.py +++ b/transformers/tests/modeling_roberta_test.py @@ -22,12 +22,17 @@ from transformers import is_torch_available if is_torch_available(): import torch - from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, - RobertaForSequenceClassification, RobertaForTokenClassification) + from transformers import ( + RobertaConfig, + RobertaModel, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaForTokenClassification, + ) from transformers.modeling_roberta import RobertaEmbeddings from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -38,31 +43,31 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else () class RobertaModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -116,17 +121,17 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, - token_labels, choice_labels): + def create_and_check_roberta_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = RobertaModel(config=config) model.to(torch_device) model.eval() @@ -139,47 +144,59 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): "pooled_output": pooled_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) - def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, - token_labels, choice_labels): + def create_and_check_roberta_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = RobertaForMaskedLM(config=config) model.to(torch_device) model.eval() - loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels) + loss, prediction_scores = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels + ) result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.check_loss_output(result) - def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels): + def create_and_check_roberta_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = RobertaForTokenClassification(config=config) model.to(torch_device) model.eval() - loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - labels=token_labels) + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels + ) result = { "loss": loss, "logits": logits, } self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels] + ) self.check_loss_output(result) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -214,18 +231,12 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): model = RobertaEmbeddings(config=config) input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) - expected_positions = torch.as_tensor([[ - 0 + model.padding_idx + 1, - 1 + model.padding_idx + 1, - 2 + model.padding_idx + 1, - model.padding_idx - ]]) + expected_positions = torch.as_tensor( + [[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]] + ) position_ids = model.create_position_ids_from_input_ids(input_ids) - self.assertEqual( - position_ids.shape, - expected_positions.shape - ) + self.assertEqual(position_ids.shape, expected_positions.shape) self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) def test_create_position_ids_from_inputs_embeds(self): @@ -247,69 +258,47 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): ] expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) - self.assertEqual( - position_ids.shape, - expected_positions.shape - ) - self.assertTrue( - torch.all(torch.eq(position_ids, expected_positions)) - ) + self.assertEqual(position_ids.shape, expected_positions.shape) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) class RobertaModelIntegrationTest(unittest.TestCase): - @slow def test_inference_masked_lm(self): - model = RobertaForMaskedLM.from_pretrained('roberta-base') + model = RobertaForMaskedLM.from_pretrained("roberta-base") - input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] expected_shape = torch.Size((1, 11, 50265)) - self.assertEqual( - output.shape, - expected_shape - ) + self.assertEqual(output.shape, expected_shape) # compare the actual values for a slice. expected_slice = torch.Tensor( - [[[33.8843, -4.3107, 22.7779], - [ 4.6533, -2.8099, 13.6252], - [ 1.8222, -3.6898, 8.8600]]] - ) - self.assertTrue( - torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) + [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]] ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)) @slow def test_inference_no_head(self): - model = RobertaModel.from_pretrained('roberta-base') + model = RobertaModel.from_pretrained("roberta-base") - input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] # compare the actual values for a slice. expected_slice = torch.Tensor( - [[[-0.0231, 0.0782, 0.0074], - [-0.1854, 0.0539, -0.0174], - [ 0.0548, 0.0799, 0.1687]]] - ) - self.assertTrue( - torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3) + [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]] ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)) @slow def test_inference_classification_head(self): - model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli') + model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli") - input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] expected_shape = torch.Size((1, 3)) - self.assertEqual( - output.shape, - expected_shape - ) - expected_tensor = torch.Tensor([[-0.9469, 0.3913, 0.5118]]) - self.assertTrue( - torch.allclose(output, expected_tensor, atol=1e-3) - ) + self.assertEqual(output.shape, expected_shape) + expected_tensor = torch.Tensor([[-0.9469, 0.3913, 0.5118]]) + self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-3)) if __name__ == "__main__": diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py index 9fd9a4b304..3feb61a622 100644 --- a/transformers/tests/modeling_t5_test.py +++ b/transformers/tests/modeling_t5_test.py @@ -20,12 +20,12 @@ import unittest from transformers import is_torch_available -from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor, floats_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): - from transformers import (T5Config, T5Model, T5WithLMHeadModel) + from transformers import T5Config, T5Model, T5WithLMHeadModel from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP @@ -39,26 +39,26 @@ class T5ModelTest(CommonTestCases.CommonModelTester): is_encoder_decoder = True class T5ModelTester(object): - - def __init__(self, - parent, - batch_size=13, - encoder_seq_length=7, - decoder_seq_length=9, - is_training=True, - use_attention_mask=True, - use_labels=True, - vocab_size=99, - n_positions=14, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - dropout_rate=0.1, - initializer_factor=0.002, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + encoder_seq_length=7, + decoder_seq_length=9, + is_training=True, + use_attention_mask=True, + use_labels=True, + vocab_size=99, + n_positions=14, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + d_ff=37, + relative_attention_num_buckets=8, + dropout_rate=0.1, + initializer_factor=0.002, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -101,60 +101,96 @@ class T5ModelTest(CommonTestCases.CommonModelTester): num_heads=self.num_attention_heads, relative_attention_num_buckets=self.relative_attention_num_buckets, dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor) + initializer_factor=self.initializer_factor, + ) - return (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels) + return ( + config, + encoder_input_ids, + decoder_input_ids, + encoder_attention_mask, + decoder_attention_mask, + decoder_lm_labels, + ) def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels): + def create_and_check_t5_model( + self, + config, + encoder_input_ids, + decoder_input_ids, + encoder_attention_mask, + decoder_attention_mask, + decoder_lm_labels, + ): model = T5Model(config=config) model.eval() - decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids, - decoder_input_ids=decoder_input_ids, - encoder_attention_mask=encoder_attention_mask, - decoder_attention_mask=decoder_attention_mask) - decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids, - decoder_input_ids=decoder_input_ids) + decoder_output, encoder_output = model( + encoder_input_ids=encoder_input_ids, + decoder_input_ids=decoder_input_ids, + encoder_attention_mask=encoder_attention_mask, + decoder_attention_mask=decoder_attention_mask, + ) + decoder_output, encoder_output = model( + encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids + ) result = { "encoder_output": encoder_output, "decoder_output": decoder_output, } self.parent.assertListEqual( - list(result["encoder_output"].size()), - [self.batch_size, self.encoder_seq_length, self.hidden_size]) + list(result["encoder_output"].size()), [self.batch_size, self.encoder_seq_length, self.hidden_size] + ) self.parent.assertListEqual( - list(result["decoder_output"].size()), - [self.batch_size, self.decoder_seq_length, self.hidden_size]) + list(result["decoder_output"].size()), [self.batch_size, self.decoder_seq_length, self.hidden_size] + ) - - def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels): + def create_and_check_t5_with_lm_head( + self, + config, + encoder_input_ids, + decoder_input_ids, + encoder_attention_mask, + decoder_attention_mask, + decoder_lm_labels, + ): model = T5WithLMHeadModel(config=config) model.eval() - outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels) + outputs = model( + encoder_input_ids=encoder_input_ids, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_lm_labels=decoder_lm_labels, + ) loss, prediction_scores = outputs[0], outputs[1] result = { "loss": loss, "prediction_scores": prediction_scores, } self.parent.assertListEqual( - list(result["prediction_scores"].size()), - [self.batch_size, self.decoder_seq_length, self.vocab_size]) + list(result["prediction_scores"].size()), [self.batch_size, self.decoder_seq_length, self.vocab_size] + ) self.check_loss_output(result) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, - decoder_attention_mask, decoder_lm_labels) = config_and_inputs - inputs_dict = {'encoder_input_ids': encoder_input_ids, - 'decoder_input_ids': decoder_input_ids, - 'decoder_attention_mask': decoder_attention_mask, - 'encoder_attention_mask': encoder_attention_mask} + ( + config, + encoder_input_ids, + decoder_input_ids, + encoder_attention_mask, + decoder_attention_mask, + decoder_lm_labels, + ) = config_and_inputs + inputs_dict = { + "encoder_input_ids": encoder_input_ids, + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": decoder_attention_mask, + "encoder_attention_mask": encoder_attention_mask, + } return config, inputs_dict def setUp(self): @@ -178,5 +214,6 @@ class T5ModelTest(CommonTestCases.CommonModelTester): model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py index 374417cfe2..0406592d54 100644 --- a/transformers/tests/modeling_tf_albert_test.py +++ b/transformers/tests/modeling_tf_albert_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,47 +27,48 @@ from transformers import AlbertConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM, - TFAlbertForSequenceClassification, - TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_albert import ( + TFAlbertModel, + TFAlbertForMaskedLM, + TFAlbertForSequenceClassification, + TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): all_model_classes = ( - TFAlbertModel, - TFAlbertForMaskedLM, - TFAlbertForSequenceClassification - ) if is_tf_available() else () + (TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification) if is_tf_available() else () + ) class TFAlbertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - embedding_size=16, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + embedding_size=16, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -93,27 +94,22 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): self.scope = scope def prepare_config_and_inputs(self): - input_ids = ids_tensor( - [self.batch_size, self.seq_length], self.vocab_size) + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_mask = None if self.use_input_mask: - input_mask = ids_tensor( - [self.batch_size, self.seq_length], vocab_size=2) + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) token_type_ids = None if self.use_token_type_ids: - token_type_ids = ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size) + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) sequence_labels = None token_labels = None choice_labels = None if self.use_labels: - sequence_labels = ids_tensor( - [self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor( - [self.batch_size, self.seq_length], self.num_labels) + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) choice_labels = ids_tensor([self.batch_size], self.num_choices) config = AlbertConfig( @@ -127,19 +123,20 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFAlbertModel(config=config) # inputs = {'input_ids': input_ids, # 'attention_mask': input_mask, # 'token_type_ids': token_type_ids} # sequence_output, pooled_output = model(**inputs) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output, pooled_output = model(inputs) inputs = [input_ids, input_mask] @@ -152,50 +149,52 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): "pooled_output": pooled_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - self.parent.assertListEqual(list(result["pooled_output"].shape), [ - self.batch_size, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) + self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) - def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFAlbertForMaskedLM(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - prediction_scores, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (prediction_scores,) = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_albert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFAlbertForSequenceClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, - 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self) - self.config_tester = ConfigTester( - self, config_class=AlbertConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @@ -206,13 +205,11 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_albert_for_masked_lm( - *config_and_inputs) + self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs) def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_albert_for_sequence_classification( - *config_and_inputs) + self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs) @slow def test_model_from_pretrained(self): diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py index 2ad39ddccf..d695474ecf 100644 --- a/transformers/tests/modeling_tf_auto_test.py +++ b/transformers/tests/modeling_tf_auto_test.py @@ -25,14 +25,21 @@ from transformers import is_tf_available from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER if is_tf_available(): - from transformers import (AutoConfig, BertConfig, - TFAutoModel, TFBertModel, - TFAutoModelWithLMHead, TFBertForMaskedLM, - TFAutoModelForSequenceClassification, TFBertForSequenceClassification, - TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering) + from transformers import ( + AutoConfig, + BertConfig, + TFAutoModel, + TFBertModel, + TFAutoModelWithLMHead, + TFBertForMaskedLM, + TFAutoModelForSequenceClassification, + TFBertForSequenceClassification, + TFAutoModelForQuestionAnswering, + TFBertForQuestionAnswering, + ) from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP - from .modeling_common_test import (CommonTestCases, ids_tensor) + from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester @@ -41,11 +48,12 @@ class TFAutoModelTest(unittest.TestCase): @slow def test_model_from_pretrained(self): import h5py + self.assertTrue(h5py.version.hdf5_version.startswith("1.10")) logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for model_name in ['bert-base-uncased']: + for model_name in ["bert-base-uncased"]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) @@ -58,7 +66,7 @@ class TFAutoModelTest(unittest.TestCase): def test_lmhead_model_from_pretrained(self): logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for model_name in ['bert-base-uncased']: + for model_name in ["bert-base-uncased"]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) @@ -71,7 +79,7 @@ class TFAutoModelTest(unittest.TestCase): def test_sequence_classification_model_from_pretrained(self): logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for model_name in ['bert-base-uncased']: + for model_name in ["bert-base-uncased"]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) @@ -84,7 +92,7 @@ class TFAutoModelTest(unittest.TestCase): def test_question_answering_model_from_pretrained(self): logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for model_name in ['bert-base-uncased']: + for model_name in ["bert-base-uncased"]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py index abf20b1514..e36e3a2c3f 100644 --- a/transformers/tests/modeling_tf_bert_test.py +++ b/transformers/tests/modeling_tf_bert_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,49 +27,62 @@ from transformers import BertConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_bert import (TFBertModel, TFBertForMaskedLM, - TFBertForNextSentencePrediction, - TFBertForPreTraining, - TFBertForSequenceClassification, - TFBertForMultipleChoice, - TFBertForTokenClassification, - TFBertForQuestionAnswering, - TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_bert import ( + TFBertModel, + TFBertForMaskedLM, + TFBertForNextSentencePrediction, + TFBertForPreTraining, + TFBertForSequenceClassification, + TFBertForMultipleChoice, + TFBertForTokenClassification, + TFBertForQuestionAnswering, + TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction, - TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification, - TFBertForTokenClassification) if is_tf_available() else () + all_model_classes = ( + ( + TFBertModel, + TFBertForMaskedLM, + TFBertForNextSentencePrediction, + TFBertForPreTraining, + TFBertForQuestionAnswering, + TFBertForSequenceClassification, + TFBertForTokenClassification, + ) + if is_tf_available() + else () + ) class TFBertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -123,15 +136,16 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFBertModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output, pooled_output = model(inputs) inputs = [input_ids, input_mask] @@ -144,128 +158,119 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): "pooled_output": pooled_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) - - def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFBertForMaskedLM(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - prediction_scores, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (prediction_scores,) = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - - def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_next_sequence_prediction( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFBertForNextSentencePrediction(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - seq_relationship_score, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (seq_relationship_score,) = model(inputs) result = { "seq_relationship_score": seq_relationship_score.numpy(), } - self.parent.assertListEqual( - list(result["seq_relationship_score"].shape), - [self.batch_size, 2]) + self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) - - def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFBertForPreTraining(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} prediction_scores, seq_relationship_score = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), "seq_relationship_score": seq_relationship_score.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) - self.parent.assertListEqual( - list(result["seq_relationship_score"].shape), - [self.batch_size, 2]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) + self.parent.assertListEqual(list(result["seq_relationship_score"].shape), [self.batch_size, 2]) - - def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFBertForSequenceClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) - - def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_choices = self.num_choices model = TFBertForMultipleChoice(config=config) multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) - inputs = {'input_ids': multiple_choice_inputs_ids, - 'attention_mask': multiple_choice_input_mask, - 'token_type_ids': multiple_choice_token_type_ids} - logits, = model(inputs) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + (logits,) = model(inputs) result = { "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.num_choices]) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) - - def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFBertForTokenClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] + ) - - def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_bert_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFBertForQuestionAnswering(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} start_logits, end_logits = model(inputs) result = { "start_logits": start_logits.numpy(), "end_logits": end_logits.numpy(), } - self.parent.assertListEqual( - list(result["start_logits"].shape), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].shape), - [self.batch_size, self.seq_length]) - + self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -310,10 +315,10 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for model_name in ['bert-base-uncased']: + for model_name in ["bert-base-uncased"]: model = TFBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() - diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index 5a5873e81b..d65e270ae1 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -35,6 +35,7 @@ if is_tf_available(): import tensorflow as tf import numpy as np from transformers import TFPreTrainedModel + # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP if sys.version_info[0] == 2: @@ -42,25 +43,31 @@ if sys.version_info[0] == 2: class TemporaryDirectory(object): """Context manager for tempfile.mkdtemp() so it's usable with "with" statement.""" + def __enter__(self): self.name = tempfile.mkdtemp() return self.name + def __exit__(self, exc_type, exc_value, traceback): shutil.rmtree(self.name) + + else: import pickle + TemporaryDirectory = tempfile.TemporaryDirectory unicode = str + def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if '_range' in key or '_std' in key: + if "_range" in key or "_std" in key: setattr(configs_no_init, key, 0.0) return configs_no_init -class TFCommonTestCases: +class TFCommonTestCases: @require_tf class TFCommonModelTester(unittest.TestCase): @@ -126,8 +133,9 @@ class TFCommonTestCases: # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences pt_model.eval() - pt_inputs_dict = dict((name, torch.from_numpy(key.numpy()).to(torch.long)) - for name, key in inputs_dict.items()) + pt_inputs_dict = dict( + (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() + ) with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(inputs_dict, training=False) @@ -140,18 +148,19 @@ class TFCommonTestCases: # Check we can load pt model in tf and vice-versa with checkpoint => model functions with TemporaryDirectory() as tmpdirname: - pt_checkpoint_path = os.path.join(tmpdirname, 'pt_model.bin') + pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") torch.save(pt_model.state_dict(), pt_checkpoint_path) tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) - tf_checkpoint_path = os.path.join(tmpdirname, 'tf_model.h5') + tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") tf_model.save_weights(tf_checkpoint_path) pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences pt_model.eval() - pt_inputs_dict = dict((name, torch.from_numpy(key.numpy()).to(torch.long)) - for name, key in inputs_dict.items()) + pt_inputs_dict = dict( + (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() + ) with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(inputs_dict) @@ -166,13 +175,19 @@ class TFCommonTestCases: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if self.is_encoder_decoder: - input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'), - 'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')} + input_ids = { + "decoder_input_ids": tf.keras.Input( + batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32" + ), + "encoder_input_ids": tf.keras.Input( + batch_shape=(2, 2000), name="encoder_input_ids", dtype="int32" + ), + } else: - input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32') + input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32") optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") for model_class in self.all_model_classes: # Prepare our model @@ -188,7 +203,7 @@ class TFCommonTestCases: hidden_states = outputs_dict[0] # Add a dense layer on top to test intetgration with other keras modules - outputs = tf.keras.layers.Dense(2, activation='softmax', name='outputs')(hidden_states) + outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states) # Compile extended model extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs]) @@ -202,7 +217,9 @@ class TFCommonTestCases: outputs_dict = model(inputs_dict) inputs_keywords = copy.deepcopy(inputs_dict) - input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None) + input_ids = inputs_keywords.pop( + "input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None + ) outputs_keywords = model(input_ids, **inputs_keywords) output_dict = outputs_dict[0].numpy() @@ -213,10 +230,22 @@ class TFCommonTestCases: def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length - encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length - decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length - encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length + decoder_seq_length = ( + self.model_tester.decoder_seq_length + if hasattr(self.model_tester, "decoder_seq_length") + else self.model_tester.seq_length + ) + encoder_seq_length = ( + self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "encoder_seq_length") + else self.model_tester.seq_length + ) + decoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else decoder_seq_length + ) + encoder_key_length = ( + self.model_tester.key_length if hasattr(self.model_tester, "key_length") else encoder_seq_length + ) for model_class in self.all_model_classes: config.output_attentions = True @@ -229,22 +258,20 @@ class TFCommonTestCases: self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length]) + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) out_len = len(outputs) if self.is_encoder_decoder: self.assertEqual(out_len % 2, 0) - decoder_attentions = outputs[(out_len // 2)-1] + decoder_attentions = outputs[(out_len // 2) - 1] self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, False) self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(decoder_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - decoder_seq_length, - decoder_key_length]) + [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length], + ) # Check attention is always last and order is fine config.output_attentions = True @@ -259,9 +286,8 @@ class TFCommonTestCases: self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, - encoder_seq_length, - encoder_key_length]) + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) def test_hidden_states_output(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -276,8 +302,8 @@ class TFCommonTestCases: self.assertEqual(model.config.output_hidden_states, True) self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size]) + list(hidden_states[0].shape[-2:]), [self.model_tester.seq_length, self.model_tester.hidden_size] + ) def test_model_common_attributes(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -357,9 +383,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): for _ in range(total_dims): values.append(rng.randint(0, vocab_size - 1)) - output = tf.constant(values, - shape=shape, - dtype=dtype if dtype is not None else tf.int32) + output = tf.constant(values, shape=shape, dtype=dtype if dtype is not None else tf.int32) return output diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py index 93b231e517..fb8c4c2551 100644 --- a/transformers/tests/modeling_tf_ctrl_test.py +++ b/transformers/tests/modeling_tf_ctrl_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,8 +27,7 @@ from transformers import CTRLConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel, - TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP @require_tf @@ -37,32 +36,32 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else () class TFCTRLModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -127,13 +126,21 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFCTRLModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output = model(inputs)[0] inputs = [input_ids, None, input_mask] # None is the input for 'past' @@ -145,30 +152,36 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) def create_and_check_ctrl_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFCTRLLMHeadModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} prediction_scores = model(inputs)[0] result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, head_mask, token_type_ids, - mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -192,6 +205,6 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): model = TFCTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() - diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py index f28b5c397b..3260f63d56 100644 --- a/transformers/tests/modeling_tf_distilbert_test.py +++ b/transformers/tests/modeling_tf_distilbert_test.py @@ -18,7 +18,7 @@ from __future__ import print_function import unittest -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -26,48 +26,58 @@ from transformers import DistilBertConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_distilbert import (TFDistilBertModel, - TFDistilBertForMaskedLM, - TFDistilBertForQuestionAnswering, - TFDistilBertForSequenceClassification) + from transformers.modeling_tf_distilbert import ( + TFDistilBertModel, + TFDistilBertForMaskedLM, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + ) @require_tf class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, - TFDistilBertForSequenceClassification) if is_tf_available() else None + all_model_classes = ( + ( + TFDistilBertModel, + TFDistilBertForMaskedLM, + TFDistilBertForQuestionAnswering, + TFDistilBertForSequenceClassification, + ) + if is_tf_available() + else None + ) test_pruning = True test_torchscript = True test_resize_embeddings = True test_head_masking = True class TFDistilBertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=False, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -116,14 +126,16 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): dropout=self.hidden_dropout_prob, attention_dropout=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels - def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFDistilBertModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask} + inputs = {"input_ids": input_ids, "attention_mask": input_mask} outputs = model(inputs) sequence_output = outputs[0] @@ -136,54 +148,51 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) - def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_masked_lm( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFDistilBertForMaskedLM(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask} + inputs = {"input_ids": input_ids, "attention_mask": input_mask} (prediction_scores,) = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFDistilBertForQuestionAnswering(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask} + inputs = {"input_ids": input_ids, "attention_mask": input_mask} start_logits, end_logits = model(inputs) result = { "start_logits": start_logits.numpy(), "end_logits": end_logits.numpy(), } - self.parent.assertListEqual( - list(result["start_logits"].shape), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].shape), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) - def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_distilbert_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFDistilBertForSequenceClassification(config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask} + inputs = {"input_ids": input_ids, "attention_mask": input_mask} (logits,) = model(inputs) result = { "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.num_labels]) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask} + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -215,5 +224,6 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): # model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) # self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py index 90920342ba..09b7eb0710 100644 --- a/transformers/tests/modeling_tf_gpt2_test.py +++ b/transformers/tests/modeling_tf_gpt2_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,45 +27,47 @@ from transformers import GPT2Config, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel, - TFGPT2DoubleHeadsModel, - TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_gpt2 import ( + TFGPT2Model, + TFGPT2LMHeadModel, + TFGPT2DoubleHeadsModel, + TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, - TFGPT2DoubleHeadsModel) if is_tf_available() else () + all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2DoubleHeadsModel) if is_tf_available() else () # all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel) if is_tf_available() else () class TFGPT2ModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -130,13 +132,21 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFGPT2Model(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output = model(inputs)[0] inputs = [input_ids, None, input_mask] # None is the input for 'past' @@ -148,54 +158,58 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) def create_and_check_gpt2_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFGPT2LMHeadModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} prediction_scores = model(inputs)[0] result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - - def create_and_check_gpt2_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): + def create_and_check_gpt2_double_head( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): model = TFGPT2DoubleHeadsModel(config=config) multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) - inputs = {'input_ids': multiple_choice_inputs_ids, - 'mc_token_ids': mc_token_ids, - 'attention_mask': multiple_choice_input_mask, - 'token_type_ids': multiple_choice_token_type_ids} - lm_logits, mc_logits = model(inputs)[:2] - result = { - "lm_logits": lm_logits.numpy(), - "mc_logits": mc_logits.numpy() + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, } + lm_logits, mc_logits = model(inputs)[:2] + result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()} self.parent.assertListEqual( - list(result["lm_logits"].shape), - [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]) - self.parent.assertListEqual( - list(result["mc_logits"].shape), - [self.batch_size, self.num_choices]) + list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size] + ) + self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, head_mask, token_type_ids, - mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -223,6 +237,6 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() - diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py index 065bf2acde..a59395e02b 100644 --- a/transformers/tests/modeling_tf_openai_gpt_test.py +++ b/transformers/tests/modeling_tf_openai_gpt_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,44 +27,48 @@ from transformers import OpenAIGPTConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, - TFOpenAIGPTDoubleHeadsModel, - TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_openai import ( + TFOpenAIGPTModel, + TFOpenAIGPTLMHeadModel, + TFOpenAIGPTDoubleHeadsModel, + TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, - TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else () + all_model_classes = ( + (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel) if is_tf_available() else () + ) class TFOpenAIGPTModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_token_type_ids=True, - use_input_mask=True, - use_labels=True, - use_mc_token_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_token_type_ids=True, + use_input_mask=True, + use_labels=True, + use_mc_token_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -129,13 +133,21 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) - return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels + return ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) def create_and_check_openai_gpt_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFOpenAIGPTModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output = model(inputs)[0] inputs = [input_ids, input_mask] @@ -147,54 +159,58 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) def create_and_check_openai_gpt_lm_head(self, config, input_ids, input_mask, head_mask, token_type_ids, *args): model = TFOpenAIGPTLMHeadModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} prediction_scores = model(inputs)[0] result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - - def create_and_check_openai_gpt_double_head(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args): + def create_and_check_openai_gpt_double_head( + self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args + ): model = TFOpenAIGPTDoubleHeadsModel(config=config) multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) - inputs = {'input_ids': multiple_choice_inputs_ids, - 'mc_token_ids': mc_token_ids, - 'attention_mask': multiple_choice_input_mask, - 'token_type_ids': multiple_choice_token_type_ids} - lm_logits, mc_logits = model(inputs)[:2] - result = { - "lm_logits": lm_logits.numpy(), - "mc_logits": mc_logits.numpy() + inputs = { + "input_ids": multiple_choice_inputs_ids, + "mc_token_ids": mc_token_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, } + lm_logits, mc_logits = model(inputs)[:2] + result = {"lm_logits": lm_logits.numpy(), "mc_logits": mc_logits.numpy()} self.parent.assertListEqual( - list(result["lm_logits"].shape), - [self.batch_size, self.num_choices, self.seq_length, self.vocab_size]) - self.parent.assertListEqual( - list(result["mc_logits"].shape), - [self.batch_size, self.num_choices]) + list(result["lm_logits"].shape), [self.batch_size, self.num_choices, self.seq_length, self.vocab_size] + ) + self.parent.assertListEqual(list(result["mc_logits"].shape), [self.batch_size, self.num_choices]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, head_mask, token_type_ids, - mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs + ( + config, + input_ids, + input_mask, + head_mask, + token_type_ids, + mc_token_ids, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -222,6 +238,6 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() - diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py index 93c478ae28..23ea557404 100644 --- a/transformers/tests/modeling_tf_roberta_test.py +++ b/transformers/tests/modeling_tf_roberta_test.py @@ -18,7 +18,7 @@ from __future__ import print_function import unittest -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,44 +27,48 @@ from transformers import RobertaConfig, is_tf_available if is_tf_available(): import tensorflow as tf import numpy - from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM, - TFRobertaForSequenceClassification, - TFRobertaForTokenClassification, - TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_roberta import ( + TFRobertaModel, + TFRobertaForMaskedLM, + TFRobertaForSequenceClassification, + TFRobertaForTokenClassification, + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM, - TFRobertaForSequenceClassification) if is_tf_available() else () + all_model_classes = ( + (TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification) if is_tf_available() else () + ) class TFRobertaModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -118,16 +122,16 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + initializer_range=self.initializer_range, + ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, - token_labels, choice_labels): + def create_and_check_roberta_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFRobertaModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} sequence_output = model(inputs)[0] inputs = [input_ids, input_mask] @@ -139,39 +143,47 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) - def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, - token_labels, choice_labels): + def create_and_check_roberta_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): model = TFRobertaForMaskedLM(config=config) prediction_scores = model([input_ids, input_mask, token_type_ids])[0] result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_roberta_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): config.num_labels = self.num_labels model = TFRobertaForTokenClassification(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - logits, = model(inputs) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) result = { "logits": logits.numpy(), } self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.seq_length, self.num_labels]) + list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} return config, inputs_dict def setUp(self): @@ -196,61 +208,43 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): self.assertIsNotNone(model) - class TFRobertaModelIntegrationTest(unittest.TestCase): - @slow def test_inference_masked_lm(self): - model = TFRobertaForMaskedLM.from_pretrained('roberta-base') + model = TFRobertaForMaskedLM.from_pretrained("roberta-base") - input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] expected_shape = [1, 11, 50265] - self.assertEqual( - list(output.numpy().shape), - expected_shape - ) + self.assertEqual(list(output.numpy().shape), expected_shape) # compare the actual values for a slice. expected_slice = tf.constant( - [[[33.8843, -4.3107, 22.7779], - [ 4.6533, -2.8099, 13.6252], - [ 1.8222, -3.6898, 8.8600]]] - ) - self.assertTrue( - numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3) + [[[33.8843, -4.3107, 22.7779], [4.6533, -2.8099, 13.6252], [1.8222, -3.6898, 8.8600]]] ) + self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)) @slow def test_inference_no_head(self): - model = TFRobertaModel.from_pretrained('roberta-base') + model = TFRobertaModel.from_pretrained("roberta-base") - input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] # compare the actual values for a slice. expected_slice = tf.constant( - [[[-0.0231, 0.0782, 0.0074], - [-0.1854, 0.0539, -0.0174], - [ 0.0548, 0.0799, 0.1687]]] - ) - self.assertTrue( - numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3) + [[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0539, -0.0174], [0.0548, 0.0799, 0.1687]]] ) + self.assertTrue(numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)) @slow def test_inference_classification_head(self): - model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli') + model = TFRobertaForSequenceClassification.from_pretrained("roberta-large-mnli") - input_ids = tf.constant([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) output = model(input_ids)[0] expected_shape = [1, 3] - self.assertEqual( - list(output.numpy().shape), - expected_shape - ) - expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]]) - self.assertTrue( - numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3) - ) + self.assertEqual(list(output.numpy().shape), expected_shape) + expected_tensor = tf.constant([[-0.9469, 0.3913, 0.5118]]) + self.assertTrue(numpy.allclose(output.numpy(), expected_tensor.numpy(), atol=1e-3)) if __name__ == "__main__": diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py index da9ce6f89d..521085219b 100644 --- a/transformers/tests/modeling_tf_t5_test.py +++ b/transformers/tests/modeling_tf_t5_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import sys -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,8 +27,7 @@ from transformers import T5Config, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel, - TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP @require_tf @@ -38,25 +37,25 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else () class TFT5ModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_labels=True, - vocab_size=99, - n_positions=14, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - d_ff=37, - relative_attention_num_buckets=8, - dropout_rate=0.1, - initializer_factor=0.002, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + n_positions=14, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + d_ff=37, + relative_attention_num_buckets=8, + dropout_rate=0.1, + initializer_factor=0.002, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -95,53 +94,58 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): num_heads=self.num_attention_heads, relative_attention_num_buckets=self.relative_attention_num_buckets, dropout_rate=self.dropout_rate, - initializer_factor=self.initializer_factor) + initializer_factor=self.initializer_factor, + ) return (config, input_ids, input_mask, token_labels) def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels): model = TFT5Model(config=config) - inputs = {'encoder_input_ids': input_ids, - 'decoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + inputs = { + "encoder_input_ids": input_ids, + "decoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } encoder_output, decoder_output = model(inputs) - encoder_output, decoder_output = model(input_ids, - decoder_attention_mask=input_mask, - encoder_input_ids=input_ids) + encoder_output, decoder_output = model( + input_ids, decoder_attention_mask=input_mask, encoder_input_ids=input_ids + ) result = { "encoder_output": encoder_output.numpy(), "decoder_output": decoder_output.numpy(), } self.parent.assertListEqual( - list(result["encoder_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["encoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( - list(result["decoder_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) - + list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): model = TFT5WithLMHeadModel(config=config) - inputs = {'encoder_input_ids': input_ids, - 'decoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + inputs = { + "encoder_input_ids": input_ids, + "decoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } prediction_scores, decoder_output = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } self.parent.assertListEqual( - list(result["prediction_scores"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) - + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, input_mask, token_labels) = config_and_inputs - inputs_dict = {'encoder_input_ids': input_ids, - 'decoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + inputs_dict = { + "encoder_input_ids": input_ids, + "decoder_input_ids": input_ids, + "decoder_attention_mask": input_mask, + } return config, inputs_dict def setUp(self): @@ -161,9 +165,10 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - for model_name in ['t5-small']: + for model_name in ["t5-small"]: model = TFT5Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py index 8225c09275..20de598d05 100644 --- a/transformers/tests/modeling_tf_transfo_xl_test.py +++ b/transformers/tests/modeling_tf_transfo_xl_test.py @@ -19,7 +19,7 @@ from __future__ import print_function import unittest import random -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -27,9 +27,11 @@ from transformers import TransfoXLConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel, - TFTransfoXLLMHeadModel, - TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_transfo_xl import ( + TFTransfoXLModel, + TFTransfoXLLMHeadModel, + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + ) @require_tf @@ -41,27 +43,27 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): test_resize_embeddings = False class TFTransfoXLModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - mem_len=30, - clamp_len=15, - is_training=True, - use_labels=True, - vocab_size=99, - cutoffs=[10, 50, 80], - hidden_size=32, - d_embed=32, - num_attention_heads=4, - d_head=8, - d_inner=128, - div_val=2, - num_hidden_layers=5, - scope=None, - seed=1, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + mem_len=30, + clamp_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + d_embed=32, + num_attention_heads=4, + d_head=8, + d_inner=128, + div_val=2, + num_hidden_layers=5, + scope=None, + seed=1, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -101,7 +103,8 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): d_head=self.d_head, d_inner=self.d_inner, div_val=self.div_val, - n_layer=self.num_hidden_layers) + n_layer=self.num_hidden_layers, + ) return (config, input_ids_1, input_ids_2, lm_labels) @@ -114,8 +117,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): hidden_states_1, mems_1 = model(input_ids_1) - inputs = {'input_ids': input_ids_2, - 'mems': mems_1} + inputs = {"input_ids": input_ids_2, "mems": mems_1} hidden_states_2, mems_2 = model(inputs) @@ -127,33 +129,31 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): } self.parent.assertListEqual( - list(result["hidden_states_1"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["hidden_states_1"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( - list(result["hidden_states_2"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["hidden_states_2"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): model = TFTransfoXLLMHeadModel(config) lm_logits_1, mems_1 = model(input_ids_1) - inputs = {'input_ids': input_ids_1, - 'labels': lm_labels} + inputs = {"input_ids": input_ids_1, "labels": lm_labels} _, mems_1 = model(inputs) lm_logits_2, mems_2 = model([input_ids_2, mems_1]) - inputs = {'input_ids': input_ids_1, - 'mems': mems_1, - 'labels': lm_labels} + inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels} _, mems_2 = model(inputs) @@ -165,26 +165,27 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): } self.parent.assertListEqual( - list(result["lm_logits_1"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) self.parent.assertListEqual( - list(result["lm_logits_2"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids_1} + inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict - def setUp(self): self.model_tester = TFTransfoXLModelTest.TFTransfoXLModelTester(self) self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py index 065d355b45..9162bf2b38 100644 --- a/transformers/tests/modeling_tf_xlm_test.py +++ b/transformers/tests/modeling_tf_xlm_test.py @@ -22,13 +22,16 @@ from transformers import is_tf_available if is_tf_available(): import tensorflow as tf - from transformers import (XLMConfig, TFXLMModel, - TFXLMWithLMHeadModel, - TFXLMForSequenceClassification, - TFXLMForQuestionAnsweringSimple, - TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers import ( + XLMConfig, + TFXLMModel, + TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + ) -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -36,43 +39,44 @@ from .utils import CACHE_DIR, require_tf, slow @require_tf class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel, - TFXLMForSequenceClassification, - TFXLMForQuestionAnsweringSimple) if is_tf_available() else () - + all_model_classes = ( + (TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple) + if is_tf_available() + else () + ) class TFXLMModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_lengths=True, - use_token_type_ids=True, - use_labels=True, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=2, - vocab_size=99, - n_special=0, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - summary_type="last", - use_proj=True, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_lengths=True, + use_token_type_ids=True, + use_labels=True, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=2, + vocab_size=99, + n_special=0, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + summary_type="last", + use_proj=True, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -109,7 +113,9 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): input_lengths = None if self.use_input_lengths: - input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 # small variation of seq_length + input_lengths = ( + ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 + ) # small variation of seq_length token_type_ids = None if self.use_token_type_ids: @@ -124,30 +130,48 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) config = XLMConfig( - vocab_size=self.vocab_size, - n_special=self.n_special, - emb_dim=self.hidden_size, - n_layers=self.num_hidden_layers, - n_heads=self.num_attention_heads, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - gelu_activation=self.gelu_activation, - sinusoidal_embeddings=self.sinusoidal_embeddings, - asm=self.asm, - causal=self.causal, - n_langs=self.n_langs, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - summary_type=self.summary_type, - use_proj=self.use_proj) + vocab_size=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj, + ) - return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask + return ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ) - def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def create_and_check_xlm_model( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = TFXLMModel(config=config) - inputs = {'input_ids': input_ids, - 'lengths': input_lengths, - 'langs': token_type_ids} + inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} outputs = model(inputs) inputs = [input_ids, input_mask] @@ -157,16 +181,23 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): "sequence_output": sequence_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) - - def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def create_and_check_xlm_lm_head( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = TFXLMWithLMHeadModel(config) - inputs = {'input_ids': input_ids, - 'lengths': input_lengths, - 'langs': token_type_ids} + inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids} outputs = model(inputs) logits = outputs[0] @@ -176,15 +207,23 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): } self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) - - def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def create_and_check_xlm_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = TFXLMForQuestionAnsweringSimple(config) - inputs = {'input_ids': input_ids, - 'lengths': input_lengths} + inputs = {"input_ids": input_ids, "lengths": input_lengths} outputs = model(inputs) start_logits, end_logits = model(inputs) @@ -194,19 +233,23 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): "end_logits": end_logits.numpy(), } - self.parent.assertListEqual( - list(result["start_logits"].shape), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].shape), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) - - def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def create_and_check_xlm_sequence_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = TFXLMForSequenceClassification(config) - inputs = {'input_ids': input_ids, - 'lengths': input_lengths} + inputs = {"input_ids": input_ids, "lengths": input_lengths} (logits,) = model(inputs) @@ -214,16 +257,26 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.type_sequence_label_size]) - + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size]) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_lengths, - sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'langs': token_type_ids, 'lengths': input_lengths} + ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ) = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "langs": token_type_ids, + "lengths": input_lengths, + } return config, inputs_dict def setUp(self): diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py index 15fd917481..9a56384a0c 100644 --- a/transformers/tests/modeling_tf_xlnet_test.py +++ b/transformers/tests/modeling_tf_xlnet_test.py @@ -26,13 +26,16 @@ from transformers import XLNetConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel, - TFXLNetForSequenceClassification, - TFXLNetForTokenClassification, - TFXLNetForQuestionAnsweringSimple, - TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) + from transformers.modeling_tf_xlnet import ( + TFXLNetModel, + TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForTokenClassification, + TFXLNetForQuestionAnsweringSimple, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + ) -from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .modeling_tf_common_test import TFCommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_tf, slow @@ -40,37 +43,44 @@ from .utils import CACHE_DIR, require_tf, slow @require_tf class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel, - TFXLNetForSequenceClassification, - TFXLNetForTokenClassification, - TFXLNetForQuestionAnsweringSimple) if is_tf_available() else () + all_model_classes = ( + ( + TFXLNetModel, + TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForTokenClassification, + TFXLNetForQuestionAnsweringSimple, + ) + if is_tf_available() + else () + ) test_pruning = False class TFXLNetModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - mem_len=10, - clamp_len=-1, - reuse_len=15, - is_training=True, - use_labels=True, - vocab_size=99, - cutoffs=[10, 50, 80], - hidden_size=32, - num_attention_heads=4, - d_inner=128, - num_hidden_layers=5, - type_sequence_label_size=2, - untie_r=True, - bi_data=False, - same_length=False, - initializer_range=0.05, - seed=1, - type_vocab_size=2, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + mem_len=10, + clamp_len=-1, + reuse_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + num_attention_heads=4, + d_inner=128, + num_hidden_layers=5, + type_sequence_label_size=2, + untie_r=True, + bi_data=False, + same_length=False, + initializer_range=0.05, + seed=1, + type_vocab_size=2, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -131,22 +141,44 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): reuse_len=self.reuse_len, bi_data=self.bi_data, initializer_range=self.initializer_range, - num_labels=self.type_sequence_label_size) + num_labels=self.type_sequence_label_size, + ) - return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels) + return ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ) def set_seed(self): random.seed(self.seed) tf.random.set_seed(self.seed) - def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + def create_and_check_xlnet_base_model( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): model = TFXLNetModel(config) - inputs = {'input_ids': input_ids_1, - 'input_mask': input_mask, - 'token_type_ids': segment_ids} + inputs = {"input_ids": input_ids_1, "input_mask": input_mask, "token_type_ids": segment_ids} _, _ = model(inputs) @@ -165,30 +197,38 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): self.parent.assertEqual(len(no_mems_outputs), 1) self.parent.assertListEqual( - list(result["outputs"].shape), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["outputs"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) - def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + def create_and_check_xlnet_lm_head( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): model = TFXLNetLMHeadModel(config) - inputs_1 = {'input_ids': input_ids_1, - 'token_type_ids': segment_ids} + inputs_1 = {"input_ids": input_ids_1, "token_type_ids": segment_ids} all_logits_1, mems_1 = model(inputs_1) - inputs_2 = {'input_ids': input_ids_2, - 'mems': mems_1, - 'token_type_ids': segment_ids} + inputs_2 = {"input_ids": input_ids_2, "mems": mems_1, "token_type_ids": segment_ids} all_logits_2, mems_2 = model(inputs_2) - inputs_3 = {'input_ids': input_ids_q, - 'perm_mask': perm_mask, - 'target_mapping': target_mapping} + inputs_3 = {"input_ids": input_ids_q, "perm_mask": perm_mask, "target_mapping": target_mapping} logits, _ = model(inputs_3) @@ -200,26 +240,38 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): } self.parent.assertListEqual( - list(result["all_logits_1"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["all_logits_1"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) self.parent.assertListEqual( - list(result["all_logits_2"].shape), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["all_logits_2"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) - def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + def create_and_check_xlnet_qa( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): model = TFXLNetForQuestionAnsweringSimple(config) - inputs = {'input_ids': input_ids_1, - 'attention_mask': input_mask, - 'token_type_ids': segment_ids} + inputs = {"input_ids": input_ids_1, "attention_mask": input_mask, "token_type_ids": segment_ids} start_logits, end_logits, mems = model(inputs) result = { @@ -228,18 +280,27 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): "mems": [m.numpy() for m in mems], } - self.parent.assertListEqual( - list(result["start_logits"].shape), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].shape), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) - def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + def create_and_check_xlnet_sequence_classif( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): model = TFXLNetForSequenceClassification(config) logits, mems_1 = model(input_ids_1) @@ -249,42 +310,64 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): "logits": logits.numpy(), } - self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.type_sequence_label_size]) + self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size]) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) - def create_and_check_xlnet_for_token_classification(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): + def create_and_check_xlnet_for_token_classification( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ): config.num_labels = input_ids_1.shape[1] model = TFXLNetForTokenClassification(config) - inputs = {'input_ids': input_ids_1, - 'attention_mask': input_mask, - # 'token_type_ids': token_type_ids - } + inputs = { + "input_ids": input_ids_1, + "attention_mask": input_mask, + # 'token_type_ids': token_type_ids + } logits, mems_1 = model(inputs) result = { "mems_1": [mem.numpy() for mem in mems_1], "logits": logits.numpy(), } self.parent.assertListEqual( - list(result["logits"].shape), - [self.batch_size, self.seq_length, config.num_labels]) + list(result["logits"].shape), [self.batch_size, self.seq_length, config.num_labels] + ) self.parent.assertListEqual( list(list(mem.shape) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, - sequence_labels, is_impossible_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids_1} + ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict - def setUp(self): self.model_tester = TFXLNetModelTest.TFXLNetModelTester(self) self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py index acbe95fe4a..f04205d4e0 100644 --- a/transformers/tests/modeling_transfo_xl_test.py +++ b/transformers/tests/modeling_transfo_xl_test.py @@ -23,10 +23,10 @@ from transformers import is_torch_available if is_torch_available(): import torch - from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) + from transformers import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -40,27 +40,27 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): test_resize_embeddings = False class TransfoXLModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - mem_len=30, - clamp_len=15, - is_training=True, - use_labels=True, - vocab_size=99, - cutoffs=[10, 50, 80], - hidden_size=32, - d_embed=32, - num_attention_heads=4, - d_head=8, - d_inner=128, - div_val=2, - num_hidden_layers=5, - scope=None, - seed=1, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + mem_len=30, + clamp_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + d_embed=32, + num_attention_heads=4, + d_head=8, + d_inner=128, + div_val=2, + num_hidden_layers=5, + scope=None, + seed=1, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -100,7 +100,8 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): d_head=self.d_head, d_inner=self.d_inner, div_val=self.div_val, - n_layer=self.num_hidden_layers) + n_layer=self.num_hidden_layers, + ) return (config, input_ids_1, input_ids_2, lm_labels) @@ -125,18 +126,19 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): def check_transfo_xl_model_output(self, result): self.parent.assertListEqual( - list(result["hidden_states_1"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["hidden_states_1"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( - list(result["hidden_states_2"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["hidden_states_2"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) - + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels): model = TransfoXLLMHeadModel(config) @@ -159,33 +161,30 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): return outputs def check_transfo_xl_lm_head_output(self, result): + self.parent.assertListEqual(list(result["loss_1"].size()), [self.batch_size, self.seq_length]) self.parent.assertListEqual( - list(result["loss_1"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["lm_logits_1"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + self.parent.assertListEqual(list(result["loss_2"].size()), [self.batch_size, self.seq_length]) self.parent.assertListEqual( - list(result["loss_2"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["lm_logits_2"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["lm_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids_1} + inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict - def setUp(self): self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self) self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) diff --git a/transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py index fcc2f4699b..843693fd03 100644 --- a/transformers/tests/modeling_xlm_test.py +++ b/transformers/tests/modeling_xlm_test.py @@ -21,11 +21,17 @@ import unittest from transformers import is_torch_available if is_torch_available(): - from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, - XLMForSequenceClassification, XLMForQuestionAnsweringSimple) + from transformers import ( + XLMConfig, + XLMModel, + XLMWithLMHeadModel, + XLMForQuestionAnswering, + XLMForSequenceClassification, + XLMForQuestionAnsweringSimple, + ) from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -33,42 +39,50 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch class XLMModelTest(CommonTestCases.CommonModelTester): - all_model_classes = (XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, - XLMForSequenceClassification, XLMForQuestionAnsweringSimple) if is_torch_available() else () - + all_model_classes = ( + ( + XLMModel, + XLMWithLMHeadModel, + XLMForQuestionAnswering, + XLMForSequenceClassification, + XLMForQuestionAnsweringSimple, + ) + if is_torch_available() + else () + ) class XLMModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_lengths=True, - use_token_type_ids=True, - use_labels=True, - gelu_activation=True, - sinusoidal_embeddings=False, - causal=False, - asm=False, - n_langs=2, - vocab_size=99, - n_special=0, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - summary_type="last", - use_proj=True, - scope=None, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_lengths=True, + use_token_type_ids=True, + use_labels=True, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=2, + vocab_size=99, + n_special=0, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + summary_type="last", + use_proj=True, + scope=None, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -105,7 +119,9 @@ class XLMModelTest(CommonTestCases.CommonModelTester): input_lengths = None if self.use_input_lengths: - input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 # small variation of seq_length + input_lengths = ( + ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 + ) # small variation of seq_length token_type_ids = None if self.use_token_type_ids: @@ -120,31 +136,49 @@ class XLMModelTest(CommonTestCases.CommonModelTester): is_impossible_labels = ids_tensor([self.batch_size], 2).float() config = XLMConfig( - vocab_size=self.vocab_size, - n_special=self.n_special, - emb_dim=self.hidden_size, - n_layers=self.num_hidden_layers, - n_heads=self.num_attention_heads, - dropout=self.hidden_dropout_prob, - attention_dropout=self.attention_probs_dropout_prob, - gelu_activation=self.gelu_activation, - sinusoidal_embeddings=self.sinusoidal_embeddings, - asm=self.asm, - causal=self.causal, - n_langs=self.n_langs, - max_position_embeddings=self.max_position_embeddings, - initializer_range=self.initializer_range, - summary_type=self.summary_type, - use_proj=self.use_proj) + vocab_size=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj, + ) - return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask + return ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ) def check_loss_output(self, result): - self.parent.assertListEqual( - list(result["loss"].size()), - []) + self.parent.assertListEqual(list(result["loss"].size()), []) - def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def create_and_check_xlm_model( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = XLMModel(config=config) model.to(torch_device) model.eval() @@ -156,11 +190,20 @@ class XLMModelTest(CommonTestCases.CommonModelTester): "sequence_output": sequence_output, } self.parent.assertListEqual( - list(result["sequence_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) - - def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def create_and_check_xlm_lm_head( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = XLMWithLMHeadModel(config) model.to(torch_device) model.eval() @@ -172,23 +215,29 @@ class XLMModelTest(CommonTestCases.CommonModelTester): "logits": logits, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) - - def create_and_check_xlm_simple_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def create_and_check_xlm_simple_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = XLMForQuestionAnsweringSimple(config) model.to(torch_device) model.eval() outputs = model(input_ids) - outputs = model(input_ids, start_positions=sequence_labels, - end_positions=sequence_labels) + outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) loss, start_logits, end_logits = outputs result = { @@ -196,16 +245,21 @@ class XLMModelTest(CommonTestCases.CommonModelTester): "start_logits": start_logits, "end_logits": end_logits, } - self.parent.assertListEqual( - list(result["start_logits"].size()), - [self.batch_size, self.seq_length]) - self.parent.assertListEqual( - list(result["end_logits"].size()), - [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["start_logits"].size()), [self.batch_size, self.seq_length]) + self.parent.assertListEqual(list(result["end_logits"].size()), [self.batch_size, self.seq_length]) self.check_loss_output(result) - - def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def create_and_check_xlm_qa( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = XLMForQuestionAnswering(config) model.to(torch_device) model.eval() @@ -213,21 +267,26 @@ class XLMModelTest(CommonTestCases.CommonModelTester): outputs = model(input_ids) start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = outputs - outputs = model(input_ids, start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels, - p_mask=input_mask) + outputs = model( + input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + p_mask=input_mask, + ) - outputs = model(input_ids, start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels) + outputs = model( + input_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + ) (total_loss,) = outputs - outputs = model(input_ids, start_positions=sequence_labels, - end_positions=sequence_labels) + outputs = model(input_ids, start_positions=sequence_labels, end_positions=sequence_labels) (total_loss,) = outputs @@ -240,27 +299,34 @@ class XLMModelTest(CommonTestCases.CommonModelTester): "cls_logits": cls_logits, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) + list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top] + ) self.parent.assertListEqual( - list(result["start_top_log_probs"].size()), - [self.batch_size, model.config.start_n_top]) - self.parent.assertListEqual( - list(result["start_top_index"].size()), - [self.batch_size, model.config.start_n_top]) + list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top] + ) self.parent.assertListEqual( list(result["end_top_log_probs"].size()), - [self.batch_size, model.config.start_n_top * model.config.end_n_top]) + [self.batch_size, model.config.start_n_top * model.config.end_n_top], + ) self.parent.assertListEqual( list(result["end_top_index"].size()), - [self.batch_size, model.config.start_n_top * model.config.end_n_top]) - self.parent.assertListEqual( - list(result["cls_logits"].size()), - [self.batch_size]) + [self.batch_size, model.config.start_n_top * model.config.end_n_top], + ) + self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size]) - - def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + def create_and_check_xlm_sequence_classif( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ): model = XLMForSequenceClassification(config) model.to(torch_device) model.eval() @@ -273,19 +339,24 @@ class XLMModelTest(CommonTestCases.CommonModelTester): "logits": logits, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.type_sequence_label_size]) - + list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size] + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_lengths, - sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths} + ( + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + input_mask, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths} return config, inputs_dict def setUp(self): diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py index 6d218d6ef4..487756a5c8 100644 --- a/transformers/tests/modeling_xlnet_test.py +++ b/transformers/tests/modeling_xlnet_test.py @@ -26,11 +26,17 @@ from transformers import is_torch_available if is_torch_available(): import torch - from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, - XLNetForTokenClassification, XLNetForQuestionAnswering) + from transformers import ( + XLNetConfig, + XLNetModel, + XLNetLMHeadModel, + XLNetForSequenceClassification, + XLNetForTokenClassification, + XLNetForQuestionAnswering, + ) from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import CommonTestCases, ids_tensor from .configuration_common_test import ConfigTester from .utils import CACHE_DIR, require_torch, slow, torch_device @@ -38,35 +44,44 @@ from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch class XLNetModelTest(CommonTestCases.CommonModelTester): - all_model_classes=(XLNetModel, XLNetLMHeadModel, XLNetForTokenClassification, - XLNetForSequenceClassification, XLNetForQuestionAnswering) if is_torch_available() else () + all_model_classes = ( + ( + XLNetModel, + XLNetLMHeadModel, + XLNetForTokenClassification, + XLNetForSequenceClassification, + XLNetForQuestionAnswering, + ) + if is_torch_available() + else () + ) test_pruning = False class XLNetModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - mem_len=10, - clamp_len=-1, - reuse_len=15, - is_training=True, - use_labels=True, - vocab_size=99, - cutoffs=[10, 50, 80], - hidden_size=32, - num_attention_heads=4, - d_inner=128, - num_hidden_layers=5, - type_sequence_label_size=2, - untie_r=True, - bi_data=False, - same_length=False, - initializer_range=0.05, - seed=1, - type_vocab_size=2, - ): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + mem_len=10, + clamp_len=-1, + reuse_len=15, + is_training=True, + use_labels=True, + vocab_size=99, + cutoffs=[10, 50, 80], + hidden_size=32, + num_attention_heads=4, + d_inner=128, + num_hidden_layers=5, + type_sequence_label_size=2, + untie_r=True, + bi_data=False, + same_length=False, + initializer_range=0.05, + seed=1, + type_vocab_size=2, + ): self.parent = parent self.batch_size = batch_size self.seq_length = seq_length @@ -97,9 +112,13 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float() input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size) - perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device) + perm_mask = torch.zeros( + self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device + ) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token - target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device) + target_mapping = torch.zeros( + self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device + ) target_mapping[:, 0, -1] = 1.0 # predict last token sequence_labels = None @@ -125,17 +144,43 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): reuse_len=self.reuse_len, bi_data=self.bi_data, initializer_range=self.initializer_range, - num_labels=self.type_sequence_label_size) + num_labels=self.type_sequence_label_size, + ) - return (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels) + return ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ) def set_seed(self): random.seed(self.seed) torch.manual_seed(self.seed) - def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + def create_and_check_xlnet_base_model( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetModel(config) model.to(torch_device) model.eval() @@ -158,14 +203,28 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): self.parent.assertEqual(len(no_mems_outputs), 1) self.parent.assertListEqual( - list(result["outputs"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + list(result["outputs"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) - def create_and_check_xlnet_base_model_with_att_output(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + def create_and_check_xlnet_base_model_with_att_output( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetModel(config) model.to(torch_device) model.eval() @@ -177,15 +236,30 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): self.parent.assertEqual(len(attentions[0]), 2) self.parent.assertTrue(attentions[0][0].shape, attentions[0][0].shape) - def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + def create_and_check_xlnet_lm_head( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetLMHeadModel(config) model.to(torch_device) model.eval() loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels) - loss_2, all_logits_2, mems_2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1) + loss_2, all_logits_2, mems_2 = model( + input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1 + ) logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping) @@ -198,28 +272,39 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): "all_logits_2": all_logits_2, } + self.parent.assertListEqual(list(result["loss_1"].size()), []) self.parent.assertListEqual( - list(result["loss_1"].size()), - []) - self.parent.assertListEqual( - list(result["all_logits_1"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["all_logits_1"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) + self.parent.assertListEqual(list(result["loss_2"].size()), []) self.parent.assertListEqual( - list(result["loss_2"].size()), - []) - self.parent.assertListEqual( - list(result["all_logits_2"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + list(result["all_logits_2"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_2"]), - [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) - def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + def create_and_check_xlnet_qa( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetForQuestionAnswering(config) model.to(torch_device) model.eval() @@ -227,21 +312,26 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): outputs = model(input_ids_1) start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs - outputs = model(input_ids_1, start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels, - p_mask=input_mask) + outputs = model( + input_ids_1, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + p_mask=input_mask, + ) - outputs = model(input_ids_1, start_positions=sequence_labels, - end_positions=sequence_labels, - cls_index=sequence_labels, - is_impossible=is_impossible_labels) + outputs = model( + input_ids_1, + start_positions=sequence_labels, + end_positions=sequence_labels, + cls_index=sequence_labels, + is_impossible=is_impossible_labels, + ) total_loss, mems = outputs - outputs = model(input_ids_1, start_positions=sequence_labels, - end_positions=sequence_labels) + outputs = model(input_ids_1, start_positions=sequence_labels, end_positions=sequence_labels) total_loss, mems = outputs @@ -255,30 +345,42 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): "mems": mems, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) + list(result["start_top_log_probs"].size()), [self.batch_size, model.config.start_n_top] + ) self.parent.assertListEqual( - list(result["start_top_log_probs"].size()), - [self.batch_size, model.config.start_n_top]) - self.parent.assertListEqual( - list(result["start_top_index"].size()), - [self.batch_size, model.config.start_n_top]) + list(result["start_top_index"].size()), [self.batch_size, model.config.start_n_top] + ) self.parent.assertListEqual( list(result["end_top_log_probs"].size()), - [self.batch_size, model.config.start_n_top * model.config.end_n_top]) + [self.batch_size, model.config.start_n_top * model.config.end_n_top], + ) self.parent.assertListEqual( list(result["end_top_index"].size()), - [self.batch_size, model.config.start_n_top * model.config.end_n_top]) - self.parent.assertListEqual( - list(result["cls_logits"].size()), - [self.batch_size]) + [self.batch_size, model.config.start_n_top * model.config.end_n_top], + ) + self.parent.assertListEqual(list(result["cls_logits"].size()), [self.batch_size]) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) - def create_and_check_xlnet_token_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + def create_and_check_xlnet_token_classif( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetForTokenClassification(config) model.to(torch_device) model.eval() @@ -292,26 +394,48 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): "logits": logits, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.seq_length, self.type_sequence_label_size]) + list(result["logits"].size()), [self.batch_size, self.seq_length, self.type_sequence_label_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, - sequence_labels, is_impossible_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids_1} + ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict - def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels): + def create_and_check_xlnet_sequence_classif( + self, + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ): model = XLNetForSequenceClassification(config) model.to(torch_device) model.eval() @@ -325,25 +449,34 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): "logits": logits, } + self.parent.assertListEqual(list(result["loss"].size()), []) self.parent.assertListEqual( - list(result["loss"].size()), - []) - self.parent.assertListEqual( - list(result["logits"].size()), - [self.batch_size, self.type_sequence_label_size]) + list(result["logits"].size()), [self.batch_size, self.type_sequence_label_size] + ) self.parent.assertListEqual( list(list(mem.size()) for mem in result["mems_1"]), - [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers, + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, - target_mapping, segment_ids, lm_labels, - sequence_labels, is_impossible_labels, token_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids_1} + ( + config, + input_ids_1, + input_ids_2, + input_ids_q, + perm_mask, + input_mask, + target_mapping, + segment_ids, + lm_labels, + sequence_labels, + is_impossible_labels, + token_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids_1} return config, inputs_dict - def setUp(self): self.model_tester = XLNetModelTest.XLNetModelTester(self) self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) diff --git a/transformers/tests/optimization_test.py b/transformers/tests/optimization_test.py index cc10ad5908..0addcde1d8 100644 --- a/transformers/tests/optimization_test.py +++ b/transformers/tests/optimization_test.py @@ -24,12 +24,14 @@ from transformers import is_torch_available if is_torch_available(): import torch - from transformers import (AdamW, - get_constant_schedule, - get_constant_schedule_with_warmup, - get_cosine_schedule_with_warmup, - get_cosine_with_hard_restarts_schedule_with_warmup, - get_linear_schedule_with_warmup) + from transformers import ( + AdamW, + get_constant_schedule, + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_linear_schedule_with_warmup, + ) from .tokenization_tests_commons import TemporaryDirectory from .utils import require_torch @@ -42,6 +44,7 @@ def unwrap_schedule(scheduler, num_steps=10): lrs.append(scheduler.get_lr()) return lrs + def unwrap_and_save_reload_schedule(scheduler, num_steps=10): lrs = [] for step in range(num_steps): @@ -49,16 +52,16 @@ def unwrap_and_save_reload_schedule(scheduler, num_steps=10): lrs.append(scheduler.get_lr()) if step == num_steps // 2: with TemporaryDirectory() as tmpdirname: - file_name = os.path.join(tmpdirname, 'schedule.bin') + file_name = os.path.join(tmpdirname, "schedule.bin") torch.save(scheduler.state_dict(), file_name) state_dict = torch.load(file_name) scheduler.load_state_dict(state_dict) return lrs + @require_torch class OptimizationTest(unittest.TestCase): - def assertListAlmostEqual(self, list1, list2, tol): self.assertEqual(len(list1), len(list2)) for a, b in zip(list1, list2): @@ -74,7 +77,7 @@ class OptimizationTest(unittest.TestCase): loss = criterion(w, target) loss.backward() optimizer.step() - w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. + w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. w.grad.zero_() self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) @@ -82,7 +85,7 @@ class OptimizationTest(unittest.TestCase): @require_torch class ScheduleInitTest(unittest.TestCase): m = torch.nn.Linear(50, 50) if is_torch_available() else None - optimizer = AdamW(m.parameters(), lr=10.) if is_torch_available() else None + optimizer = AdamW(m.parameters(), lr=10.0) if is_torch_available() else None num_steps = 10 def assertListAlmostEqual(self, list1, list2, tol): @@ -93,7 +96,7 @@ class ScheduleInitTest(unittest.TestCase): def test_constant_scheduler(self): scheduler = get_constant_schedule(self.optimizer) lrs = unwrap_schedule(scheduler, self.num_steps) - expected_learning_rates = [10.] * self.num_steps + expected_learning_rates = [10.0] * self.num_steps self.assertEqual(len(lrs[0]), 1) self.assertListEqual([l[0] for l in lrs], expected_learning_rates) @@ -135,13 +138,17 @@ class ScheduleInitTest(unittest.TestCase): self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) def test_warmup_cosine_hard_restart_scheduler(self): - scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10) + scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( + self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10 + ) lrs = unwrap_schedule(scheduler, self.num_steps) expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0] self.assertEqual(len(lrs[0]), 1) self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) - scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10) + scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( + self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10 + ) lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) diff --git a/transformers/tests/optimization_tf_test.py b/transformers/tests/optimization_tf_test.py index 515d12a158..e88ee971e4 100644 --- a/transformers/tests/optimization_tf_test.py +++ b/transformers/tests/optimization_tf_test.py @@ -12,7 +12,7 @@ if is_tf_available(): import tensorflow as tf from tensorflow.python.eager import context from tensorflow.python.framework import ops - from transformers import (create_optimizer, GradientAccumulator) + from transformers import create_optimizer, GradientAccumulator @require_tf @@ -21,7 +21,7 @@ class OptimizationFTest(unittest.TestCase): self.assertEqual(len(list1), len(list2)) for a, b in zip(list1, list2): self.assertAlmostEqual(a, b, delta=tol) - + def testGradientAccumulator(self): accumulator = GradientAccumulator() accumulator([tf.constant([1.0, 2.0])]) @@ -42,8 +42,8 @@ class OptimizationFTest(unittest.TestCase): physical_devices = tf.config.experimental.list_physical_devices("CPU") tf.config.experimental.set_virtual_device_configuration( physical_devices[0], - [tf.config.experimental.VirtualDeviceConfiguration(), - tf.config.experimental.VirtualDeviceConfiguration()]) + [tf.config.experimental.VirtualDeviceConfiguration(), tf.config.experimental.VirtualDeviceConfiguration()], + ) devices = tf.config.experimental.list_logical_devices(device_type="CPU") strategy = tf.distribute.MirroredStrategy(devices=[device.name for device in devices]) @@ -87,4 +87,4 @@ class OptimizationFTest(unittest.TestCase): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/transformers/tests/pipelines_test.py b/transformers/tests/pipelines_test.py index 08a1507770..3c258594d1 100644 --- a/transformers/tests/pipelines_test.py +++ b/transformers/tests/pipelines_test.py @@ -6,58 +6,58 @@ from transformers import pipeline from transformers.tests.utils import require_tf, require_torch QA_FINETUNED_MODELS = { - ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None), - ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None), - ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None) + ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None), + ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None), + ("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None), } TF_QA_FINETUNED_MODELS = { - ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None), - ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None), - ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None) + ("bert-base-uncased", "bert-large-uncased-whole-word-masking-finetuned-squad", None), + ("bert-base-cased", "bert-large-cased-whole-word-masking-finetuned-squad", None), + ("bert-base-uncased", "distilbert-base-uncased-distilled-squad", None), } TF_NER_FINETUNED_MODELS = { ( - 'bert-base-cased', - 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5', - 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json' + "bert-base-cased", + "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5", + "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json", ) } NER_FINETUNED_MODELS = { ( - 'bert-base-cased', - 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin', - 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json' + "bert-base-cased", + "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin", + "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json", ) } FEATURE_EXTRACT_FINETUNED_MODELS = { - ('bert-base-cased', 'bert-base-cased', None), - # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 - ('distilbert-base-uncased', 'distilbert-base-uncased', None) + ("bert-base-cased", "bert-base-cased", None), + # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 + ("distilbert-base-uncased", "distilbert-base-uncased", None), } TF_FEATURE_EXTRACT_FINETUNED_MODELS = { - ('bert-base-cased', 'bert-base-cased', None), - # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 - ('distilbert-base-uncased', 'distilbert-base-uncased', None) + ("bert-base-cased", "bert-base-cased", None), + # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 + ("distilbert-base-uncased", "distilbert-base-uncased", None), } TF_TEXT_CLASSIF_FINETUNED_MODELS = { ( - 'bert-base-uncased', - 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5', - 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json' + "bert-base-uncased", + "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5", + "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", ) } TEXT_CLASSIF_FINETUNED_MODELS = { ( - 'bert-base-uncased', - 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin', - 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json' + "bert-base-uncased", + "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin", + "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json", ) } @@ -91,54 +91,54 @@ class MonoColumnInputTestCase(unittest.TestCase): @require_torch def test_ner(self): - mandatory_keys = {'entity', 'word', 'score'} - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + mandatory_keys = {"entity", "word", "score"} + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in NER_FINETUNED_MODELS: - nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="ner", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @require_tf def test_tf_ner(self): - mandatory_keys = {'entity', 'word', 'score'} - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + mandatory_keys = {"entity", "word", "score"} + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in TF_NER_FINETUNED_MODELS: - nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="ner", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @require_torch def test_sentiment_analysis(self): - mandatory_keys = {'label'} - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + mandatory_keys = {"label"} + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS: - nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @require_tf def test_tf_sentiment_analysis(self): - mandatory_keys = {'label'} - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + mandatory_keys = {"label"} + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS: - nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @require_torch def test_features_extraction(self): - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS: - nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) @require_tf def test_tf_features_extraction(self): - valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] invalid_inputs = [None] for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS: - nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) @@ -165,46 +165,46 @@ class MultiColumnInputTestCase(unittest.TestCase): @require_torch def test_question_answering(self): - mandatory_output_keys = {'score', 'answer', 'start', 'end'} + mandatory_output_keys = {"score", "answer", "start", "end"} valid_samples = [ - {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'}, + {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, { - 'question': 'In what field is HuggingFace working ?', - 'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.' - } + "question": "In what field is HuggingFace working ?", + "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + }, ] invalid_samples = [ - {'question': '', 'context': 'This is a test to try empty question edge case'}, - {'question': None, 'context': 'This is a test to try empty question edge case'}, - {'question': 'What is does with empty context ?', 'context': ''}, - {'question': 'What is does with empty context ?', 'context': None}, + {"question": "", "context": "This is a test to try empty question edge case"}, + {"question": None, "context": "This is a test to try empty question edge case"}, + {"question": "What is does with empty context ?", "context": ""}, + {"question": "What is does with empty context ?", "context": None}, ] for tokenizer, model, config in QA_FINETUNED_MODELS: - nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="question-answering", model=model, config=config, tokenizer=tokenizer) self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) @require_tf def test_tf_question_answering(self): - mandatory_output_keys = {'score', 'answer', 'start', 'end'} + mandatory_output_keys = {"score", "answer", "start", "end"} valid_samples = [ - {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'}, + {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, { - 'question': 'In what field is HuggingFace working ?', - 'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.' - } + "question": "In what field is HuggingFace working ?", + "context": "HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.", + }, ] invalid_samples = [ - {'question': '', 'context': 'This is a test to try empty question edge case'}, - {'question': None, 'context': 'This is a test to try empty question edge case'}, - {'question': 'What is does with empty context ?', 'context': ''}, - {'question': 'What is does with empty context ?', 'context': None}, + {"question": "", "context": "This is a test to try empty question edge case"}, + {"question": None, "context": "This is a test to try empty question edge case"}, + {"question": "What is does with empty context ?", "context": ""}, + {"question": "What is does with empty context ?", "context": None}, ] for tokenizer, model, config in TF_QA_FINETUNED_MODELS: - nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) + nlp = pipeline(task="question-answering", model=model, config=config, tokenizer=tokenizer) self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_albert_test.py b/transformers/tests/tokenization_albert_test.py index 59eb3bceb0..7d7e793b54 100644 --- a/transformers/tests/tokenization_albert_test.py +++ b/transformers/tests/tokenization_albert_test.py @@ -17,12 +17,12 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -from transformers.tokenization_albert import (AlbertTokenizer, SPIECE_UNDERLINE) +from transformers.tokenization_albert import AlbertTokenizer, SPIECE_UNDERLINE from .tokenization_tests_commons import CommonTestCases -SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), - 'fixtures/spiece.model') +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model") + class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester): @@ -39,27 +39,30 @@ class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester): return AlbertTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"this is a test" - output_text = u"this is a test" + input_text = "this is a test" + output_text = "this is a test" return input_text, output_text - def test_full_tokenizer(self): tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokens = tokenizer.tokenize(u'This is a test') - self.assertListEqual(tokens, [u'▁this', u'▁is', u'▁a', u'▁test']) + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289]) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289]) - - tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [u'▁i', u'▁was', u'▁born', u'▁in', u'▁9', u'2000', u',', u'▁and', u'▁this', u'▁is', u'▁fal', u's', u'é', u'.']) + tokens, ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."] + ) ids = tokenizer.convert_tokens_to_ids(tokens) self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9]) back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual(back_tokens, ['▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fal', 's', '', '.']) + self.assertListEqual( + back_tokens, + ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "."], + ) def test_sequence_builders(self): tokenizer = AlbertTokenizer(SAMPLE_VOCAB) @@ -71,8 +74,10 @@ class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester): encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] - assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [tokenizer.sep_token_id] + assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ + tokenizer.sep_token_id + ] -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py index 0a894cac04..7d77bf5b23 100644 --- a/transformers/tests/tokenization_auto_test.py +++ b/transformers/tests/tokenization_auto_test.py @@ -48,5 +48,6 @@ class AutoTokenizerTest(unittest.TestCase): self.assertIsInstance(tokenizer, BertTokenizer) self.assertEqual(len(tokenizer), 12) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py index 545193c7cc..02eb8c0a66 100644 --- a/transformers/tests/tokenization_bert_japanese_test.py +++ b/transformers/tests/tokenization_bert_japanese_test.py @@ -19,9 +19,12 @@ import unittest from io import open from transformers.tokenization_bert import WordpieceTokenizer -from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer, - MecabTokenizer, CharacterTokenizer, - VOCAB_FILES_NAMES) +from transformers.tokenization_bert_japanese import ( + BertJapaneseTokenizer, + MecabTokenizer, + CharacterTokenizer, + VOCAB_FILES_NAMES, +) from .tokenization_tests_commons import CommonTestCases from .utils import slow, custom_tokenizers @@ -35,9 +38,24 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester): def setUp(self): super(BertJapaneseTokenizationTest, self).setUp() - vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", - u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは", - u"世界", u"##世界", u"、", u"##、", u"。", u"##。"] + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "こんにちは", + "こん", + "にちは", + "ばんは", + "##こん", + "##にちは", + "##ばんは", + "世界", + "##世界", + "、", + "##、", + "。", + "##。", + ] self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: @@ -47,70 +65,63 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester): return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"こんにちは、世界。 \nこんばんは、世界。" - output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。" + input_text = "こんにちは、世界。 \nこんばんは、世界。" + output_text = "こんにちは 、 世界 。 こんばんは 、 世界 。" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。") - self.assertListEqual(tokens, - [u"こんにちは", u"、", u"世界", u"。", - u"こん", u"##ばんは", u"、", u"世界", "。"]) - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), - [3, 12, 10, 14, 4, 9, 12, 10, 14]) + tokens = tokenizer.tokenize("こんにちは、世界。\nこんばんは、世界。") + self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14]) def test_mecab_tokenizer(self): tokenizer = MecabTokenizer() self.assertListEqual( - tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "), - [u"アップルストア", u"で", u"iPhone", u"8", u"が", - u"発売", u"さ", u"れ", u"た", u"。"]) + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"], + ) def test_mecab_tokenizer_lower(self): tokenizer = MecabTokenizer(do_lower_case=True) self.assertListEqual( - tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "), - [u"アップルストア", u"で", u"iphone", u"8", u"が", - u"発売", u"さ", u"れ", u"た", u"。"]) + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iphone", "8", "が", "発売", "さ", "れ", "た", "。"], + ) def test_mecab_tokenizer_no_normalize(self): tokenizer = MecabTokenizer(normalize_text=False) self.assertListEqual( - tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "), - [u"アップルストア", u"で", u"iPhone", u"8", u"が", - u"発売", u"さ", u"れ", u"た", u" ", u"。"]) + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"], + ) def test_wordpiece_tokenizer(self): - vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", - u"こんにちは", u"こん", u"にちは" u"ばんは", u"##こん", u"##にちは", u"##ばんは"] + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こんにちは", "こん", "にちは" "ばんは", "##こん", "##にちは", "##ばんは"] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i - tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]") + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") - self.assertListEqual(tokenizer.tokenize(u""), []) + self.assertListEqual(tokenizer.tokenize(""), []) - self.assertListEqual(tokenizer.tokenize(u"こんにちは"), - [u"こんにちは"]) + self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こんにちは"]) - self.assertListEqual(tokenizer.tokenize(u"こんばんは"), - [u"こん", u"##ばんは"]) + self.assertListEqual(tokenizer.tokenize("こんばんは"), ["こん", "##ばんは"]) - self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"), - [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"]) + self.assertListEqual(tokenizer.tokenize("こんばんは こんばんにちは こんにちは"), ["こん", "##ばんは", "[UNK]", "こんにちは"]) @slow def test_sequence_builders(self): tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese") - text = tokenizer.encode(u"ありがとう。", add_special_tokens=False) - text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False) + text = tokenizer.encode("ありがとう。", add_special_tokens=False) + text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) @@ -127,58 +138,51 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste def setUp(self): super(BertJapaneseCharacterTokenizationTest, self).setUp() - vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", - u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"] + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"] self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_tokenizer(self, **kwargs): - return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, - subword_tokenizer_type="character", - **kwargs) + return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs) def get_input_output_texts(self): - input_text = u"こんにちは、世界。 \nこんばんは、世界。" - output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。" + input_text = "こんにちは、世界。 \nこんばんは、世界。" + output_text = "こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。" return input_text, output_text def test_full_tokenizer(self): - tokenizer = self.tokenizer_class(self.vocab_file, - subword_tokenizer_type="character") + tokenizer = self.tokenizer_class(self.vocab_file, subword_tokenizer_type="character") - tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。") - self.assertListEqual(tokens, - [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。", - u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"]) - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), - [3, 4, 5, 6, 7, 11, 9, 10, 12, - 3, 4, 8, 4, 7, 11, 9, 10, 12]) + tokens = tokenizer.tokenize("こんにちは、世界。 \nこんばんは、世界。") + self.assertListEqual( + tokens, ["こ", "ん", "に", "ち", "は", "、", "世", "界", "。", "こ", "ん", "ば", "ん", "は", "、", "世", "界", "。"] + ) + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), [3, 4, 5, 6, 7, 11, 9, 10, 12, 3, 4, 8, 4, 7, 11, 9, 10, 12] + ) def test_character_tokenizer(self): - vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", - u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界"u"、", u"。"] + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界" "、", "。"] vocab = {} for (i, token) in enumerate(vocab_tokens): vocab[token] = i - tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]") + tokenizer = CharacterTokenizer(vocab=vocab, unk_token="[UNK]") - self.assertListEqual(tokenizer.tokenize(u""), []) + self.assertListEqual(tokenizer.tokenize(""), []) - self.assertListEqual(tokenizer.tokenize(u"こんにちは"), - [u"こ", u"ん", u"に", u"ち", u"は"]) + self.assertListEqual(tokenizer.tokenize("こんにちは"), ["こ", "ん", "に", "ち", "は"]) - self.assertListEqual(tokenizer.tokenize(u"こんにちほ"), - [u"こ", u"ん", u"に", u"ち", u"[UNK]"]) + self.assertListEqual(tokenizer.tokenize("こんにちほ"), ["こ", "ん", "に", "ち", "[UNK]"]) @slow def test_sequence_builders(self): tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char") - text = tokenizer.encode(u"ありがとう。", add_special_tokens=False) - text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False) + text = tokenizer.encode("ありがとう。", add_special_tokens=False) + text_2 = tokenizer.encode("どういたしまして。", add_special_tokens=False) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) @@ -186,6 +190,3 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste # 2 is for "[CLS]", 3 is for "[SEP]" assert encoded_sentence == [2] + text + [3] assert encoded_pair == [2] + text + [3] + text_2 + [3] - - - diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py index c503ea5e1e..bf023761a6 100644 --- a/transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -18,15 +18,20 @@ import os import unittest from io import open -from transformers.tokenization_bert import (BasicTokenizer, - BertTokenizer, - WordpieceTokenizer, - _is_control, _is_punctuation, - _is_whitespace, VOCAB_FILES_NAMES) +from transformers.tokenization_bert import ( + BasicTokenizer, + BertTokenizer, + WordpieceTokenizer, + _is_control, + _is_punctuation, + _is_whitespace, + VOCAB_FILES_NAMES, +) from .tokenization_tests_commons import CommonTestCases from .utils import slow + class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = BertTokenizer @@ -35,55 +40,61 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): super(BertTokenizationTest, self).setUp() vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing", ",", "low", "lowest", + "[UNK]", + "[CLS]", + "[SEP]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_tokenizer(self, **kwargs): return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"UNwant\u00E9d,running" - output_text = u"unwanted, running" + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00E9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) def test_chinese(self): tokenizer = BasicTokenizer() - self.assertListEqual( - tokenizer.tokenize(u"ah\u535A\u63A8zz"), - [u"ah", u"\u535A", u"\u63A8", u"zz"]) + self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) def test_basic_tokenizer_lower(self): tokenizer = BasicTokenizer(do_lower_case=True) self.assertListEqual( - tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), - ["hello", "!", "how", "are", "you", "?"]) - self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) def test_basic_tokenizer_no_lower(self): tokenizer = BasicTokenizer(do_lower_case=False) self.assertListEqual( - tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), - ["HeLLo", "!", "how", "Are", "yoU", "?"]) + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) def test_wordpiece_tokenizer(self): - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing" - ] + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] vocab = {} for (i, token) in enumerate(vocab_tokens): @@ -92,39 +103,36 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): self.assertListEqual(tokenizer.tokenize(""), []) - self.assertListEqual( - tokenizer.tokenize("unwanted running"), - ["un", "##want", "##ed", "runn", "##ing"]) + self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) - self.assertListEqual( - tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) def test_is_whitespace(self): - self.assertTrue(_is_whitespace(u" ")) - self.assertTrue(_is_whitespace(u"\t")) - self.assertTrue(_is_whitespace(u"\r")) - self.assertTrue(_is_whitespace(u"\n")) - self.assertTrue(_is_whitespace(u"\u00A0")) + self.assertTrue(_is_whitespace(" ")) + self.assertTrue(_is_whitespace("\t")) + self.assertTrue(_is_whitespace("\r")) + self.assertTrue(_is_whitespace("\n")) + self.assertTrue(_is_whitespace("\u00A0")) - self.assertFalse(_is_whitespace(u"A")) - self.assertFalse(_is_whitespace(u"-")) + self.assertFalse(_is_whitespace("A")) + self.assertFalse(_is_whitespace("-")) def test_is_control(self): - self.assertTrue(_is_control(u"\u0005")) + self.assertTrue(_is_control("\u0005")) - self.assertFalse(_is_control(u"A")) - self.assertFalse(_is_control(u" ")) - self.assertFalse(_is_control(u"\t")) - self.assertFalse(_is_control(u"\r")) + self.assertFalse(_is_control("A")) + self.assertFalse(_is_control(" ")) + self.assertFalse(_is_control("\t")) + self.assertFalse(_is_control("\r")) def test_is_punctuation(self): - self.assertTrue(_is_punctuation(u"-")) - self.assertTrue(_is_punctuation(u"$")) - self.assertTrue(_is_punctuation(u"`")) - self.assertTrue(_is_punctuation(u".")) + self.assertTrue(_is_punctuation("-")) + self.assertTrue(_is_punctuation("$")) + self.assertTrue(_is_punctuation("`")) + self.assertTrue(_is_punctuation(".")) - self.assertFalse(_is_punctuation(u"A")) - self.assertFalse(_is_punctuation(u" ")) + self.assertFalse(_is_punctuation("A")) + self.assertFalse(_is_punctuation(" ")) @slow def test_sequence_builders(self): @@ -140,5 +148,5 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): assert encoded_pair == [101] + text + [102] + text_2 + [102] -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_ctrl_test.py b/transformers/tests/tokenization_ctrl_test.py index ad16cf07fa..04c9dec523 100644 --- a/transformers/tests/tokenization_ctrl_test.py +++ b/transformers/tests/tokenization_ctrl_test.py @@ -22,6 +22,7 @@ from transformers.tokenization_ctrl import CTRLTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases + class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = CTRLTokenizer @@ -30,13 +31,13 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester): super(CTRLTokenizationTest, self).setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt - vocab = ['adapt', 're@@', 'a@@', 'apt', 'c@@', 't', ''] + vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - merges = ["#version: 0.2", 'a p', 'ap t', 'r e', 'a d', 'ad apt', ''] + merges = ["#version: 0.2", "a p", "ap t", "r e", "a d", "ad apt", ""] self.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: @@ -47,23 +48,22 @@ class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester): return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"adapt react readapt apt" - output_text = u"adapt react readapt apt" + input_text = "adapt react readapt apt" + output_text = "adapt react readapt apt" return input_text, output_text def test_full_tokenizer(self): tokenizer = CTRLTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) text = "adapt react readapt apt" - bpe_tokens = 'adapt re@@ a@@ c@@ t re@@ adapt apt'.split() + bpe_tokens = "adapt re@@ a@@ c@@ t re@@ adapt apt".split() tokens = tokenizer.tokenize(text) self.assertListEqual(tokens, bpe_tokens) input_tokens = tokens + [tokenizer.unk_token] input_bpe_tokens = [0, 1, 2, 4, 5, 1, 0, 3, 6] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py index e815eca672..551f9e188e 100644 --- a/transformers/tests/tokenization_distilbert_test.py +++ b/transformers/tests/tokenization_distilbert_test.py @@ -18,12 +18,13 @@ import os import unittest from io import open -from transformers.tokenization_distilbert import (DistilBertTokenizer) +from transformers.tokenization_distilbert import DistilBertTokenizer from .tokenization_tests_commons import CommonTestCases from .tokenization_bert_test import BertTokenizationTest from .utils import slow + class DistilBertTokenizationTest(BertTokenizationTest): tokenizer_class = DistilBertTokenizer @@ -42,9 +43,10 @@ class DistilBertTokenizationTest(BertTokenizationTest): encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] - assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \ - text_2 + [tokenizer.sep_token_id] + assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ + tokenizer.sep_token_id + ] -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py index 5eae767bdf..552b73416e 100644 --- a/transformers/tests/tokenization_gpt2_test.py +++ b/transformers/tests/tokenization_gpt2_test.py @@ -23,6 +23,7 @@ from transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases + class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = GPT2Tokenizer @@ -31,16 +32,34 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): super(GPT2TokenizationTest, self).setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "\u0120", "\u0120l", "\u0120n", - "\u0120lo", "\u0120low", "er", - "\u0120lowest", "\u0120newer", "\u0120wider", ""] + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "", + ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] self.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: @@ -51,8 +70,8 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"lower newer" - output_text = u"lower newer" + input_text = "lower newer" + output_text = "lower newer" return input_text, output_text def test_full_tokenizer(self): @@ -64,8 +83,8 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): input_tokens = tokens + [tokenizer.unk_token] input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_openai_test.py b/transformers/tests/tokenization_openai_test.py index 56aa219ddc..c6a802b7be 100644 --- a/transformers/tests/tokenization_openai_test.py +++ b/transformers/tests/tokenization_openai_test.py @@ -31,15 +31,34 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): super(OpenAIGPTTokenizationTest, self).setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "w", "r", "t", - "lo", "low", "er", - "low", "lowest", "newer", "wider", ""] + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "w", + "r", + "t", + "lo", + "low", + "er", + "low", + "lowest", + "newer", + "wider", + "", + ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "l o", "lo w", "e r", ""] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens)) with open(self.merges_file, "w") as fp: @@ -49,11 +68,10 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"lower newer" - output_text = u"lower newer" + input_text = "lower newer" + output_text = "lower newer" return input_text, output_text - def test_full_tokenizer(self): tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file) @@ -64,9 +82,8 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): input_tokens = tokens + [""] input_bpe_tokens = [14, 15, 20] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py index 8ad0b59511..a1d9d5fb72 100644 --- a/transformers/tests/tokenization_roberta_test.py +++ b/transformers/tests/tokenization_roberta_test.py @@ -31,16 +31,34 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): super(RobertaTokenizationTest, self).setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "\u0120", "\u0120l", "\u0120n", - "\u0120lo", "\u0120low", "er", - "\u0120lowest", "\u0120newer", "\u0120wider", ""] + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "\u0120", + "\u0120l", + "\u0120n", + "\u0120lo", + "\u0120low", + "er", + "\u0120lowest", + "\u0120newer", + "\u0120wider", + "", + ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] self.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") with open(self.merges_file, "w", encoding="utf-8") as fp: @@ -51,8 +69,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"lower newer" - output_text = u"lower newer" + input_text = "lower newer" + output_text = "lower newer" return input_text, output_text def test_full_tokenizer(self): @@ -64,19 +82,15 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): input_tokens = tokens + [tokenizer.unk_token] input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) def roberta_dict_integration_testing(self): tokenizer = self.get_tokenizer() + self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2]) self.assertListEqual( - tokenizer.encode('Hello world!', add_special_tokens=False), - [0, 31414, 232, 328, 2] - ) - self.assertListEqual( - tokenizer.encode('Hello world! cécé herlolip 418', add_special_tokens=False), - [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] + tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False), + [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], ) @slow @@ -87,7 +101,9 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) - encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) + encoded_pair_from_decode = tokenizer.encode( + "sequence builders", "multi-sequence build", add_special_tokens=True + ) encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) @@ -96,5 +112,5 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): assert encoded_pair == encoded_pair_from_decode -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py index 0b4f960e32..09bc0267f1 100644 --- a/transformers/tests/tokenization_t5_test.py +++ b/transformers/tests/tokenization_t5_test.py @@ -17,13 +17,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -from transformers.tokenization_t5 import (T5Tokenizer) +from transformers.tokenization_t5 import T5Tokenizer from transformers.tokenization_xlnet import SPIECE_UNDERLINE from .tokenization_tests_commons import CommonTestCases -SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), - 'fixtures/test_sentencepiece.model') +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + class T5TokenizationTest(CommonTestCases.CommonTokenizerTester): @@ -40,38 +40,76 @@ class T5TokenizationTest(CommonTestCases.CommonTokenizerTester): return T5Tokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"This is a test" - output_text = u"This is a test" + input_text = "This is a test" + output_text = "This is a test" return input_text, output_text def test_full_tokenizer(self): tokenizer = T5Tokenizer(SAMPLE_VOCAB) - tokens = tokenizer.tokenize(u'This is a test') - self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) - - tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.']) + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual( - ids, [8, 21, 84, 55, 24, 19, 7, 0, - 602, 347, 347, 347, 3, 12, 66, - 46, 72, 80, 6, 0, 4]) + self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4]) back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', - SPIECE_UNDERLINE + u'', u'', u'2', u'0', u'0', u'0', u',', - SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', - u'', u'.']) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index c417d033dc..ba81101084 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -26,19 +26,23 @@ if sys.version_info[0] == 2: class TemporaryDirectory(object): """Context manager for tempfile.mkdtemp() so it's usable with "with" statement.""" + def __enter__(self): self.name = tempfile.mkdtemp() return self.name + def __exit__(self, exc_type, exc_value, traceback): shutil.rmtree(self.name) + + else: import pickle + TemporaryDirectory = tempfile.TemporaryDirectory unicode = str class CommonTestCases: - class CommonTokenizerTester(unittest.TestCase): tokenizer_class = None @@ -57,17 +61,23 @@ class CommonTestCases: def test_tokenizers_common_properties(self): tokenizer = self.get_tokenizer() - attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token", - "pad_token", "cls_token", "mask_token"] + attributes_list = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + ] for attr in attributes_list: self.assertTrue(hasattr(tokenizer, attr)) self.assertTrue(hasattr(tokenizer, attr + "_id")) self.assertTrue(hasattr(tokenizer, "additional_special_tokens")) - self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids')) + self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids")) - attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", - "added_tokens_decoder"] + attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"] for attr in attributes_list: self.assertTrue(hasattr(tokenizer, attr)) @@ -79,13 +89,13 @@ class CommonTestCases: # Now let's start the test tokenizer = self.get_tokenizer(max_len=42) - before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False) + before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False) with TemporaryDirectory() as tmpdirname: tokenizer.save_pretrained(tmpdirname) tokenizer = self.tokenizer_class.from_pretrained(tmpdirname) - after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False) + after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False) self.assertListEqual(before_tokens, after_tokens) self.assertEqual(tokenizer.max_len, 42) @@ -96,12 +106,12 @@ class CommonTestCases: tokenizer = self.get_tokenizer() self.assertIsNotNone(tokenizer) - text = u"Munich and Berlin are nice cities" + text = "Munich and Berlin are nice cities" subwords = tokenizer.tokenize(text) with TemporaryDirectory() as tmpdirname: - filename = os.path.join(tmpdirname, u"tokenizer.bin") + filename = os.path.join(tmpdirname, "tokenizer.bin") with open(filename, "wb") as handle: pickle.dump(tokenizer, handle) @@ -122,7 +132,7 @@ class CommonTestCases: toks0 = tokenizer.tokenize(text) # toks before adding new_toks - new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", 'AAAAA BBBBBB', 'CCCCCCCCCDDDDDDDD'] + new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"] added = tokenizer.add_tokens(new_toks) self.assertEqual(added, 2) @@ -178,8 +188,7 @@ class CommonTestCases: self.assertGreater(tokens[0], tokenizer.vocab_size - 1) self.assertGreater(tokens[-2], tokenizer.vocab_size - 1) - new_toks_2 = {'eos_token': ">>>>|||<||<<|<<", - 'pad_token': "<<<<<|||>|>>>>|>"} + new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"} added_toks_2 = tokenizer.add_special_tokens(new_toks_2) vocab_size_3 = tokenizer.vocab_size all_size_3 = len(tokenizer) @@ -189,8 +198,9 @@ class CommonTestCases: self.assertEqual(added_toks_2, len(new_toks_2)) self.assertEqual(all_size_3, all_size_2 + len(new_toks_2)) - tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", - add_special_tokens=False) + tokens = tokenizer.encode( + ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False + ) out_string = tokenizer.decode(tokens) self.assertGreaterEqual(len(tokens), 6) @@ -242,7 +252,7 @@ class CommonTestCases: def test_encode_decode_with_spaces(self): tokenizer = self.get_tokenizer() - new_toks = ['[ABC]', '[DEF]', 'GHI IHG'] + new_toks = ["[ABC]", "[DEF]", "GHI IHG"] tokenizer.add_tokens(new_toks) input = "[ABC] [DEF] [ABC] GHI IHG [DEF]" encoded = tokenizer.encode(input, add_special_tokens=False) @@ -264,7 +274,7 @@ class CommonTestCases: tokenizer = self.get_tokenizer() - if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] != "PreTrainedTokenizer": + if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer": seq_0 = "Test this method." seq_1 = "With these inputs." information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True) @@ -293,17 +303,19 @@ class CommonTestCases: sequence = tokenizer.encode(seq_0, add_special_tokens=False) num_added_tokens = tokenizer.num_added_tokens() total_length = len(sequence) + num_added_tokens - information = tokenizer.encode_plus(seq_0, - max_length=total_length - 2, - add_special_tokens=True, - stride=stride, - return_overflowing_tokens=True) + information = tokenizer.encode_plus( + seq_0, + max_length=total_length - 2, + add_special_tokens=True, + stride=stride, + return_overflowing_tokens=True, + ) truncated_sequence = information["input_ids"] overflowing_tokens = information["overflowing_tokens"] self.assertEqual(len(overflowing_tokens), 2 + stride) - self.assertEqual(overflowing_tokens, sequence[-(2 + stride):]) + self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :]) self.assertEqual(len(truncated_sequence), total_length - 2) self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2])) @@ -320,24 +332,35 @@ class CommonTestCases: sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True) truncated_second_sequence = tokenizer.build_inputs_with_special_tokens( tokenizer.encode(seq_0, add_special_tokens=False), - tokenizer.encode(seq_1, add_special_tokens=False)[:-2] + tokenizer.encode(seq_1, add_special_tokens=False)[:-2], ) - information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True, - stride=stride, truncation_strategy='only_second', - return_overflowing_tokens=True) - information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, - add_special_tokens=True, stride=stride, - truncation_strategy='only_first', - return_overflowing_tokens=True) + information = tokenizer.encode_plus( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=True, + stride=stride, + truncation_strategy="only_second", + return_overflowing_tokens=True, + ) + information_first_truncated = tokenizer.encode_plus( + seq_0, + seq_1, + max_length=len(sequence) - 2, + add_special_tokens=True, + stride=stride, + truncation_strategy="only_first", + return_overflowing_tokens=True, + ) truncated_sequence = information["input_ids"] overflowing_tokens = information["overflowing_tokens"] overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"] self.assertEqual(len(overflowing_tokens), 2 + stride) - self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):]) - self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):]) + self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :]) + self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :]) self.assertEqual(len(truncated_sequence), len(sequence) - 2) self.assertEqual(truncated_sequence, truncated_second_sequence) @@ -361,37 +384,47 @@ class CommonTestCases: # Testing single inputs encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True, return_special_tokens_mask=True) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, add_special_tokens=True, return_special_tokens_mask=True + ) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] filtered_sequence = [x for x in filtered_sequence if x is not None] self.assertEqual(encoded_sequence, filtered_sequence) # Testing inputs pairs - encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(sequence_1, - add_special_tokens=False) - encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True, - return_special_tokens_mask=True) + encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode( + sequence_1, add_special_tokens=False + ) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True + ) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask = encoded_sequence_dict["special_tokens_mask"] self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) - filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)] + filtered_sequence = [ + (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special) + ] filtered_sequence = [x for x in filtered_sequence if x is not None] self.assertEqual(encoded_sequence, filtered_sequence) # Testing with already existing special tokens if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id: - tokenizer.add_special_tokens({'cls_token': '', 'sep_token': ''}) - encoded_sequence_dict = tokenizer.encode_plus(sequence_0, - add_special_tokens=True, - return_special_tokens_mask=True) + tokenizer.add_special_tokens({"cls_token": "", "sep_token": ""}) + encoded_sequence_dict = tokenizer.encode_plus( + sequence_0, add_special_tokens=True, return_special_tokens_mask=True + ) encoded_sequence_w_special = encoded_sequence_dict["input_ids"] special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"] - special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True) + special_tokens_mask = tokenizer.get_special_tokens_mask( + encoded_sequence_w_special, already_has_special_tokens=True + ) self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) self.assertEqual(special_tokens_mask_orig, special_tokens_mask) @@ -406,7 +439,9 @@ class CommonTestCases: tokenizer.padding_side = "right" encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) + padded_sequence = tokenizer.encode( + sequence, max_length=sequence_length + padding_size, pad_to_max_length=True + ) padded_sequence_length = len(padded_sequence) assert sequence_length + padding_size == padded_sequence_length assert encoded_sequence + [padding_idx] * padding_size == padded_sequence @@ -415,7 +450,9 @@ class CommonTestCases: tokenizer.padding_side = "left" encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) + padded_sequence = tokenizer.encode( + sequence, max_length=sequence_length + padding_size, pad_to_max_length=True + ) padded_sequence_length = len(padded_sequence) assert sequence_length + padding_size == padded_sequence_length assert [padding_idx] * padding_size + encoded_sequence == padded_sequence @@ -446,38 +483,48 @@ class CommonTestCases: token_type_padding_idx = tokenizer.pad_token_type_id encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True) - input_ids = encoded_sequence['input_ids'] - token_type_ids = encoded_sequence['token_type_ids'] - attention_mask = encoded_sequence['attention_mask'] - special_tokens_mask = encoded_sequence['special_tokens_mask'] + input_ids = encoded_sequence["input_ids"] + token_type_ids = encoded_sequence["token_type_ids"] + attention_mask = encoded_sequence["attention_mask"] + special_tokens_mask = encoded_sequence["special_tokens_mask"] sequence_length = len(input_ids) # Test right padding tokenizer.padding_side = "right" - padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) - padded_input_ids = padded_sequence['input_ids'] - padded_token_type_ids = padded_sequence['token_type_ids'] - padded_attention_mask = padded_sequence['attention_mask'] - padded_special_tokens_mask = padded_sequence['special_tokens_mask'] + padded_sequence = tokenizer.encode_plus( + sequence, + max_length=sequence_length + padding_size, + pad_to_max_length=True, + return_special_tokens_mask=True, + ) + padded_input_ids = padded_sequence["input_ids"] + padded_token_type_ids = padded_sequence["token_type_ids"] + padded_attention_mask = padded_sequence["attention_mask"] + padded_special_tokens_mask = padded_sequence["special_tokens_mask"] padded_sequence_length = len(padded_input_ids) assert sequence_length + padding_size == padded_sequence_length assert input_ids + [padding_idx] * padding_size == padded_input_ids assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids - assert attention_mask + [0] * padding_size == padded_attention_mask - assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask + assert attention_mask + [0] * padding_size == padded_attention_mask + assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask # Test left padding tokenizer.padding_side = "left" - padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) - padded_input_ids = padded_sequence['input_ids'] - padded_token_type_ids = padded_sequence['token_type_ids'] - padded_attention_mask = padded_sequence['attention_mask'] - padded_special_tokens_mask = padded_sequence['special_tokens_mask'] + padded_sequence = tokenizer.encode_plus( + sequence, + max_length=sequence_length + padding_size, + pad_to_max_length=True, + return_special_tokens_mask=True, + ) + padded_input_ids = padded_sequence["input_ids"] + padded_token_type_ids = padded_sequence["token_type_ids"] + padded_attention_mask = padded_sequence["attention_mask"] + padded_special_tokens_mask = padded_sequence["special_tokens_mask"] padded_sequence_length = len(padded_input_ids) assert sequence_length + padding_size == padded_sequence_length assert [padding_idx] * padding_size + input_ids == padded_input_ids assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids - assert [0] * padding_size + attention_mask == padded_attention_mask - assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask \ No newline at end of file + assert [0] * padding_size + attention_mask == padded_attention_mask + assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask diff --git a/transformers/tests/tokenization_transfo_xl_test.py b/transformers/tests/tokenization_transfo_xl_test.py index 5495ebd3a6..8b737283da 100644 --- a/transformers/tests/tokenization_transfo_xl_test.py +++ b/transformers/tests/tokenization_transfo_xl_test.py @@ -37,45 +37,53 @@ class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester): super(TransfoXLTokenizationTest, self).setUp() vocab_tokens = [ - "", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", - "running", ",", "low", "l", + "", + "[CLS]", + "[SEP]", + "want", + "unwanted", + "wa", + "un", + "running", + ",", + "low", + "l", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_tokenizer(self, **kwargs): - kwargs['lower_case'] = True + kwargs["lower_case"] = True return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u" UNwanted , running" - output_text = u" unwanted, running" + input_text = " UNwanted , running" + output_text = " unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True) - tokens = tokenizer.tokenize(u" UNwanted , running") + tokens = tokenizer.tokenize(" UNwanted , running") self.assertListEqual(tokens, ["", "unwanted", ",", "running"]) - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) def test_full_tokenizer_lower(self): tokenizer = TransfoXLTokenizer(lower_case=True) self.assertListEqual( - tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), - ["hello", "!", "how", "are", "you", "?"]) + tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["hello", "!", "how", "are", "you", "?"] + ) def test_full_tokenizer_no_lower(self): tokenizer = TransfoXLTokenizer(lower_case=False) self.assertListEqual( - tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), - ["HeLLo", "!", "how", "Are", "yoU", "?"]) + tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_utils_test.py b/transformers/tests/tokenization_utils_test.py index ff3f80ff7d..4fa92c44bf 100644 --- a/transformers/tests/tokenization_utils_test.py +++ b/transformers/tests/tokenization_utils_test.py @@ -24,8 +24,8 @@ from transformers.tokenization_gpt2 import GPT2Tokenizer from .utils import slow -class TokenizerUtilsTest(unittest.TestCase): +class TokenizerUtilsTest(unittest.TestCase): def check_tokenizer_from_pretrained(self, tokenizer_class): s3_models = list(tokenizer_class.max_model_input_sizes.keys()) for model_name in s3_models[:1]: @@ -46,5 +46,6 @@ class TokenizerUtilsTest(unittest.TestCase): def test_pretrained_tokenizers(self): self.check_tokenizer_from_pretrained(GPT2Tokenizer) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py index 7582a46662..e9aa2b7d0e 100644 --- a/transformers/tests/tokenization_xlm_test.py +++ b/transformers/tests/tokenization_xlm_test.py @@ -23,6 +23,7 @@ from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES from .tokenization_tests_commons import CommonTestCases from .utils import slow + class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = XLMTokenizer @@ -31,15 +32,34 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): super(XLMTokenizationTest, self).setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt - vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", - "w", "r", "t", - "lo", "low", "er", - "low", "lowest", "newer", "wider", ""] + vocab = [ + "l", + "o", + "w", + "e", + "r", + "s", + "t", + "i", + "d", + "n", + "w", + "r", + "t", + "lo", + "low", + "er", + "low", + "lowest", + "newer", + "wider", + "", + ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["l o 123", "lo w 1456", "e r 1789", ""] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) with open(self.vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens)) with open(self.merges_file, "w") as fp: @@ -49,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"lower newer" - output_text = u"lower newer" + input_text = "lower newer" + output_text = "lower newer" return input_text, output_text def test_full_tokenizer(self): @@ -64,8 +84,7 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): input_tokens = tokens + [""] input_bpe_tokens = [14, 15, 20] - self.assertListEqual( - tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) @slow def test_sequence_builders(self): @@ -80,5 +99,6 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): assert encoded_sentence == [1] + text + [1] assert encoded_pair == [1] + text + [1] + text_2 + [1] -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py index b68495a796..32482449a4 100644 --- a/transformers/tests/tokenization_xlnet_test.py +++ b/transformers/tests/tokenization_xlnet_test.py @@ -17,13 +17,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE) +from transformers.tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE from .tokenization_tests_commons import CommonTestCases from .utils import slow -SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), - 'fixtures/test_sentencepiece.model') +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") + class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): @@ -40,55 +40,135 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self): - input_text = u"This is a test" - output_text = u"This is a test" + input_text = "This is a test" + output_text = "This is a test" return input_text, output_text - def test_full_tokenizer(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokens = tokenizer.tokenize(u'This is a test') - self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) + + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) - - tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.']) + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "é", + ".", + ], + ) ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual( - ids, [8, 21, 84, 55, 24, 19, 7, 0, - 602, 347, 347, 347, 3, 12, 66, - 46, 72, 80, 6, 0, 4]) + self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4]) back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', - SPIECE_UNDERLINE + u'', u'', u'2', u'0', u'0', u'0', u',', - SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', - u'', u'.']) + self.assertListEqual( + back_tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "s", + "", + ".", + ], + ) def test_tokenizer_lower(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True) - tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', - u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) - self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"]) + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "", + "i", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "se", + ".", + ], + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["▁he", "ll", "o"]) def test_tokenizer_no_lower(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False) - tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") - self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', - u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', - u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', - SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) + tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") + self.assertListEqual( + tokens, + [ + SPIECE_UNDERLINE + "I", + SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", + "or", + "n", + SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", + "9", + "2", + "0", + "0", + "0", + ",", + SPIECE_UNDERLINE + "and", + SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", + SPIECE_UNDERLINE + "f", + "al", + "se", + ".", + ], + ) @slow def test_sequence_builders(self): @@ -104,5 +184,5 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): assert encoded_pair == text + [4] + text_2 + [4, 3] -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py index ba0e19f420..aab5e5a8a5 100644 --- a/transformers/tests/utils.py +++ b/transformers/tests/utils.py @@ -27,6 +27,7 @@ def parse_flag_from_env(key, default=False): raise ValueError("If set, {} must be yes or no.".format(key)) return _value + _run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False) _run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False) diff --git a/transformers/tokenization_albert.py b/transformers/tokenization_albert.py index 6b92d07218..b03b3ca119 100644 --- a/transformers/tokenization_albert.py +++ b/transformers/tokenization_albert.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for ALBERT model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals from .tokenization_utils import PreTrainedTokenizer import logging @@ -24,34 +23,34 @@ import os from shutil import copyfile logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'} +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model", - 'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model", - 'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model", - 'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model", - 'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model", - 'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model", - 'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model", - 'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model", + "vocab_file": { + "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model", + "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model", + "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model", + "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model", + "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model", + "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model", + "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model", + "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'albert-base-v1': 512, - 'albert-large-v1': 512, - 'albert-xlarge-v1': 512, - 'albert-xxlarge-v1': 512, - 'albert-base-v2': 512, - 'albert-large-v2': 512, - 'albert-xlarge-v2': 512, - 'albert-xxlarge-v2': 512, + "albert-base-v1": 512, + "albert-large-v1": 512, + "albert-xlarge-v1": 512, + "albert-xxlarge-v1": 512, + "albert-base-v2": 512, + "albert-large-v2": 512, + "albert-xlarge-v2": 512, + "albert-xxlarge-v2": 512, } -SPIECE_UNDERLINE = u'▁' +SPIECE_UNDERLINE = "▁" + class AlbertTokenizer(PreTrainedTokenizer): """ @@ -59,18 +58,36 @@ class AlbertTokenizer(PreTrainedTokenizer): - requires `SentencePiece `_ """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, - do_lower_case=True, remove_space=True, keep_accents=False, - bos_token="[CLS]", eos_token="[SEP]", unk_token="", sep_token="[SEP]", - pad_token="", cls_token="[CLS]", mask_token="[MASK]", **kwargs): - super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, - unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, **kwargs) + def __init__( + self, + vocab_file, + do_lower_case=True, + remove_space=True, + keep_accents=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="", + sep_token="[SEP]", + pad_token="", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + super(AlbertTokenizer, self).__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens @@ -78,8 +95,10 @@ class AlbertTokenizer(PreTrainedTokenizer): try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.do_lower_case = do_lower_case self.remove_space = remove_space @@ -103,24 +122,26 @@ class AlbertTokenizer(PreTrainedTokenizer): try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): if self.remove_space: - outputs = ' '.join(inputs.strip().split()) + outputs = " ".join(inputs.strip().split()) else: outputs = inputs outputs = outputs.replace("``", '"').replace("''", '"') if six.PY2 and isinstance(outputs, str): - outputs = outputs.decode('utf-8') + outputs = outputs.decode("utf-8") if not self.keep_accents: - outputs = unicodedata.normalize('NFKD', outputs) - outputs = ''.join([c for c in outputs if not unicodedata.combining(c)]) + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) if self.do_lower_case: outputs = outputs.lower() @@ -133,7 +154,7 @@ class AlbertTokenizer(PreTrainedTokenizer): text = self.preprocess_text(text) # note(zhiliny): in some systems, sentencepiece only accepts str for py2 if six.PY2 and isinstance(text, unicode): - text = text.encode('utf-8') + text = text.encode("utf-8") if not sample: pieces = self.sp_model.EncodeAsPieces(text) @@ -141,9 +162,8 @@ class AlbertTokenizer(PreTrainedTokenizer): pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) new_pieces = [] for piece in pieces: - if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit(): - cur_pieces = self.sp_model.EncodeAsPieces( - piece[:-1].replace(SPIECE_UNDERLINE, '')) + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: if len(cur_pieces[0]) == 1: cur_pieces = cur_pieces[1:] @@ -159,7 +179,7 @@ class AlbertTokenizer(PreTrainedTokenizer): ret_pieces = [] for piece in new_pieces: if isinstance(piece, str): - piece = piece.decode('utf-8') + piece = piece.decode("utf-8") ret_pieces.append(piece) new_pieces = ret_pieces @@ -173,12 +193,12 @@ class AlbertTokenizer(PreTrainedTokenizer): """Converts an index (integer) in a token (string/unicode) using the vocab.""" token = self.sp_model.IdToPiece(index) if six.PY2 and return_unicode and isinstance(token, str): - token = token.decode('utf-8') + token = token.decode("utf-8") return token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -213,8 +233,10 @@ class AlbertTokenizer(PreTrainedTokenizer): if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: @@ -244,7 +266,7 @@ class AlbertTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py index 5377bd48cb..5d36fdcbaf 100644 --- a/transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -35,6 +35,7 @@ from .tokenization_xlm_roberta import XLMRobertaTokenizer logger = logging.getLogger(__name__) + class AutoTokenizer(object): r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library @@ -62,9 +63,12 @@ class AutoTokenizer(object): This class cannot be instantiated using `__init__()` (throw an error). """ + def __init__(self): - raise EnvironmentError("AutoTokenizer is designed to be instantiated " - "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.") + raise EnvironmentError( + "AutoTokenizer is designed to be instantiated " + "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method." + ) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): @@ -125,34 +129,38 @@ class AutoTokenizer(object): tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') """ - if 't5' in pretrained_model_name_or_path: + if "t5" in pretrained_model_name_or_path: return T5Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'distilbert' in pretrained_model_name_or_path: + elif "distilbert" in pretrained_model_name_or_path: return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'albert' in pretrained_model_name_or_path: + elif "albert" in pretrained_model_name_or_path: return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'camembert' in pretrained_model_name_or_path: + elif "camembert" in pretrained_model_name_or_path: return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'xlm-roberta' in pretrained_model_name_or_path: + elif "xlm-roberta" in pretrained_model_name_or_path: return XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'roberta' in pretrained_model_name_or_path: + elif "roberta" in pretrained_model_name_or_path: return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'bert-base-japanese' in pretrained_model_name_or_path: + elif "bert-base-japanese" in pretrained_model_name_or_path: return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'bert' in pretrained_model_name_or_path: + elif "bert" in pretrained_model_name_or_path: return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'openai-gpt' in pretrained_model_name_or_path: + elif "openai-gpt" in pretrained_model_name_or_path: return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'gpt2' in pretrained_model_name_or_path: + elif "gpt2" in pretrained_model_name_or_path: return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'transfo-xl' in pretrained_model_name_or_path: + elif "transfo-xl" in pretrained_model_name_or_path: return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'xlnet' in pretrained_model_name_or_path: + elif "xlnet" in pretrained_model_name_or_path: return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'xlm' in pretrained_model_name_or_path: + elif "xlm" in pretrained_model_name_or_path: return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'ctrl' in pretrained_model_name_or_path: + elif "ctrl" in pretrained_model_name_or_path: return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path)) + raise ValueError( + "Unrecognized model identifier in {}. Should contains one of " + "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format( + pretrained_model_name_or_path + ) + ) diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index edc26d88cf..7b3705cc19 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -26,69 +26,68 @@ from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", - 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", - 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", - 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", + "vocab_file": { + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", + "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, - 'bert-base-german-cased': 512, - 'bert-large-uncased-whole-word-masking': 512, - 'bert-large-cased-whole-word-masking': 512, - 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, - 'bert-large-cased-whole-word-masking-finetuned-squad': 512, - 'bert-base-cased-finetuned-mrpc': 512, - 'bert-base-german-dbmdz-cased': 512, - 'bert-base-german-dbmdz-uncased': 512, - 'bert-base-finnish-cased-v1': 512, - 'bert-base-finnish-uncased-v1': 512, + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + "bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-base-german-cased": 512, + "bert-large-uncased-whole-word-masking": 512, + "bert-large-cased-whole-word-masking": 512, + "bert-large-uncased-whole-word-masking-finetuned-squad": 512, + "bert-large-cased-whole-word-masking-finetuned-squad": 512, + "bert-base-cased-finetuned-mrpc": 512, + "bert-base-german-dbmdz-cased": 512, + "bert-base-german-dbmdz-uncased": 512, + "bert-base-finnish-cased-v1": 512, + "bert-base-finnish-uncased-v1": 512, } PRETRAINED_INIT_CONFIGURATION = { - 'bert-base-uncased': {'do_lower_case': True}, - 'bert-large-uncased': {'do_lower_case': True}, - 'bert-base-cased': {'do_lower_case': False}, - 'bert-large-cased': {'do_lower_case': False}, - 'bert-base-multilingual-uncased': {'do_lower_case': True}, - 'bert-base-multilingual-cased': {'do_lower_case': False}, - 'bert-base-chinese': {'do_lower_case': False}, - 'bert-base-german-cased': {'do_lower_case': False}, - 'bert-large-uncased-whole-word-masking': {'do_lower_case': True}, - 'bert-large-cased-whole-word-masking': {'do_lower_case': False}, - 'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True}, - 'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False}, - 'bert-base-cased-finetuned-mrpc': {'do_lower_case': False}, - 'bert-base-german-dbmdz-cased': {'do_lower_case': False}, - 'bert-base-german-dbmdz-uncased': {'do_lower_case': True}, - 'bert-base-finnish-cased-v1': {'do_lower_case': False}, - 'bert-base-finnish-uncased-v1': {'do_lower_case': True}, + "bert-base-uncased": {"do_lower_case": True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-base-german-cased": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, + "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, + "bert-base-german-dbmdz-cased": {"do_lower_case": False}, + "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, + "bert-base-finnish-cased-v1": {"do_lower_case": False}, + "bert-base-finnish-uncased-v1": {"do_lower_case": True}, } @@ -98,7 +97,7 @@ def load_vocab(vocab_file): with open(vocab_file, "r", encoding="utf-8") as reader: tokens = reader.readlines() for index, token in enumerate(tokens): - token = token.rstrip('\n') + token = token.rstrip("\n") vocab[token] = index return vocab @@ -132,9 +131,20 @@ class BertTokenizer(PreTrainedTokenizer): pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, - unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", - mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs): + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + **kwargs + ): """Constructs a BertTokenizer. Args: @@ -152,24 +162,29 @@ class BertTokenizer(PreTrainedTokenizer): This should likely be deactivated for Japanese: see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 """ - super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, **kwargs) + super(BertTokenizer, self).__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) self.do_basic_tokenize = do_basic_tokenize if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=tokenize_chinese_chars) + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars + ) self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) @property @@ -196,7 +211,7 @@ class BertTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ' '.join(tokens).replace(' ##', '').strip() + out_string = " ".join(tokens).replace(" ##", "").strip() return out_string def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -231,8 +246,10 @@ class BertTokenizer(PreTrainedTokenizer): if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: @@ -258,16 +275,18 @@ class BertTokenizer(PreTrainedTokenizer): """Save the tokenizer vocabulary to a directory or file.""" index = 0 if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) else: vocab_file = vocab_path with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!".format(vocab_file)) + logger.warning( + "Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file) + ) index = token_index - writer.write(token + u'\n') + writer.write(token + "\n") index += 1 return (vocab_file,) @@ -382,14 +401,16 @@ class BasicTokenizer(object): # as is Japanese Hiragana and Katakana. Those alphabets are used to write # space-separated words, so they are not treated specially and handled # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # return True return False @@ -399,7 +420,7 @@ class BasicTokenizer(object): output = [] for char in text: cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): + if cp == 0 or cp == 0xFFFD or _is_control(char): continue if _is_whitespace(char): output.append(" ") @@ -499,8 +520,7 @@ def _is_punctuation(char): # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): return True cat = unicodedata.category(char) if cat.startswith("P"): diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py index 0ff45cbfe7..48b9b04b4c 100644 --- a/transformers/tokenization_bert_japanese.py +++ b/transformers/tokenization_bert_japanese.py @@ -28,46 +28,45 @@ from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt", - 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt", - 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt" + "vocab_file": { + "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'bert-base-japanese': 512, - 'bert-base-japanese-whole-word-masking': 512, - 'bert-base-japanese-char': 512, - 'bert-base-japanese-char-whole-word-masking': 512 + "bert-base-japanese": 512, + "bert-base-japanese-whole-word-masking": 512, + "bert-base-japanese-char": 512, + "bert-base-japanese-char-whole-word-masking": 512, } PRETRAINED_INIT_CONFIGURATION = { - 'bert-base-japanese': { - 'do_lower_case': False, - 'word_tokenizer_type': 'mecab', - 'subword_tokenizer_type': 'wordpiece' + "bert-base-japanese": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "wordpiece", }, - 'bert-base-japanese-whole-word-masking':{ - 'do_lower_case': False, - 'word_tokenizer_type': 'mecab', - 'subword_tokenizer_type': 'wordpiece' + "bert-base-japanese-whole-word-masking": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "wordpiece", }, - 'bert-base-japanese-char': { - 'do_lower_case': False, - 'word_tokenizer_type': 'mecab', - 'subword_tokenizer_type': 'character' + "bert-base-japanese-char": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "character", + }, + "bert-base-japanese-char-whole-word-masking": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "character", }, - 'bert-base-japanese-char-whole-word-masking': { - 'do_lower_case': False, - 'word_tokenizer_type': 'mecab', - 'subword_tokenizer_type': 'character' - } } @@ -79,11 +78,22 @@ class BertJapaneseTokenizer(BertTokenizer): pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, do_lower_case=False, - do_word_tokenize=True, do_subword_tokenize=True, - word_tokenizer_type='basic', subword_tokenizer_type='wordpiece', - never_split=None, unk_token='[UNK]', sep_token='[SEP]', - pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs): + def __init__( + self, + vocab_file, + do_lower_case=False, + do_word_tokenize=True, + do_subword_tokenize=True, + word_tokenizer_type="basic", + subword_tokenizer_type="wordpiece", + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): """Constructs a MecabBertTokenizer. Args: @@ -100,56 +110,53 @@ class BertJapaneseTokenizer(BertTokenizer): **subword_tokenizer_type**: (`optional`) string (default "wordpiece") Type of subword tokenizer. """ - super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, **kwargs) + super(BertTokenizer, self).__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) self.do_word_tokenize = do_word_tokenize if do_word_tokenize: - if word_tokenizer_type == 'basic': - self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=False) - elif word_tokenizer_type == 'mecab': - self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, - never_split=never_split) + if word_tokenizer_type == "basic": + self.word_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False + ) + elif word_tokenizer_type == "mecab": + self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, never_split=never_split) else: - raise ValueError( - "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type)) + raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type)) self.do_subword_tokenize = do_subword_tokenize if do_subword_tokenize: - if subword_tokenizer_type == 'wordpiece': - self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, - unk_token=self.unk_token) - elif subword_tokenizer_type == 'character': - self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, - unk_token=self.unk_token) + if subword_tokenizer_type == "wordpiece": + self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + elif subword_tokenizer_type == "character": + self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token) else: - raise ValueError( - "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type)) - + raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type)) def _tokenize(self, text): if self.do_word_tokenize: - tokens = self.word_tokenizer.tokenize(text, - never_split=self.all_special_tokens) + tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens) else: tokens = [text] if self.do_subword_tokenize: - split_tokens = [sub_token for token in tokens - for sub_token in self.subword_tokenizer.tokenize(token)] + split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)] else: split_tokens = tokens @@ -177,27 +184,28 @@ class MecabTokenizer(object): self.normalize_text = normalize_text import MeCab + self.mecab = MeCab.Tagger() def tokenize(self, text, never_split=None, **kwargs): """Tokenizes a piece of text.""" if self.normalize_text: - text = unicodedata.normalize('NFKC', text) + text = unicodedata.normalize("NFKC", text) never_split = self.never_split + (never_split if never_split is not None else []) tokens = [] if six.PY2: - mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8') + mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8") else: mecab_output = self.mecab.parse(text) cursor = 0 - for line in mecab_output.split('\n'): - if line == 'EOS': + for line in mecab_output.split("\n"): + if line == "EOS": break - token, _ = line.split('\t') + token, _ = line.split("\t") token_start = text.index(token, cursor) token_end = token_start + len(token) if self.do_lower_case and token not in never_split: @@ -240,7 +248,7 @@ class CharacterTokenizer(object): A list of characters. """ if self.normalize_text: - text = unicodedata.normalize('NFKC', text) + text = unicodedata.normalize("NFKC", text) output_tokens = [] for i, char in enumerate(text): diff --git a/transformers/tokenization_camembert.py b/transformers/tokenization_camembert.py index 4c4615eb3d..c1e80e0e05 100644 --- a/transformers/tokenization_camembert.py +++ b/transformers/tokenization_camembert.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License """ Tokenization classes for Camembert model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -26,19 +25,19 @@ from .tokenization_xlnet import SPIECE_UNDERLINE logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model", + "vocab_file": { + "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'camembert-base': None, + "camembert-base": None, } + class CamembertTokenizer(PreTrainedTokenizer): """ Adapted from RobertaTokenizer and XLNetTokenizer @@ -46,17 +45,36 @@ class CamembertTokenizer(PreTrainedTokenizer): - requires `SentencePiece `_ """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, bos_token="", eos_token="", sep_token="", - cls_token="", unk_token="", pad_token='', mask_token='', - additional_special_tokens=['NOTUSED', 'NOTUSED'], **kwargs): - super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, - sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, - mask_token=mask_token, additional_special_tokens=additional_special_tokens, - **kwargs) + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + additional_special_tokens=["NOTUSED", "NOTUSED"], + **kwargs + ): + super(CamembertTokenizer, self).__init__( + max_len=512, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens self.sp_model = spm.SentencePieceProcessor() @@ -64,9 +82,9 @@ class CamembertTokenizer(PreTrainedTokenizer): self.vocab_file = vocab_file # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual # sentencepiece vocabulary (this is the case for and - self.fairseq_tokens_to_ids = {'NOTUSED': 0, '': 1, 'NOTUSED': 2, '': 3} + self.fairseq_tokens_to_ids = {"NOTUSED": 0, "": 1, "NOTUSED": 2, "": 3} self.fairseq_offset = len(self.fairseq_tokens_to_ids) - self.fairseq_tokens_to_ids[''] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -100,8 +118,10 @@ class CamembertTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: @@ -148,7 +168,7 @@ class CamembertTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def save_vocabulary(self, save_directory): @@ -158,7 +178,7 @@ class CamembertTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py index 219f17c404..2ce2bbf094 100644 --- a/transformers/tokenization_ctrl.py +++ b/transformers/tokenization_ctrl.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for Salesforce CTRL.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import json import logging @@ -27,23 +26,17 @@ from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.json', - 'merges_file': 'merges.txt', + "vocab_file": "vocab.json", + "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json", - }, - 'merges_file': - { - 'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt", - }, + "vocab_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json",}, + "merges_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt",}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'ctrl': 256, + "ctrl": 256, } CONTROL_CODES = { @@ -104,6 +97,7 @@ CONTROL_CODES = { "multilingual": 128406, } + def get_pairs(word): """Return set of symbol pairs in a word. @@ -118,11 +112,13 @@ def get_pairs(word): pairs = set(pairs) return pairs + class CTRLTokenizer(PreTrainedTokenizer): """ CTRL BPE tokenizer. Peculiarities: - Byte-Pair-Encoding """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES @@ -130,14 +126,18 @@ class CTRLTokenizer(PreTrainedTokenizer): def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs) - self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens - self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_single_sentence = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) - self.decoder = {v:k for k,v in self.encoder.items()} - with open(merges_file, encoding='utf-8') as merges_handle: - merges = merges_handle.read().split('\n')[1:-1] + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[1:-1] merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} @@ -150,14 +150,14 @@ class CTRLTokenizer(PreTrainedTokenizer): if token in self.cache: return self.cache[token] word = tuple(token) - word = tuple(list(word[:-1]) + [word[-1]+'']) + word = tuple(list(word[:-1]) + [word[-1] + ""]) pairs = get_pairs(word) if not pairs: return token while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -172,8 +172,8 @@ class CTRLTokenizer(PreTrainedTokenizer): new_word.extend(word[i:]) break - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) i += 2 else: new_word.append(word[i]) @@ -184,7 +184,7 @@ class CTRLTokenizer(PreTrainedTokenizer): break else: pairs = get_pairs(word) - word = '@@ '.join(word) + word = "@@ ".join(word) word = word[:-4] self.cache[token] = word return word @@ -194,10 +194,10 @@ class CTRLTokenizer(PreTrainedTokenizer): """ split_tokens = [] - words = re.findall(r'\S+\n?', text) + words = re.findall(r"\S+\n?", text) for token in words: - split_tokens.extend([t for t in self.bpe(token).split(' ')]) + split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens def _convert_token_to_id(self, token): @@ -210,7 +210,7 @@ class CTRLTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ' '.join(tokens).replace('@@ ', '').strip() + out_string = " ".join(tokens).replace("@@ ", "").strip() return out_string def save_vocabulary(self, save_directory): @@ -218,21 +218,23 @@ class CTRLTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) - with open(vocab_file, 'w', encoding='utf-8') as f: + with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: - writer.write(u'#version: 0.2\n') + writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file)) + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file) + ) index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') + writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py index 2f245d71dc..7fed1e4058 100644 --- a/transformers/tokenization_distilbert.py +++ b/transformers/tokenization_distilbert.py @@ -26,23 +26,22 @@ from .tokenization_bert import BertTokenizer logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", - 'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + "vocab_file": { + "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", + "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'distilbert-base-uncased': 512, - 'distilbert-base-uncased-distilled-squad': 512, - 'distilbert-base-german-cased': 512, - 'distilbert-base-multilingual-cased': 512, + "distilbert-base-uncased": 512, + "distilbert-base-uncased-distilled-squad": 512, + "distilbert-base-german-cased": 512, + "distilbert-base-multilingual-cased": 512, } diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py index 68c6101860..b6a0e7b78b 100644 --- a/transformers/tokenization_gpt2.py +++ b/transformers/tokenization_gpt2.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import sys import json @@ -31,42 +30,42 @@ except ImportError: def lru_cache(): return lambda func: func + from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.json', - 'merges_file': 'merges.txt', + "vocab_file": "vocab.json", + "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", - 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", - 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", - 'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json", - 'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json", + "vocab_file": { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", + "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json", }, - 'merges_file': - { - 'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", - 'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", - 'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", - 'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt", - 'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt", + "merges_file": { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", + "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'gpt2': 1024, - 'gpt2-medium': 1024, - 'gpt2-large': 1024, - 'gpt2-xl': 1024, - 'distilgpt2': 1024, + "gpt2": 1024, + "gpt2-medium": 1024, + "gpt2-large": 1024, + "gpt2-xl": 1024, + "distilgpt2": 1024, } + @lru_cache() def bytes_to_unicode(): """ @@ -80,17 +79,20 @@ def bytes_to_unicode(): To avoid that, we want lookup tables between utf-8 bytes and unicode strings. """ _chr = unichr if sys.version_info[0] == 2 else chr - bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) cs = bs[:] n = 0 - for b in range(2**8): + for b in range(2 ** 8): if b not in bs: bs.append(b) - cs.append(2**8+n) + cs.append(2 ** 8 + n) n += 1 cs = [_chr(n) for n in cs] return dict(zip(bs, cs)) + def get_pairs(word): """Return set of symbol pairs in a word. @@ -103,6 +105,7 @@ def get_pairs(word): prev_char = char return pairs + class GPT2Tokenizer(PreTrainedTokenizer): """ GPT-2 BPE tokenizer. Peculiarities: @@ -112,15 +115,28 @@ class GPT2Tokenizer(PreTrainedTokenizer): Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"` """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>", - bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs): + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + **kwargs + ): super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) - self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens - self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_single_sentence = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) @@ -128,8 +144,8 @@ class GPT2Tokenizer(PreTrainedTokenizer): self.errors = errors # how to handle errors in decoding self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with open(merges_file, encoding='utf-8') as merges_handle: - bpe_merges = merges_handle.read().split('\n')[1:-1] + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] bpe_merges = [tuple(merge.split()) for merge in bpe_merges] self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) self.cache = {} @@ -151,7 +167,7 @@ class GPT2Tokenizer(PreTrainedTokenizer): return token while True: - bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -166,8 +182,8 @@ class GPT2Tokenizer(PreTrainedTokenizer): new_word.extend(word[i:]) break - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) i += 2 else: new_word.append(word[i]) @@ -178,7 +194,7 @@ class GPT2Tokenizer(PreTrainedTokenizer): break else: pairs = get_pairs(word) - word = ' '.join(word) + word = " ".join(word) self.cache[token] = word return word @@ -189,15 +205,19 @@ class GPT2Tokenizer(PreTrainedTokenizer): Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers. """ if add_prefix_space: - text = ' ' + text + text = " " + text bpe_tokens = [] for token in re.findall(self.pat, text): if sys.version_info[0] == 2: - token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) + token = "".join( + self.byte_encoder[ord(b)] for b in token + ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) else: - token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) - bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' ')) + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) return bpe_tokens def _convert_token_to_id(self, token): @@ -210,8 +230,8 @@ class GPT2Tokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - text = ''.join(tokens) - text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) return text def save_vocabulary(self, save_directory): @@ -219,21 +239,23 @@ class GPT2Tokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) - with open(vocab_file, 'w', encoding='utf-8') as f: + with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: - writer.write(u'#version: 0.2\n') + writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file)) + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file) + ) index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') + writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file diff --git a/transformers/tokenization_openai.py b/transformers/tokenization_openai.py index a4c64b7020..d8f7549eda 100644 --- a/transformers/tokenization_openai.py +++ b/transformers/tokenization_openai.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import json import logging @@ -28,25 +27,20 @@ from .tokenization_bert import BasicTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.json', - 'merges_file': 'merges.txt', + "vocab_file": "vocab.json", + "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json", - }, - 'merges_file': - { - 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt", - }, + "vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",}, + "merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",}, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'openai-gpt': 512, + "openai-gpt": 512, } + def get_pairs(word): """ Return set of symbol pairs in a word. @@ -59,27 +53,30 @@ def get_pairs(word): prev_char = char return pairs + def text_standardize(text): """ fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization """ - text = text.replace('—', '-') - text = text.replace('–', '-') - text = text.replace('―', '-') - text = text.replace('…', '...') - text = text.replace('´', "'") - text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text) - text = re.sub(r'\s*\n\s*', ' \n ', text) - text = re.sub(r'[^\S\n]+', ' ', text) + text = text.replace("—", "-") + text = text.replace("–", "-") + text = text.replace("―", "-") + text = text.replace("…", "...") + text = text.replace("´", "'") + text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text) + text = re.sub(r"\s*\n\s*", " \n ", text) + text = re.sub(r"[^\S\n]+", " ", text) return text.strip() + class OpenAIGPTTokenizer(PreTrainedTokenizer): """ BPE tokenizer. Peculiarities: - lower case all inputs - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES @@ -87,12 +84,17 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs) - self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens - self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_single_sentence = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens try: import ftfy from spacy.lang.en import English + _nlp = English() self.nlp = _nlp.Defaults.create_tokenizer(_nlp) self.fix_text = ftfy.fix_text @@ -103,9 +105,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) - self.decoder = {v:k for k,v in self.encoder.items()} - with open(merges_file, encoding='utf-8') as merges_handle: - merges = merges_handle.read().split('\n')[1:-1] + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[1:-1] merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} @@ -115,16 +117,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): return len(self.encoder) def bpe(self, token): - word = tuple(token[:-1]) + (token[-1] + '',) + word = tuple(token[:-1]) + (token[-1] + "",) if token in self.cache: return self.cache[token] pairs = get_pairs(word) if not pairs: - return token+'' + return token + "" while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -139,8 +141,8 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): new_word.extend(word[i:]) break - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) i += 2 else: new_word.append(word[i]) @@ -151,9 +153,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): break else: pairs = get_pairs(word) - word = ' '.join(word) - if word == '\n ': - word = '\n' + word = " ".join(word) + if word == "\n ": + word = "\n" self.cache[token] = word return word @@ -164,12 +166,12 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): # Using BERT's BasicTokenizer text = self.nlp.tokenize(text) for token in text: - split_tokens.extend([t for t in self.bpe(token).split(' ')]) + split_tokens.extend([t for t in self.bpe(token).split(" ")]) else: # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) text = self.nlp(text_standardize(self.fix_text(text))) for token in text: - split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')]) + split_tokens.extend([t for t in self.bpe(token.text.lower()).split(" ")]) return split_tokens def _convert_token_to_id(self, token): @@ -182,7 +184,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ''.join(tokens).replace('', ' ').strip() + out_string = "".join(tokens).replace("", " ").strip() return out_string def save_vocabulary(self, save_directory): @@ -190,21 +192,23 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) - with open(vocab_file, 'w', encoding='utf-8') as f: + with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: - writer.write(u'#version: 0.2\n') + writer.write("#version: 0.2\n") for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file)) + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file) + ) index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') + writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index b44e004997..eae8b638fe 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for RoBERTa.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import sys import json @@ -33,41 +32,40 @@ except ImportError: def lru_cache(): return lambda func: func + logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.json', - 'merges_file': 'merges.txt', + "vocab_file": "vocab.json", + "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", - 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json", - 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", - 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", + "vocab_file": { + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", }, - 'merges_file': - { - 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", - 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", - 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", - 'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt", - 'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", - 'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", + "merges_file": { + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'roberta-base': 512, - 'roberta-large': 512, - 'roberta-large-mnli': 512, - 'distilroberta-base': 512, - 'roberta-base-openai-detector': 512, - 'roberta-large-openai-detector': 512, + "roberta-base": 512, + "roberta-large": 512, + "roberta-large-mnli": 512, + "distilroberta-base": 512, + "roberta-base-openai-detector": 512, + "roberta-large-openai-detector": 512, } @@ -80,16 +78,38 @@ class RobertaTokenizer(GPT2Tokenizer): Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, merges_file, errors='replace', bos_token="", eos_token="", sep_token="", - cls_token="", unk_token="", pad_token='', mask_token='', **kwargs): - super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, - bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, - sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, - mask_token=mask_token, **kwargs) + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + **kwargs + ): + super(RobertaTokenizer, self).__init__( + vocab_file=vocab_file, + merges_file=merges_file, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens @@ -124,8 +144,10 @@ class RobertaTokenizer(GPT2Tokenizer): """ if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index 9fd37b67c0..3b70d40857 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -26,26 +26,25 @@ from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) -SPIECE_UNDERLINE = u'▁' +SPIECE_UNDERLINE = "▁" #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to file names for serializing Tokenizer instances #################################################### -VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'} +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to pretrained vocabulary URL for all the model shortcut names. #################################################### PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", - 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", - 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", - 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", - 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + "vocab_file": { + "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", } } @@ -53,13 +52,14 @@ PRETRAINED_VOCAB_FILES_MAP = { # Mapping from model shortcut names to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 't5-small': 512, - 't5-base': 512, - 't5-large': 512, - 't5-3b': 512, - 't5-11b': 512, + "t5-small": 512, + "t5-base": 512, + "t5-large": 512, + "t5-3b": 512, + "t5-11b": 512, } + class T5Tokenizer(PreTrainedTokenizer): """ SentencePiece based tokenizer. Peculiarities: @@ -71,28 +71,43 @@ class T5Tokenizer(PreTrainedTokenizer): (like in T5 preprocessing see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117) """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, eos_token="", unk_token="", - pad_token="", extra_ids=100, additional_special_tokens=None, **kwargs): + def __init__( + self, + vocab_file, + eos_token="", + unk_token="", + pad_token="", + extra_ids=100, + additional_special_tokens=None, + **kwargs + ): # Add extra_ids to the special token list if extra_ids > 0: if additional_special_tokens is None: additional_special_tokens = [] - additional_special_tokens.extend([u"".format(i) for i in range(extra_ids)]) + additional_special_tokens.extend(["".format(i) for i in range(extra_ids)]) - super(T5Tokenizer, self).__init__(eos_token=eos_token, unk_token=unk_token, - pad_token=pad_token, additional_special_tokens=additional_special_tokens, - **kwargs) + super(T5Tokenizer, self).__init__( + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + additional_special_tokens=additional_special_tokens, + **kwargs + ) try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use T5Tokenizer:" - "https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use T5Tokenizer:" + "https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.vocab_file = vocab_file self._extra_ids = extra_ids @@ -114,8 +129,10 @@ class T5Tokenizer(PreTrainedTokenizer): try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) @@ -132,7 +149,7 @@ class T5Tokenizer(PreTrainedTokenizer): ret_pieces = [] for piece in pieces: if isinstance(piece, str): - piece = piece.decode('utf-8') + piece = piece.decode("utf-8") ret_pieces.append(piece) pieces = ret_pieces @@ -140,8 +157,8 @@ class T5Tokenizer(PreTrainedTokenizer): def _convert_token_to_id(self, token): """ Converts a token (str/unicode) in an id using the vocab. """ - if token.startswith(u"', token) + if token.startswith("", token) num = int(l.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) @@ -151,9 +168,9 @@ class T5Tokenizer(PreTrainedTokenizer): if index < self.sp_model.get_piece_size(): token = self.sp_model.IdToPiece(index) else: - token = u"".format(self.vocab_size - 1 - index) + token = "".format(self.vocab_size - 1 - index) if six.PY2 and return_unicode and isinstance(token, str): - token = token.decode('utf-8') + token = token.decode("utf-8") return token def convert_tokens_to_string(self, tokens): @@ -168,7 +185,7 @@ class T5Tokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/transformers/tokenization_transfo_xl.py b/transformers/tokenization_transfo_xl.py index 8d5a0ce9d4..b2f59625f9 100644 --- a/transformers/tokenization_transfo_xl.py +++ b/transformers/tokenization_transfo_xl.py @@ -16,8 +16,7 @@ """ Tokenization classes for Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. """ -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import glob import logging @@ -44,42 +43,58 @@ except ImportError: logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin', 'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"} PRETRAINED_VOCAB_FILES_MAP = { - 'pretrained_vocab_file': - { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", + "pretrained_vocab_file": { + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'transfo-xl-wt103': None, + "transfo-xl-wt103": None, } PRETRAINED_CORPUS_ARCHIVE_MAP = { - 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", + "transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", } -CORPUS_NAME = 'corpus.bin' +CORPUS_NAME = "corpus.bin" + class TransfoXLTokenizer(PreTrainedTokenizer): """ Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False, - delimiter=None, vocab_file=None, pretrained_vocab_file=None, - never_split=None, unk_token="", eos_token="", - additional_special_tokens=[""], **kwargs): - super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token, - additional_special_tokens=additional_special_tokens, - **kwargs) + def __init__( + self, + special=None, + min_freq=0, + max_size=None, + lower_case=False, + delimiter=None, + vocab_file=None, + pretrained_vocab_file=None, + never_split=None, + unk_token="", + eos_token="", + additional_special_tokens=[""], + **kwargs + ): + super(TransfoXLTokenizer, self).__init__( + unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs + ) - self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens - self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_single_sentence = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = ( + self.max_len + ) # no default special tokens - you can update this value if you add special tokens if never_split is None: never_split = self.all_special_tokens @@ -106,14 +121,15 @@ class TransfoXLTokenizer(PreTrainedTokenizer): self.build_vocab() def count_file(self, path, verbose=False, add_eos=False): - if verbose: logger.info('counting file {} ...'.format(path)) + if verbose: + logger.info("counting file {} ...".format(path)) assert os.path.exists(path) sents = [] - with open(path, 'r', encoding='utf-8') as f: + with open(path, "r", encoding="utf-8") as f: for idx, line in enumerate(f): if verbose and idx > 0 and idx % 500000 == 0: - logger.info(' line {}'.format(idx)) + logger.info(" line {}".format(idx)) symbols = self.tokenize(line, add_eos=add_eos) self.counter.update(symbols) sents.append(symbols) @@ -124,42 +140,42 @@ class TransfoXLTokenizer(PreTrainedTokenizer): """ sents : a list of sentences, each a list of tokenized symbols """ - if verbose: logger.info('counting {} sents ...'.format(len(sents))) + if verbose: + logger.info("counting {} sents ...".format(len(sents))) for idx, symbols in enumerate(sents): if verbose and idx > 0 and idx % 500000 == 0: - logger.info(' line {}'.format(idx)) + logger.info(" line {}".format(idx)) self.counter.update(symbols) def _build_from_file(self, vocab_file): self.idx2sym = [] self.sym2idx = OrderedDict() - with open(vocab_file, 'r', encoding='utf-8') as f: + with open(vocab_file, "r", encoding="utf-8") as f: for line in f: symb = line.strip().split()[0] self.add_symbol(symb) - if '' in self.sym2idx: - self.unk_idx = self.sym2idx[''] - elif '' in self.sym2idx: - self.unk_idx = self.sym2idx[''] + if "" in self.sym2idx: + self.unk_idx = self.sym2idx[""] + elif "" in self.sym2idx: + self.unk_idx = self.sym2idx[""] else: - raise ValueError('No token in vocabulary') + raise ValueError("No token in vocabulary") def save_vocabulary(self, vocab_path): """Save the tokenizer vocabulary to a directory or file.""" if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file']) + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"]) torch.save(self.__dict__, vocab_file) return (vocab_file,) def build_vocab(self): if self.vocab_file: - logger.info('building vocab from {}'.format(self.vocab_file)) + logger.info("building vocab from {}".format(self.vocab_file)) self._build_from_file(self.vocab_file) - logger.info('final vocab size {}'.format(len(self))) + logger.info("final vocab size {}".format(len(self))) else: - logger.info('building vocab with min_freq={}, max_size={}'.format( - self.min_freq, self.max_size)) + logger.info("building vocab with min_freq={}, max_size={}".format(self.min_freq, self.max_size)) self.idx2sym = [] self.sym2idx = OrderedDict() @@ -167,23 +183,22 @@ class TransfoXLTokenizer(PreTrainedTokenizer): self.add_special(sym) for sym, cnt in self.counter.most_common(self.max_size): - if cnt < self.min_freq: break + if cnt < self.min_freq: + break self.add_symbol(sym) - logger.info('final vocab size {} from {} unique tokens'.format( - len(self), len(self.counter))) + logger.info("final vocab size {} from {} unique tokens".format(len(self), len(self.counter))) - def encode_file(self, path, ordered=False, verbose=False, add_eos=True, - add_double_eos=False): - if verbose: logger.info('encoding file {} ...'.format(path)) + def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False): + if verbose: + logger.info("encoding file {} ...".format(path)) assert os.path.exists(path) encoded = [] - with open(path, 'r', encoding='utf-8') as f: + with open(path, "r", encoding="utf-8") as f: for idx, line in enumerate(f): if verbose and idx > 0 and idx % 500000 == 0: - logger.info(' line {}'.format(idx)) - symbols = self.tokenize(line, add_eos=add_eos, - add_double_eos=add_double_eos) + logger.info(" line {}".format(idx)) + symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos) encoded.append(self.convert_to_tensor(symbols)) if ordered: @@ -192,11 +207,12 @@ class TransfoXLTokenizer(PreTrainedTokenizer): return encoded def encode_sents(self, sents, ordered=False, verbose=False): - if verbose: logger.info('encoding {} sents ...'.format(len(sents))) + if verbose: + logger.info("encoding {} sents ...".format(len(sents))) encoded = [] for idx, symbols in enumerate(sents): if verbose and idx > 0 and idx % 500000 == 0: - logger.info(' line {}'.format(idx)) + logger.info(" line {}".format(idx)) encoded.append(self.convert_to_tensor(symbols)) if ordered: @@ -208,7 +224,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer): if sym not in self.sym2idx: self.idx2sym.append(sym) self.sym2idx[sym] = len(self.idx2sym) - 1 - setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym]) + setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym]) def add_symbol(self, sym): if sym not in self.sym2idx: @@ -217,7 +233,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer): def _convert_id_to_token(self, idx): """Converts an id in a token (BPE) using the vocab.""" - assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx) + assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx) return self.idx2sym[idx] def _convert_token_to_id(self, sym): @@ -227,19 +243,19 @@ class TransfoXLTokenizer(PreTrainedTokenizer): else: # logger.info('encounter unk {}'.format(sym)) # assert '' not in sym - if hasattr(self, 'unk_idx'): + if hasattr(self, "unk_idx"): return self.sym2idx.get(sym, self.unk_idx) # Backward compatibility with pre-trained models - elif '' in self.sym2idx: - return self.sym2idx[''] - elif '' in self.sym2idx: - return self.sym2idx[''] + elif "" in self.sym2idx: + return self.sym2idx[""] + elif "" in self.sym2idx: + return self.sym2idx[""] else: - raise ValueError('Token not in vocabulary and no token in vocabulary for replacement') + raise ValueError("Token not in vocabulary and no token in vocabulary for replacement") def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ' '.join(tokens).strip() + out_string = " ".join(tokens).strip() return out_string def convert_to_tensor(self, symbols): @@ -256,21 +272,21 @@ class TransfoXLTokenizer(PreTrainedTokenizer): line = line.lower() # empty delimiter '' will evaluate False - if self.delimiter == '': + if self.delimiter == "": symbols = line else: symbols = line.split(self.delimiter) - if add_double_eos: # lm1b - return [''] + symbols + [''] + if add_double_eos: # lm1b + return [""] + symbols + [""] elif add_eos: - return symbols + [''] + return symbols + [""] else: return symbols class LMOrderedIterator(object): - def __init__(self, data, bsz, bptt, device='cpu', ext_len=None): + def __init__(self, data, bsz, bptt, device="cpu", ext_len=None): """ data -- LongTensor -- the LongTensor is strictly ordered """ @@ -293,14 +309,15 @@ class LMOrderedIterator(object): self.n_batch = (self.n_step + self.bptt - 1) // self.bptt def get_batch(self, i, bptt=None): - if bptt is None: bptt = self.bptt + if bptt is None: + bptt = self.bptt seq_len = min(bptt, self.data.size(0) - 1 - i) end_idx = i + seq_len beg_idx = max(0, i - self.ext_len) data = self.data[beg_idx:end_idx] - target = self.data[i+1:i+1+seq_len] + target = self.data[i + 1 : i + 1 + seq_len] data_out = data.transpose(0, 1).contiguous().to(self.device) target_out = target.transpose(0, 1).contiguous().to(self.device) @@ -315,7 +332,7 @@ class LMOrderedIterator(object): max_len = self.bptt + max_deviation * std i = start while True: - bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2. + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0 bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) data, target, seq_len = self.get_batch(i, bptt) i += seq_len @@ -328,7 +345,7 @@ class LMOrderedIterator(object): class LMShuffledIterator(object): - def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False): + def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False): """ data -- list[LongTensor] -- there is no order among the LongTensors """ @@ -343,8 +360,7 @@ class LMShuffledIterator(object): def get_sent_stream(self): # index iterator - epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ - else np.array(range(len(self.data))) + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data))) # sentence iterator for idx in epoch_indices: @@ -376,10 +392,8 @@ class LMShuffledIterator(object): # number of new tokens to fill in n_new = min(len(streams[i]) - 1, self.bptt - n_filled) # first n_retain tokens are retained from last batch - data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ - streams[i][:n_new] - target[n_filled:n_filled+n_new, i] = \ - streams[i][1:n_new+1] + data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new] + target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1] streams[i] = streams[i][n_new:] n_filled += n_new except StopIteration: @@ -408,8 +422,7 @@ class LMShuffledIterator(object): class LMMultiFileIterator(LMShuffledIterator): - def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None, - shuffle=False): + def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False): self.paths = paths self.vocab = vocab @@ -460,15 +473,16 @@ class TransfoXLCorpus(object): "We assumed '{}' was a path or url but couldn't find files {} " "at this path or url.".format( pretrained_model_name_or_path, - ', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()), + ", ".join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, - corpus_file)) + corpus_file, + ) + ) return None if resolved_corpus_file == corpus_file: logger.info("loading corpus file {}".format(corpus_file)) else: - logger.info("loading corpus file {} from cache at {}".format( - corpus_file, resolved_corpus_file)) + logger.info("loading corpus file {} from cache at {}".format(corpus_file, resolved_corpus_file)) # Instantiate tokenizer. corpus = cls(*inputs, **kwargs) @@ -494,83 +508,78 @@ class TransfoXLCorpus(object): def build_corpus(self, path, dataset): self.dataset = dataset - if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']: - self.vocab.count_file(os.path.join(path, 'train.txt')) - self.vocab.count_file(os.path.join(path, 'valid.txt')) - self.vocab.count_file(os.path.join(path, 'test.txt')) - elif self.dataset == 'wt103': - self.vocab.count_file(os.path.join(path, 'train.txt')) - elif self.dataset == 'lm1b': + if self.dataset in ["ptb", "wt2", "enwik8", "text8"]: + self.vocab.count_file(os.path.join(path, "train.txt")) + self.vocab.count_file(os.path.join(path, "valid.txt")) + self.vocab.count_file(os.path.join(path, "test.txt")) + elif self.dataset == "wt103": + self.vocab.count_file(os.path.join(path, "train.txt")) + elif self.dataset == "lm1b": train_path_pattern = os.path.join( - path, '1-billion-word-language-modeling-benchmark-r13output', - 'training-monolingual.tokenized.shuffled', 'news.en-*') + path, + "1-billion-word-language-modeling-benchmark-r13output", + "training-monolingual.tokenized.shuffled", + "news.en-*", + ) train_paths = glob.glob(train_path_pattern) # the vocab will load from file when build_vocab() is called self.vocab.build_vocab() - if self.dataset in ['ptb', 'wt2', 'wt103']: - self.train = self.vocab.encode_file( - os.path.join(path, 'train.txt'), ordered=True) - self.valid = self.vocab.encode_file( - os.path.join(path, 'valid.txt'), ordered=True) - self.test = self.vocab.encode_file( - os.path.join(path, 'test.txt'), ordered=True) - elif self.dataset in ['enwik8', 'text8']: - self.train = self.vocab.encode_file( - os.path.join(path, 'train.txt'), ordered=True, add_eos=False) - self.valid = self.vocab.encode_file( - os.path.join(path, 'valid.txt'), ordered=True, add_eos=False) - self.test = self.vocab.encode_file( - os.path.join(path, 'test.txt'), ordered=True, add_eos=False) - elif self.dataset == 'lm1b': + if self.dataset in ["ptb", "wt2", "wt103"]: + self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True) + self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True) + self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True) + elif self.dataset in ["enwik8", "text8"]: + self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False) + self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False) + elif self.dataset == "lm1b": self.train = train_paths - self.valid = self.vocab.encode_file( - os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True) - self.test = self.vocab.encode_file( - os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True) + self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True) + self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True) def get_iterator(self, split, *args, **kwargs): - if split == 'train': - if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + if split == "train": + if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: data_iter = LMOrderedIterator(self.train, *args, **kwargs) - elif self.dataset == 'lm1b': - kwargs['shuffle'] = True + elif self.dataset == "lm1b": + kwargs["shuffle"] = True data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) - elif split in ['valid', 'test']: - data = self.valid if split == 'valid' else self.test - if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + elif split in ["valid", "test"]: + data = self.valid if split == "valid" else self.test + if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]: data_iter = LMOrderedIterator(data, *args, **kwargs) - elif self.dataset == 'lm1b': + elif self.dataset == "lm1b": data_iter = LMShuffledIterator(data, *args, **kwargs) return data_iter def get_lm_corpus(datadir, dataset): - fn = os.path.join(datadir, 'cache.pt') - fn_pickle = os.path.join(datadir, 'cache.pkl') + fn = os.path.join(datadir, "cache.pt") + fn_pickle = os.path.join(datadir, "cache.pkl") if os.path.exists(fn): - logger.info('Loading cached dataset...') + logger.info("Loading cached dataset...") corpus = torch.load(fn_pickle) elif os.path.exists(fn): - logger.info('Loading cached dataset from pickle...') + logger.info("Loading cached dataset from pickle...") with open(fn, "rb") as fp: corpus = pickle.load(fp) else: - logger.info('Producing dataset {}...'.format(dataset)) + logger.info("Producing dataset {}...".format(dataset)) kwargs = {} - if dataset in ['wt103', 'wt2']: - kwargs['special'] = [''] - kwargs['lower_case'] = False - elif dataset == 'ptb': - kwargs['special'] = [''] - kwargs['lower_case'] = True - elif dataset == 'lm1b': - kwargs['special'] = [] - kwargs['lower_case'] = False - kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt') - elif dataset in ['enwik8', 'text8']: + if dataset in ["wt103", "wt2"]: + kwargs["special"] = [""] + kwargs["lower_case"] = False + elif dataset == "ptb": + kwargs["special"] = [""] + kwargs["lower_case"] = True + elif dataset == "lm1b": + kwargs["special"] = [] + kwargs["lower_case"] = False + kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt") + elif dataset in ["enwik8", "text8"]: pass corpus = TransfoXLCorpus(datadir, dataset, **kwargs) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 33a59643f5..f848785ee2 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for OpenAI GPT.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -34,9 +33,10 @@ if is_torch_available(): logger = logging.getLogger(__name__) -SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json' -ADDED_TOKENS_FILE = 'added_tokens.json' -TOKENIZER_CONFIG_FILE = 'tokenizer_config.json' +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + class PreTrainedTokenizer(object): """ Base class for all tokenizers. @@ -69,14 +69,22 @@ class PreTrainedTokenizer(object): - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` """ + vocab_files_names = {} pretrained_vocab_files_map = {} pretrained_init_configuration = {} max_model_input_sizes = {} - SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token", - "pad_token", "cls_token", "mask_token", - "additional_special_tokens"] + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] padding_side = "right" @@ -227,8 +235,8 @@ class PreTrainedTokenizer(object): self.max_len = max_len if max_len is not None else int(1e12) # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed. - self.padding_side = kwargs.pop('padding_side', self.padding_side) - + self.padding_side = kwargs.pop("padding_side", self.padding_side) + # Added tokens self.added_tokens_encoder = {} self.unique_added_tokens_encoder = set() @@ -240,13 +248,14 @@ class PreTrainedTokenizer(object): for key, value in kwargs.items(): if key in self.SPECIAL_TOKENS_ATTRIBUTES: - if key == 'additional_special_tokens': - assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value) + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all( + isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value + ) else: assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) setattr(self, key, value) - @classmethod def from_pretrained(cls, *inputs, **kwargs): r""" @@ -302,13 +311,12 @@ class PreTrainedTokenizer(object): """ return cls._from_pretrained(*inputs, **kwargs) - @classmethod def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): - cache_dir = kwargs.pop('cache_dir', None) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) - proxies = kwargs.pop('proxies', None) + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) s3_models = list(cls.max_model_input_sizes.keys()) vocab_files = {} @@ -317,15 +325,19 @@ class PreTrainedTokenizer(object): # Get the vocabulary from AWS S3 bucket for file_id, map_list in cls.pretrained_vocab_files_map.items(): vocab_files[file_id] = map_list[pretrained_model_name_or_path] - if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration: + if ( + cls.pretrained_init_configuration + and pretrained_model_name_or_path in cls.pretrained_init_configuration + ): init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path] else: # Get the vocabulary from local files logger.info( "Model name '{}' not found in model shortcut name list ({}). " "Assuming '{}' is a path or url to a directory containing tokenizer files.".format( - pretrained_model_name_or_path, ', '.join(s3_models), - pretrained_model_name_or_path)) + pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path + ) + ) # Look for the tokenizer main vocabulary files for file_id, file_name in cls.vocab_files_names.items(): @@ -340,14 +352,15 @@ class PreTrainedTokenizer(object): full_file_name = pretrained_model_name_or_path else: full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name) - + vocab_files[file_id] = full_file_name # Look for the additional tokens files - additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, - 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE, - 'tokenizer_config_file': TOKENIZER_CONFIG_FILE, - } + additional_files_names = { + "added_tokens_file": ADDED_TOKENS_FILE, + "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, + "tokenizer_config_file": TOKENIZER_CONFIG_FILE, + } # If a path to a file was provided, get the parent directory saved_directory = pretrained_model_name_or_path @@ -366,9 +379,12 @@ class PreTrainedTokenizer(object): "Model name '{}' was not found in tokenizers model name list ({}). " "We assumed '{}' was a path or url to a directory containing vocabulary files " "named {} but couldn't find such vocabulary files at this path or url.".format( - pretrained_model_name_or_path, ', '.join(s3_models), pretrained_model_name_or_path, - list(cls.vocab_files_names.values()))) + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) # Get files from url, cache, or disk depending on the case try: @@ -377,17 +393,27 @@ class PreTrainedTokenizer(object): if file_path is None: resolved_vocab_files[file_id] = None else: - resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download) + resolved_vocab_files[file_id] = cached_path( + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + ) except EnvironmentError: if pretrained_model_name_or_path in s3_models: msg = "Couldn't reach server at '{}' to download vocabulary files." else: - msg = "Model name '{}' was not found in tokenizers model name list ({}). " \ - "We assumed '{}' was a path or url to a directory containing vocabulary files " \ + msg = ( + "Model name '{}' was not found in tokenizers model name list ({}). " + "We assumed '{}' was a path or url to a directory containing vocabulary files " "named {}, but couldn't find such vocabulary files at this path or url.".format( - pretrained_model_name_or_path, ', '.join(s3_models), pretrained_model_name_or_path, - list(cls.vocab_files_names.values())) + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) raise EnvironmentError(msg) @@ -395,16 +421,15 @@ class PreTrainedTokenizer(object): if file_path == resolved_vocab_files[file_id]: logger.info("loading file {}".format(file_path)) else: - logger.info("loading file {} from cache at {}".format( - file_path, resolved_vocab_files[file_id])) + logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? - tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None) + tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) if tokenizer_config_file is not None: with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: init_kwargs = json.load(tokenizer_config_handle) - saved_init_inputs = init_kwargs.pop('init_inputs', ()) + saved_init_inputs = init_kwargs.pop("init_inputs", ()) if not init_inputs: init_inputs = saved_init_inputs else: @@ -419,11 +444,11 @@ class PreTrainedTokenizer(object): # wont index sequences longer than the number of positional embeddings max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] if max_len is not None and isinstance(max_len, (int, float)): - init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len) + init_kwargs["max_len"] = min(init_kwargs.get("max_len", int(1e12)), max_len) # Merge resolved_vocab_files arguments in init_kwargs. - added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None) - special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None) + added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) for args_name, file_path in resolved_vocab_files.items(): if args_name not in init_kwargs: init_kwargs[args_name] = file_path @@ -438,8 +463,10 @@ class PreTrainedTokenizer(object): try: tokenizer = cls(*init_inputs, **init_kwargs) except OSError: - OSError("Unable to load vocabulary from file. " - "Please check that the provided vocabulary is accessible and not corrupted.") + OSError( + "Unable to load vocabulary from file. " + "Please check that the provided vocabulary is accessible and not corrupted." + ) # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` tokenizer.init_inputs = init_inputs @@ -449,13 +476,12 @@ class PreTrainedTokenizer(object): if added_tokens_file is not None: with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: added_tok_encoder = json.load(added_tokens_handle) - added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} tokenizer.added_tokens_encoder.update(added_tok_encoder) tokenizer.added_tokens_decoder.update(added_tok_decoder) return tokenizer - def save_pretrained(self, save_directory): """ Save the tokenizer vocabulary files together with: - added tokens, @@ -476,28 +502,27 @@ class PreTrainedTokenizer(object): tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) tokenizer_config = copy.deepcopy(self.init_kwargs) - tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs) + tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) for file_id in self.vocab_files_names.keys(): tokenizer_config.pop(file_id, None) - with open(tokenizer_config_file, 'w', encoding='utf-8') as f: + with open(tokenizer_config_file, "w", encoding="utf-8") as f: f.write(json.dumps(tokenizer_config, ensure_ascii=False)) - with open(special_tokens_map_file, 'w', encoding='utf-8') as f: + with open(special_tokens_map_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) - with open(added_tokens_file, 'w', encoding='utf-8') as f: + with open(added_tokens_file, "w", encoding="utf-8") as f: if self.added_tokens_encoder: out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) else: - out_str = u"{}" + out_str = "{}" f.write(out_str) vocab_files = self.save_vocabulary(save_directory) return vocab_files + (special_tokens_map_file, added_tokens_file) - def save_vocabulary(self, save_directory): """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens and special token mappings. @@ -506,17 +531,14 @@ class PreTrainedTokenizer(object): """ raise NotImplementedError - def vocab_size(self): """ Size of the base vocabulary (without the added tokens) """ raise NotImplementedError - def __len__(self): """ Size of the full vocabulary with the added tokens """ return self.vocab_size + len(self.added_tokens_encoder) - def add_tokens(self, new_tokens): """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the @@ -544,16 +566,18 @@ class PreTrainedTokenizer(object): to_add_tokens = [] for token in new_tokens: assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) - if self.init_kwargs.get('do_lower_case', False) and token not in self.all_special_tokens: + if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens: token = token.lower() - if token != self.unk_token and \ - self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \ - token not in to_add_tokens: + if ( + token != self.unk_token + and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) + and token not in to_add_tokens + ): to_add_tokens.append(token) logger.info("Adding %s to the vocabulary", token) added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens)) - added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} self.added_tokens_encoder.update(added_tok_encoder) self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens)) self.added_tokens_decoder.update(added_tok_decoder) @@ -622,8 +646,10 @@ class PreTrainedTokenizer(object): added_tokens = 0 for key, value in special_tokens_dict.items(): assert key in self.SPECIAL_TOKENS_ATTRIBUTES - if key == 'additional_special_tokens': - assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value) + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all( + isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value + ) added_tokens += self.add_tokens(value) else: assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) @@ -633,7 +659,6 @@ class PreTrainedTokenizer(object): return added_tokens - def tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based @@ -649,14 +674,10 @@ class PreTrainedTokenizer(object): def lowercase_text(t): # convert non-special tokens to lowercase escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens] - pattern = r'(' + r'|'.join(escaped_special_toks) + r')|' + \ - r'(.+?)' - return re.sub( - pattern, - lambda m: m.groups()[0] or m.groups()[1].lower(), - t) + pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" + return re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), t) - if self.init_kwargs.get('do_lower_case', False): + if self.init_kwargs.get("do_lower_case", False): text = lowercase_text(text) def split_on_token(tok, text): @@ -694,9 +715,14 @@ class PreTrainedTokenizer(object): tokenized_text += [sub_text] text_list = tokenized_text - return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) \ - if token not in self.unique_added_tokens_encoder - else [token] for token in tokenized_text))) + return list( + itertools.chain.from_iterable( + ( + self._tokenize(token, **kwargs) if token not in self.unique_added_tokens_encoder else [token] + for token in tokenized_text + ) + ) + ) added_tokens = self.unique_added_tokens_encoder tokenized_text = split_on_tokens(added_tokens, text) @@ -737,16 +763,18 @@ class PreTrainedTokenizer(object): def _convert_token_to_id(self, token): raise NotImplementedError - def encode(self, - text, - text_pair=None, - add_special_tokens=True, - max_length=None, - stride=0, - truncation_strategy='longest_first', - pad_to_max_length=False, - return_tensors=None, - **kwargs): + def encode( + self, + text, + text_pair=None, + add_special_tokens=True, + max_length=None, + stride=0, + truncation_strategy="longest_first", + pad_to_max_length=False, + return_tensors=None, + **kwargs + ): """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. @@ -781,32 +809,36 @@ class PreTrainedTokenizer(object): or PyTorch torch.Tensor instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method """ - encoded_inputs = self.encode_plus(text, - text_pair=text_pair, - max_length=max_length, - add_special_tokens=add_special_tokens, - stride=stride, - truncation_strategy=truncation_strategy, - pad_to_max_length=pad_to_max_length, - return_tensors=return_tensors, - **kwargs) + encoded_inputs = self.encode_plus( + text, + text_pair=text_pair, + max_length=max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + pad_to_max_length=pad_to_max_length, + return_tensors=return_tensors, + **kwargs + ) return encoded_inputs["input_ids"] - def encode_plus(self, - text, - text_pair=None, - add_special_tokens=True, - max_length=None, - stride=0, - truncation_strategy='longest_first', - pad_to_max_length=False, - return_tensors=None, - return_token_type_ids=True, - return_attention_mask=True, - return_overflowing_tokens=False, - return_special_tokens_mask=False, - **kwargs): + def encode_plus( + self, + text, + text_pair=None, + add_special_tokens=True, + max_length=None, + stride=0, + truncation_strategy="longest_first", + pad_to_max_length=False, + return_tensors=None, + return_token_type_ids=True, + return_attention_mask=True, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + **kwargs + ): """ Returns a dictionary containing the encoded sequence or sequence pair and additional informations: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. @@ -874,34 +906,40 @@ class PreTrainedTokenizer(object): elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): return text else: - raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.") + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) first_ids = get_input_ids(text) second_ids = get_input_ids(text_pair) if text_pair is not None else None - return self.prepare_for_model(first_ids, - pair_ids=second_ids, - max_length=max_length, - pad_to_max_length=pad_to_max_length, - add_special_tokens=add_special_tokens, - stride=stride, - truncation_strategy=truncation_strategy, - return_tensors=return_tensors, - return_attention_mask=return_attention_mask, - return_token_type_ids=return_token_type_ids, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask) + return self.prepare_for_model( + first_ids, + pair_ids=second_ids, + max_length=max_length, + pad_to_max_length=pad_to_max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + ) - def batch_encode_plus(self, - batch_text_or_text_pairs=None, - add_special_tokens=False, - max_length=None, - stride=0, - truncation_strategy='longest_first', - return_tensors=None, - return_input_lengths=False, - return_attention_masks=False, - **kwargs): + def batch_encode_plus( + self, + batch_text_or_text_pairs=None, + add_special_tokens=False, + max_length=None, + stride=0, + truncation_strategy="longest_first", + return_tensors=None, + return_input_lengths=False, + return_attention_masks=False, + **kwargs + ): """ Returns a dictionary containing the encoded sequence or sequence pair and additional information: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. @@ -933,12 +971,19 @@ class PreTrainedTokenizer(object): ids, pair_ids = ids_or_pair_ids else: ids, pair_ids = ids_or_pair_ids, None - outputs = self.encode_plus(ids, pair_ids, add_special_tokens=add_special_tokens, max_length=max_length, - stride=stride, truncation_strategy=truncation_strategy, return_tensors=None) + outputs = self.encode_plus( + ids, + pair_ids, + add_special_tokens=add_special_tokens, + max_length=max_length, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=None, + ) # Append the non-padded length to the output if return_input_lengths: - outputs['input_len'] = len(outputs['input_ids']) + outputs["input_len"] = len(outputs["input_ids"]) for key, value in outputs.items(): if key not in batch_outputs: @@ -946,11 +991,11 @@ class PreTrainedTokenizer(object): batch_outputs[key].append(value) # Compute longest sequence size - max_seq_len = max(map(len, batch_outputs['input_ids'])) + max_seq_len = max(map(len, batch_outputs["input_ids"])) if return_attention_masks: # Allow the model to not give any special attention to padded input - batch_outputs['attention_mask'] = [[0] * len(v) for v in batch_outputs['input_ids']] + batch_outputs["attention_mask"] = [[0] * len(v) for v in batch_outputs["input_ids"]] if return_tensors is not None: @@ -958,34 +1003,48 @@ class PreTrainedTokenizer(object): for key, value in batch_outputs.items(): padded_value = value - if key != 'input_len': + if key != "input_len": # Padding handle - padded_value = [v + [self.pad_token_id if key == 'input_ids' else 1] * (max_seq_len - len(v)) for v in padded_value] + padded_value = [ + v + [self.pad_token_id if key == "input_ids" else 1] * (max_seq_len - len(v)) + for v in padded_value + ] - if return_tensors == 'tf' and is_tf_available(): + if return_tensors == "tf" and is_tf_available(): batch_outputs[key] = tf.constant(padded_value) - elif return_tensors == 'pt' and is_torch_available(): + elif return_tensors == "pt" and is_torch_available(): batch_outputs[key] = torch.tensor(padded_value) elif return_tensors is not None: - logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors)) + logger.warning( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + return_tensors + ) + ) # encoder_attention_mask requires 1 for real token, 0 for padding, just invert value if return_attention_masks: if is_tf_available(): - batch_outputs['attention_mask'] = tf.abs(batch_outputs['attention_mask'] - 1) + batch_outputs["attention_mask"] = tf.abs(batch_outputs["attention_mask"] - 1) else: - batch_outputs['attention_mask'] = torch.abs(batch_outputs['attention_mask'] - 1) + batch_outputs["attention_mask"] = torch.abs(batch_outputs["attention_mask"] - 1) return batch_outputs - def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0, - truncation_strategy='longest_first', - pad_to_max_length=False, - return_tensors=None, - return_token_type_ids=True, - return_attention_mask=True, - return_overflowing_tokens=False, - return_special_tokens_mask=False): + def prepare_for_model( + self, + ids, + pair_ids=None, + max_length=None, + add_special_tokens=True, + stride=0, + truncation_strategy="longest_first", + pad_to_max_length=False, + return_tensors=None, + return_token_type_ids=True, + return_attention_mask=True, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + ): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates @@ -1050,10 +1109,13 @@ class PreTrainedTokenizer(object): # Handle max sequence length total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0) if max_length and total_len > max_length: - ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids, - num_tokens_to_remove=total_len-max_length, - truncation_strategy=truncation_strategy, - stride=stride) + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) if return_overflowing_tokens: encoded_inputs["overflowing_tokens"] = overflowing_tokens encoded_inputs["num_truncated_tokens"] = total_len - max_length @@ -1081,54 +1143,64 @@ class PreTrainedTokenizer(object): encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length] if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len: - logger.warning("Token indices sequence length is longer than the specified maximum sequence length " - "for this model ({} > {}). Running this sequence through the model will result in " - "indexing errors".format(len(ids), self.max_len)) - + logger.warning( + "Token indices sequence length is longer than the specified maximum sequence length " + "for this model ({} > {}). Running this sequence through the model will result in " + "indexing errors".format(len(ids), self.max_len) + ) + needs_to_be_padded = pad_to_max_length and ( - max_length and len(encoded_inputs["input_ids"]) < max_length - or - max_length is None and len(encoded_inputs["input_ids"]) < self.max_len and self.max_len <= 10000 + max_length + and len(encoded_inputs["input_ids"]) < max_length + or max_length is None + and len(encoded_inputs["input_ids"]) < self.max_len + and self.max_len <= 10000 ) if pad_to_max_length and max_length is None and self.max_len > 10000: - logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.") + logger.warning( + "Sequence can't be padded as no maximum length is specified and the model maximum length is too high." + ) if needs_to_be_padded: difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"]) - if self.padding_side == 'right': + if self.padding_side == "right": if return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference if return_token_type_ids: - encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) if return_special_tokens_mask: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference - elif self.padding_side == 'left': + elif self.padding_side == "left": if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) if return_token_type_ids: - encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"] + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] if return_special_tokens_mask: encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] else: raise ValueError("Invalid padding strategy:" + str(self.padding_side)) - + elif return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) # Prepare inputs as tensors if asked - if return_tensors == 'tf' and is_tf_available(): + if return_tensors == "tf" and is_tf_available(): encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]]) encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]]) if "attention_mask" in encoded_inputs: encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]]) - elif return_tensors == 'pt' and is_torch_available(): + elif return_tensors == "pt" and is_torch_available(): encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]]) encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]]) @@ -1137,11 +1209,15 @@ class PreTrainedTokenizer(object): elif return_tensors is not None: logger.warning( "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( - return_tensors)) + return_tensors + ) + ) return encoded_inputs - def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): + def truncate_sequences( + self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0 + ): """Truncates a sequence pair in place to the maximum length. truncation_strategy: string selected in the following options: - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length @@ -1154,7 +1230,7 @@ class PreTrainedTokenizer(object): if num_tokens_to_remove <= 0: return ids, pair_ids, [] - if truncation_strategy == 'longest_first': + if truncation_strategy == "longest_first": overflowing_tokens = [] for _ in range(num_tokens_to_remove): if pair_ids is None or len(ids) > len(pair_ids): @@ -1165,20 +1241,22 @@ class PreTrainedTokenizer(object): window_len = min(len(ids), stride) if window_len > 0: overflowing_tokens = ids[-window_len:] + overflowing_tokens - elif truncation_strategy == 'only_first': + elif truncation_strategy == "only_first": assert len(ids) > num_tokens_to_remove window_len = min(len(ids), stride + num_tokens_to_remove) overflowing_tokens = ids[-window_len:] ids = ids[:-num_tokens_to_remove] - elif truncation_strategy == 'only_second': + elif truncation_strategy == "only_second": assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove window_len = min(len(pair_ids), stride + num_tokens_to_remove) overflowing_tokens = pair_ids[-window_len:] pair_ids = pair_ids[:-num_tokens_to_remove] - elif truncation_strategy == 'do_not_truncate': + elif truncation_strategy == "do_not_truncate": raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.") else: - raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']") + raise ValueError( + "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" + ) return (ids, pair_ids, overflowing_tokens) def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): @@ -1246,7 +1324,7 @@ class PreTrainedTokenizer(object): The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) but we often want to remove sub-word tokenization artifacts at the same time. """ - return ' '.join(self.convert_ids_to_tokens(tokens)) + return " ".join(self.convert_ids_to_tokens(tokens)) def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): """ @@ -1278,7 +1356,7 @@ class PreTrainedTokenizer(object): current_sub_text.append(token) if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) - text = ' '.join(sub_texts) + text = " ".join(sub_texts) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) @@ -1323,7 +1401,17 @@ class PreTrainedTokenizer(object): def clean_up_tokenization(out_string): """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. """ - out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' - ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" - ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" do not", " don't") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) return out_string diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index 8def80bec4..9b96b92f23 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for XLM.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import json import logging @@ -32,386 +31,402 @@ from .tokenization_bert import BasicTokenizer logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = { - 'vocab_file': 'vocab.json', - 'merges_file': 'merges.txt', + "vocab_file": "vocab.json", + "merges_file": "merges.txt", } PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json", + "vocab_file": { + "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json", + "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json", + "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json", + "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json", + "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json", + "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json", + "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json", + "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", + "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", + "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json", }, - 'merges_file': - { - 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", - 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", - 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", - 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt", - 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt", - 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt", - 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", - 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", - 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", - 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt", + "merges_file": { + "xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", + "xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", + "xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", + "xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt", + "xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt", + "xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt", + "xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", + "xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", + "xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", + "xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt", }, } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'xlm-mlm-en-2048': 512, - 'xlm-mlm-ende-1024': 512, - 'xlm-mlm-enfr-1024': 512, - 'xlm-mlm-enro-1024': 512, - 'xlm-mlm-tlm-xnli15-1024': 512, - 'xlm-mlm-xnli15-1024': 512, - 'xlm-clm-enfr-1024': 512, - 'xlm-clm-ende-1024': 512, - 'xlm-mlm-17-1280': 512, - 'xlm-mlm-100-1280': 512, + "xlm-mlm-en-2048": 512, + "xlm-mlm-ende-1024": 512, + "xlm-mlm-enfr-1024": 512, + "xlm-mlm-enro-1024": 512, + "xlm-mlm-tlm-xnli15-1024": 512, + "xlm-mlm-xnli15-1024": 512, + "xlm-clm-enfr-1024": 512, + "xlm-clm-ende-1024": 512, + "xlm-mlm-17-1280": 512, + "xlm-mlm-100-1280": 512, } PRETRAINED_INIT_CONFIGURATION = { - 'xlm-mlm-en-2048': {"do_lowercase_and_remove_accent": True}, - 'xlm-mlm-ende-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "de", - "1": "en"}, - "lang2id": { "de": 0, - "en": 1 }}, - 'xlm-mlm-enfr-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "en", - "1": "fr"}, - "lang2id": { "en": 0, - "fr": 1 }}, - 'xlm-mlm-enro-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "en", - "1": "ro"}, - "lang2id": { "en": 0, - "ro": 1 }}, - 'xlm-mlm-tlm-xnli15-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "ar", - "1": "bg", - "2": "de", - "3": "el", - "4": "en", - "5": "es", - "6": "fr", - "7": "hi", - "8": "ru", - "9": "sw", - "10": "th", - "11": "tr", - "12": "ur", - "13": "vi", - "14": "zh"}, - "lang2id": { "ar": 0, - "bg": 1, - "de": 2, - "el": 3, - "en": 4, - "es": 5, - "fr": 6, - "hi": 7, - "ru": 8, - "sw": 9, - "th": 10, - "tr": 11, - "ur": 12, - "vi": 13, - "zh": 14 }}, - 'xlm-mlm-xnli15-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "ar", - "1": "bg", - "2": "de", - "3": "el", - "4": "en", - "5": "es", - "6": "fr", - "7": "hi", - "8": "ru", - "9": "sw", - "10": "th", - "11": "tr", - "12": "ur", - "13": "vi", - "14": "zh"}, - "lang2id": { "ar": 0, - "bg": 1, - "de": 2, - "el": 3, - "en": 4, - "es": 5, - "fr": 6, - "hi": 7, - "ru": 8, - "sw": 9, - "th": 10, - "tr": 11, - "ur": 12, - "vi": 13, - "zh": 14 }}, - 'xlm-clm-enfr-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "en", - "1": "fr"}, - "lang2id": { "en": 0, - "fr": 1 }}, - 'xlm-clm-ende-1024': { "do_lowercase_and_remove_accent": True, - "id2lang": { "0": "de", - "1": "en"}, - "lang2id": { "de": 0, - "en": 1 }}, - 'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False, - "id2lang": { - "0": "ar", - "1": "de", - "2": "en", - "3": "es", - "4": "fr", - "5": "hi", - "6": "it", - "7": "ja", - "8": "ko", - "9": "nl", - "10": "pl", - "11": "pt", - "12": "ru", - "13": "sv", - "14": "tr", - "15": "vi", - "16": "zh" - }, - "lang2id": { - "ar": 0, - "de": 1, - "en": 2, - "es": 3, - "fr": 4, - "hi": 5, - "it": 6, - "ja": 7, - "ko": 8, - "nl": 9, - "pl": 10, - "pt": 11, - "ru": 12, - "sv": 13, - "tr": 14, - "vi": 15, - "zh": 16}}, - 'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False, - "id2lang": { - "0": "af", - "1": "als", - "2": "am", - "3": "an", - "4": "ang", - "5": "ar", - "6": "arz", - "7": "ast", - "8": "az", - "9": "bar", - "10": "be", - "11": "bg", - "12": "bn", - "13": "br", - "14": "bs", - "15": "ca", - "16": "ceb", - "17": "ckb", - "18": "cs", - "19": "cy", - "20": "da", - "21": "de", - "22": "el", - "23": "en", - "24": "eo", - "25": "es", - "26": "et", - "27": "eu", - "28": "fa", - "29": "fi", - "30": "fr", - "31": "fy", - "32": "ga", - "33": "gan", - "34": "gl", - "35": "gu", - "36": "he", - "37": "hi", - "38": "hr", - "39": "hu", - "40": "hy", - "41": "ia", - "42": "id", - "43": "is", - "44": "it", - "45": "ja", - "46": "jv", - "47": "ka", - "48": "kk", - "49": "kn", - "50": "ko", - "51": "ku", - "52": "la", - "53": "lb", - "54": "lt", - "55": "lv", - "56": "mk", - "57": "ml", - "58": "mn", - "59": "mr", - "60": "ms", - "61": "my", - "62": "nds", - "63": "ne", - "64": "nl", - "65": "nn", - "66": "no", - "67": "oc", - "68": "pl", - "69": "pt", - "70": "ro", - "71": "ru", - "72": "scn", - "73": "sco", - "74": "sh", - "75": "si", - "76": "simple", - "77": "sk", - "78": "sl", - "79": "sq", - "80": "sr", - "81": "sv", - "82": "sw", - "83": "ta", - "84": "te", - "85": "th", - "86": "tl", - "87": "tr", - "88": "tt", - "89": "uk", - "90": "ur", - "91": "uz", - "92": "vi", - "93": "war", - "94": "wuu", - "95": "yi", - "96": "zh", - "97": "zh_classical", - "98": "zh_min_nan", - "99": "zh_yue" - }, - "lang2id": { - "af": 0, - "als": 1, - "am": 2, - "an": 3, - "ang": 4, - "ar": 5, - "arz": 6, - "ast": 7, - "az": 8, - "bar": 9, - "be": 10, - "bg": 11, - "bn": 12, - "br": 13, - "bs": 14, - "ca": 15, - "ceb": 16, - "ckb": 17, - "cs": 18, - "cy": 19, - "da": 20, - "de": 21, - "el": 22, - "en": 23, - "eo": 24, - "es": 25, - "et": 26, - "eu": 27, - "fa": 28, - "fi": 29, - "fr": 30, - "fy": 31, - "ga": 32, - "gan": 33, - "gl": 34, - "gu": 35, - "he": 36, - "hi": 37, - "hr": 38, - "hu": 39, - "hy": 40, - "ia": 41, - "id": 42, - "is": 43, - "it": 44, - "ja": 45, - "jv": 46, - "ka": 47, - "kk": 48, - "kn": 49, - "ko": 50, - "ku": 51, - "la": 52, - "lb": 53, - "lt": 54, - "lv": 55, - "mk": 56, - "ml": 57, - "mn": 58, - "mr": 59, - "ms": 60, - "my": 61, - "nds": 62, - "ne": 63, - "nl": 64, - "nn": 65, - "no": 66, - "oc": 67, - "pl": 68, - "pt": 69, - "ro": 70, - "ru": 71, - "scn": 72, - "sco": 73, - "sh": 74, - "si": 75, - "simple": 76, - "sk": 77, - "sl": 78, - "sq": 79, - "sr": 80, - "sv": 81, - "sw": 82, - "ta": 83, - "te": 84, - "th": 85, - "tl": 86, - "tr": 87, - "tt": 88, - "uk": 89, - "ur": 90, - "uz": 91, - "vi": 92, - "war": 93, - "wuu": 94, - "yi": 95, - "zh": 96, - "zh_classical": 97, - "zh_min_nan": 98, - "zh_yue": 99 - }}, + "xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True}, + "xlm-mlm-ende-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {"0": "de", "1": "en"}, + "lang2id": {"de": 0, "en": 1}, + }, + "xlm-mlm-enfr-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {"0": "en", "1": "fr"}, + "lang2id": {"en": 0, "fr": 1}, + }, + "xlm-mlm-enro-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {"0": "en", "1": "ro"}, + "lang2id": {"en": 0, "ro": 1}, + }, + "xlm-mlm-tlm-xnli15-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": { + "0": "ar", + "1": "bg", + "2": "de", + "3": "el", + "4": "en", + "5": "es", + "6": "fr", + "7": "hi", + "8": "ru", + "9": "sw", + "10": "th", + "11": "tr", + "12": "ur", + "13": "vi", + "14": "zh", + }, + "lang2id": { + "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14, + }, + }, + "xlm-mlm-xnli15-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": { + "0": "ar", + "1": "bg", + "2": "de", + "3": "el", + "4": "en", + "5": "es", + "6": "fr", + "7": "hi", + "8": "ru", + "9": "sw", + "10": "th", + "11": "tr", + "12": "ur", + "13": "vi", + "14": "zh", + }, + "lang2id": { + "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14, + }, + }, + "xlm-clm-enfr-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {"0": "en", "1": "fr"}, + "lang2id": {"en": 0, "fr": 1}, + }, + "xlm-clm-ende-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {"0": "de", "1": "en"}, + "lang2id": {"de": 0, "en": 1}, + }, + "xlm-mlm-17-1280": { + "do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "ar", + "1": "de", + "2": "en", + "3": "es", + "4": "fr", + "5": "hi", + "6": "it", + "7": "ja", + "8": "ko", + "9": "nl", + "10": "pl", + "11": "pt", + "12": "ru", + "13": "sv", + "14": "tr", + "15": "vi", + "16": "zh", + }, + "lang2id": { + "ar": 0, + "de": 1, + "en": 2, + "es": 3, + "fr": 4, + "hi": 5, + "it": 6, + "ja": 7, + "ko": 8, + "nl": 9, + "pl": 10, + "pt": 11, + "ru": 12, + "sv": 13, + "tr": 14, + "vi": 15, + "zh": 16, + }, + }, + "xlm-mlm-100-1280": { + "do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "af", + "1": "als", + "2": "am", + "3": "an", + "4": "ang", + "5": "ar", + "6": "arz", + "7": "ast", + "8": "az", + "9": "bar", + "10": "be", + "11": "bg", + "12": "bn", + "13": "br", + "14": "bs", + "15": "ca", + "16": "ceb", + "17": "ckb", + "18": "cs", + "19": "cy", + "20": "da", + "21": "de", + "22": "el", + "23": "en", + "24": "eo", + "25": "es", + "26": "et", + "27": "eu", + "28": "fa", + "29": "fi", + "30": "fr", + "31": "fy", + "32": "ga", + "33": "gan", + "34": "gl", + "35": "gu", + "36": "he", + "37": "hi", + "38": "hr", + "39": "hu", + "40": "hy", + "41": "ia", + "42": "id", + "43": "is", + "44": "it", + "45": "ja", + "46": "jv", + "47": "ka", + "48": "kk", + "49": "kn", + "50": "ko", + "51": "ku", + "52": "la", + "53": "lb", + "54": "lt", + "55": "lv", + "56": "mk", + "57": "ml", + "58": "mn", + "59": "mr", + "60": "ms", + "61": "my", + "62": "nds", + "63": "ne", + "64": "nl", + "65": "nn", + "66": "no", + "67": "oc", + "68": "pl", + "69": "pt", + "70": "ro", + "71": "ru", + "72": "scn", + "73": "sco", + "74": "sh", + "75": "si", + "76": "simple", + "77": "sk", + "78": "sl", + "79": "sq", + "80": "sr", + "81": "sv", + "82": "sw", + "83": "ta", + "84": "te", + "85": "th", + "86": "tl", + "87": "tr", + "88": "tt", + "89": "uk", + "90": "ur", + "91": "uz", + "92": "vi", + "93": "war", + "94": "wuu", + "95": "yi", + "96": "zh", + "97": "zh_classical", + "98": "zh_min_nan", + "99": "zh_yue", + }, + "lang2id": { + "af": 0, + "als": 1, + "am": 2, + "an": 3, + "ang": 4, + "ar": 5, + "arz": 6, + "ast": 7, + "az": 8, + "bar": 9, + "be": 10, + "bg": 11, + "bn": 12, + "br": 13, + "bs": 14, + "ca": 15, + "ceb": 16, + "ckb": 17, + "cs": 18, + "cy": 19, + "da": 20, + "de": 21, + "el": 22, + "en": 23, + "eo": 24, + "es": 25, + "et": 26, + "eu": 27, + "fa": 28, + "fi": 29, + "fr": 30, + "fy": 31, + "ga": 32, + "gan": 33, + "gl": 34, + "gu": 35, + "he": 36, + "hi": 37, + "hr": 38, + "hu": 39, + "hy": 40, + "ia": 41, + "id": 42, + "is": 43, + "it": 44, + "ja": 45, + "jv": 46, + "ka": 47, + "kk": 48, + "kn": 49, + "ko": 50, + "ku": 51, + "la": 52, + "lb": 53, + "lt": 54, + "lv": 55, + "mk": 56, + "ml": 57, + "mn": 58, + "mr": 59, + "ms": 60, + "my": 61, + "nds": 62, + "ne": 63, + "nl": 64, + "nn": 65, + "no": 66, + "oc": 67, + "pl": 68, + "pt": 69, + "ro": 70, + "ru": 71, + "scn": 72, + "sco": 73, + "sh": 74, + "si": 75, + "simple": 76, + "sk": 77, + "sl": 78, + "sq": 79, + "sr": 80, + "sv": 81, + "sw": 82, + "ta": 83, + "te": 84, + "th": 85, + "tl": 86, + "tr": 87, + "tt": 88, + "uk": 89, + "ur": 90, + "uz": 91, + "vi": 92, + "war": 93, + "wuu": 94, + "yi": 95, + "zh": 96, + "zh_classical": 97, + "zh_min_nan": 98, + "zh_yue": 99, + }, + }, } + def get_pairs(word): """ Return set of symbol pairs in a word. @@ -430,7 +445,7 @@ def lowercase_and_remove_accent(text): Lowercase and strips accents from a piece of text based on https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py """ - text = ' '.join(text) + text = " ".join(text) text = text.lower() text = unicodedata.normalize("NFD", text) output = [] @@ -439,73 +454,73 @@ def lowercase_and_remove_accent(text): if cat == "Mn": continue output.append(char) - return "".join(output).lower().split(' ') + return "".join(output).lower().split(" ") def replace_unicode_punct(text): - ''' + """ Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl - ''' - text = text.replace(',', ',') - text = re.sub(r'。\s*', '. ', text) - text = text.replace('、', ',') - text = text.replace('”', '"') - text = text.replace('“', '"') - text = text.replace('∶', ':') - text = text.replace(':', ':') - text = text.replace('?', '?') - text = text.replace('《', '"') - text = text.replace('》', '"') - text = text.replace(')', ')') - text = text.replace('!', '!') - text = text.replace('(', '(') - text = text.replace(';', ';') - text = text.replace('1', '"') - text = text.replace('」', '"') - text = text.replace('「', '"') - text = text.replace('0', '0') - text = text.replace('3', '3') - text = text.replace('2', '2') - text = text.replace('5', '5') - text = text.replace('6', '6') - text = text.replace('9', '9') - text = text.replace('7', '7') - text = text.replace('8', '8') - text = text.replace('4', '4') - text = re.sub(r'.\s*', '. ', text) - text = text.replace('~', '~') - text = text.replace('’', '\'') - text = text.replace('…', '...') - text = text.replace('━', '-') - text = text.replace('〈', '<') - text = text.replace('〉', '>') - text = text.replace('【', '[') - text = text.replace('】', ']') - text = text.replace('%', '%') + """ + text = text.replace(",", ",") + text = re.sub(r"。\s*", ". ", text) + text = text.replace("、", ",") + text = text.replace("”", '"') + text = text.replace("“", '"') + text = text.replace("∶", ":") + text = text.replace(":", ":") + text = text.replace("?", "?") + text = text.replace("《", '"') + text = text.replace("》", '"') + text = text.replace(")", ")") + text = text.replace("!", "!") + text = text.replace("(", "(") + text = text.replace(";", ";") + text = text.replace("1", '"') + text = text.replace("」", '"') + text = text.replace("「", '"') + text = text.replace("0", "0") + text = text.replace("3", "3") + text = text.replace("2", "2") + text = text.replace("5", "5") + text = text.replace("6", "6") + text = text.replace("9", "9") + text = text.replace("7", "7") + text = text.replace("8", "8") + text = text.replace("4", "4") + text = re.sub(r".\s*", ". ", text) + text = text.replace("~", "~") + text = text.replace("’", "'") + text = text.replace("…", "...") + text = text.replace("━", "-") + text = text.replace("〈", "<") + text = text.replace("〉", ">") + text = text.replace("【", "[") + text = text.replace("】", "]") + text = text.replace("%", "%") return text def remove_non_printing_char(text): - ''' + """ Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl - ''' + """ output = [] for char in text: cat = unicodedata.category(char) - if cat.startswith('C'): + if cat.startswith("C"): continue output.append(char) return "".join(output) def romanian_preprocessing(text): - '''Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`''' + """Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`""" # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219") text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b") # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py - text = text.replace("\u0218", "S").replace("\u0219", "s") #s-comma - text = text.replace("\u021a", "T").replace("\u021b", "t") #t-comma + text = text.replace("\u0218", "S").replace("\u0219", "s") # s-comma + text = text.replace("\u021a", "T").replace("\u021b", "t") # t-comma text = text.replace("\u0102", "A").replace("\u0103", "a") text = text.replace("\u00C2", "A").replace("\u00E2", "a") text = text.replace("\u00CE", "I").replace("\u00EE", "i") @@ -531,33 +546,58 @@ class XLMTokenizer(PreTrainedTokenizer): - `do_lowercase_and_remove_accent` controle lower casing and accent (automatically set for pretrained vocabularies) """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, merges_file, unk_token="", bos_token="", - sep_token="", pad_token="", cls_token="", - mask_token="", additional_special_tokens=["", - "", "", "", "", "", - "", "", "", ""], - lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True, - **kwargs): - super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token, - sep_token=sep_token, pad_token=pad_token, - cls_token=cls_token, mask_token=mask_token, - additional_special_tokens=additional_special_tokens, - **kwargs) - + def __init__( + self, + vocab_file, + merges_file, + unk_token="", + bos_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + additional_special_tokens=[ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + lang2id=None, + id2lang=None, + do_lowercase_and_remove_accent=True, + **kwargs + ): + super(XLMTokenizer, self).__init__( + unk_token=unk_token, + bos_token=bos_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens - self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens # cache of sm.MosesPunctNormalizer instance self.cache_moses_punct_normalizer = dict() # cache of sm.MosesTokenizer instance self.cache_moses_tokenizer = dict() - self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja']) + self.lang_with_custom_tokenizer = set(["zh", "th", "ja"]) # True for current supported model (v1.2.0), False for XLM-17 & 100 self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent self.lang2id = lang2id @@ -570,9 +610,9 @@ class XLMTokenizer(PreTrainedTokenizer): with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) - self.decoder = {v:k for k,v in self.encoder.items()} - with open(merges_file, encoding='utf-8') as merges_handle: - merges = merges_handle.read().split('\n')[:-1] + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[:-1] merges = [tuple(merge.split()[:2]) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} @@ -603,9 +643,14 @@ class XLMTokenizer(PreTrainedTokenizer): if self.ja_word_tokenizer is None: try: import Mykytea - self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~')) + + self.ja_word_tokenizer = Mykytea.Mykytea( + "-model %s/local/share/kytea/model.bin" % os.path.expanduser("~") + ) except (AttributeError, ImportError) as e: - logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps") + logger.error( + "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps" + ) logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") logger.error("2. autoreconf -i") logger.error("3. ./configure --prefix=$HOME/local") @@ -619,16 +664,16 @@ class XLMTokenizer(PreTrainedTokenizer): return len(self.encoder) def bpe(self, token): - word = tuple(token[:-1]) + (token[-1] + '',) + word = tuple(token[:-1]) + (token[-1] + "",) if token in self.cache: return self.cache[token] pairs = get_pairs(word) if not pairs: - return token+'' + return token + "" while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) if bigram not in self.bpe_ranks: break first, second = bigram @@ -643,8 +688,8 @@ class XLMTokenizer(PreTrainedTokenizer): new_word.extend(word[i:]) break - if word[i] == first and i < len(word)-1 and word[i+1] == second: - new_word.append(first+second) + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) i += 2 else: new_word.append(word[i]) @@ -655,13 +700,13 @@ class XLMTokenizer(PreTrainedTokenizer): break else: pairs = get_pairs(word) - word = ' '.join(word) - if word == '\n ': - word = '\n' + word = " ".join(word) + if word == "\n ": + word = "\n" self.cache[token] = word return word - def _tokenize(self, text, lang='en', bypass_tokenizer=False): + def _tokenize(self, text, lang="en", bypass_tokenizer=False): """ Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizerself. Otherwise, we use Moses. @@ -697,45 +742,49 @@ class XLMTokenizer(PreTrainedTokenizer): List of tokens. """ if lang and self.lang2id and lang not in self.lang2id: - logger.error("Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model.") + logger.error( + "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." + ) if bypass_tokenizer: text = text.split() elif lang not in self.lang_with_custom_tokenizer: text = self.moses_pipeline(text, lang=lang) # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step - if lang == 'ro': + if lang == "ro": text = romanian_preprocessing(text) text = self.moses_tokenize(text, lang=lang) - elif lang == 'th': + elif lang == "th": text = self.moses_pipeline(text, lang=lang) try: - if 'pythainlp' not in sys.modules: + if "pythainlp" not in sys.modules: from pythainlp.tokenize import word_tokenize as th_word_tokenize else: - th_word_tokenize = sys.modules['pythainlp'].word_tokenize + th_word_tokenize = sys.modules["pythainlp"].word_tokenize except (AttributeError, ImportError) as e: - logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps") + logger.error( + "Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps" + ) logger.error("1. pip install pythainlp") raise e text = th_word_tokenize(text) - elif lang == 'zh': + elif lang == "zh": try: - if 'jieba' not in sys.modules: + if "jieba" not in sys.modules: import jieba else: - jieba = sys.modules['jieba'] + jieba = sys.modules["jieba"] except (AttributeError, ImportError) as e: logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps") logger.error("1. pip install jieba") raise e - text = ' '.join(jieba.cut(text)) + text = " ".join(jieba.cut(text)) text = self.moses_pipeline(text, lang=lang) text = text.split() - elif lang == 'ja': + elif lang == "ja": text = self.moses_pipeline(text, lang=lang) text = self.ja_tokenize(text) else: - raise ValueError('It should not reach here') + raise ValueError("It should not reach here") if self.do_lowercase_and_remove_accent and not bypass_tokenizer: text = lowercase_and_remove_accent(text) @@ -743,7 +792,7 @@ class XLMTokenizer(PreTrainedTokenizer): split_tokens = [] for token in text: if token: - split_tokens.extend([t for t in self.bpe(token).split(' ')]) + split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens @@ -757,7 +806,7 @@ class XLMTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ''.join(tokens).replace('', ' ').strip() + out_string = "".join(tokens).replace("", " ").strip() return out_string def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -792,8 +841,10 @@ class XLMTokenizer(PreTrainedTokenizer): if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: @@ -820,20 +871,22 @@ class XLMTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) - with open(vocab_file, 'w', encoding='utf-8') as f: + with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) index = 0 with open(merge_file, "w", encoding="utf-8") as writer: for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): if index != token_index: - logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!".format(merge_file)) + logger.warning( + "Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file) + ) index = token_index - writer.write(' '.join(bpe_tokens) + u'\n') + writer.write(" ".join(bpe_tokens) + "\n") index += 1 return vocab_file, merge_file diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py index adbc8cd6c7..30814c3a1d 100644 --- a/transformers/tokenization_xlm_roberta.py +++ b/transformers/tokenization_xlm_roberta.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License """ Tokenization classes for XLM-RoBERTa model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -26,29 +25,29 @@ from .tokenization_xlnet import SPIECE_UNDERLINE logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model", - 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model", - 'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model", - 'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model", - 'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model", - 'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model", + "vocab_file": { + "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model", + "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'xlm-roberta-base': 512, - 'xlm-roberta-large': 512, - 'xlm-roberta-large-finetuned-conll02-dutch': 512, - 'xlm-roberta-large-finetuned-conll02-spanish': 512, - 'xlm-roberta-large-finetuned-conll03-english': 512, - 'xlm-roberta-large-finetuned-conll03-german': 512, + "xlm-roberta-base": 512, + "xlm-roberta-large": 512, + "xlm-roberta-large-finetuned-conll02-dutch": 512, + "xlm-roberta-large-finetuned-conll02-spanish": 512, + "xlm-roberta-large-finetuned-conll03-english": 512, + "xlm-roberta-large-finetuned-conll03-german": 512, } + class XLMRobertaTokenizer(PreTrainedTokenizer): """ Adapted from RobertaTokenizer and XLNetTokenizer @@ -56,17 +55,33 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): - requires `SentencePiece `_ """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, bos_token="", eos_token="", sep_token="", - cls_token="", unk_token="", pad_token='', mask_token='', - **kwargs): - super(XLMRobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, - sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, - mask_token=mask_token, - **kwargs) + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + **kwargs + ): + super(XLMRobertaTokenizer, self).__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens self.sp_model = spm.SentencePieceProcessor() @@ -85,7 +100,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab self.fairseq_offset = 1 - self.fairseq_tokens_to_ids[''] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -119,8 +134,10 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): """ if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is None: @@ -164,7 +181,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def save_vocabulary(self, save_directory): @@ -174,7 +191,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index a8369df67b..8ea0ccb77b 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ Tokenization classes for XLNet model.""" -from __future__ import (absolute_import, division, print_function, - unicode_literals) +from __future__ import absolute_import, division, print_function, unicode_literals import logging import os @@ -27,51 +26,69 @@ from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) -VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'} +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} PRETRAINED_VOCAB_FILES_MAP = { - 'vocab_file': - { - 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model", - 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model", + "vocab_file": { + "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model", + "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'xlnet-base-cased': None, - 'xlnet-large-cased': None, + "xlnet-base-cased": None, + "xlnet-large-cased": None, } -SPIECE_UNDERLINE = u'▁' +SPIECE_UNDERLINE = "▁" # Segments (not really needed) -SEG_ID_A = 0 -SEG_ID_B = 1 +SEG_ID_A = 0 +SEG_ID_B = 1 SEG_ID_CLS = 2 SEG_ID_SEP = 3 SEG_ID_PAD = 4 + class XLNetTokenizer(PreTrainedTokenizer): """ SentencePiece based tokenizer. Peculiarities: - requires `SentencePiece `_ """ + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES padding_side = "left" - def __init__(self, vocab_file, - do_lower_case=False, remove_space=True, keep_accents=False, - bos_token="", eos_token="", unk_token="", sep_token="", - pad_token="", cls_token="", mask_token="", - additional_special_tokens=["", ""], **kwargs): - super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, - unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, additional_special_tokens= - additional_special_tokens, **kwargs) + def __init__( + self, + vocab_file, + do_lower_case=False, + remove_space=True, + keep_accents=False, + bos_token="", + eos_token="", + unk_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + additional_special_tokens=["", ""], + **kwargs + ): + super(XLNetTokenizer, self).__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs + ) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens @@ -80,8 +97,10 @@ class XLNetTokenizer(PreTrainedTokenizer): try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.do_lower_case = do_lower_case self.remove_space = remove_space @@ -105,24 +124,26 @@ class XLNetTokenizer(PreTrainedTokenizer): try: import sentencepiece as spm except ImportError: - logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece") + logger.warning( + "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" + "pip install sentencepiece" + ) self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): if self.remove_space: - outputs = ' '.join(inputs.strip().split()) + outputs = " ".join(inputs.strip().split()) else: outputs = inputs outputs = outputs.replace("``", '"').replace("''", '"') if six.PY2 and isinstance(outputs, str): - outputs = outputs.decode('utf-8') + outputs = outputs.decode("utf-8") if not self.keep_accents: - outputs = unicodedata.normalize('NFKD', outputs) - outputs = ''.join([c for c in outputs if not unicodedata.combining(c)]) + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) if self.do_lower_case: outputs = outputs.lower() @@ -135,7 +156,7 @@ class XLNetTokenizer(PreTrainedTokenizer): text = self.preprocess_text(text) # note(zhiliny): in some systems, sentencepiece only accepts str for py2 if six.PY2 and isinstance(text, unicode): - text = text.encode('utf-8') + text = text.encode("utf-8") if not sample: pieces = self.sp_model.EncodeAsPieces(text) @@ -143,9 +164,8 @@ class XLNetTokenizer(PreTrainedTokenizer): pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) new_pieces = [] for piece in pieces: - if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit(): - cur_pieces = self.sp_model.EncodeAsPieces( - piece[:-1].replace(SPIECE_UNDERLINE, '')) + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: if len(cur_pieces[0]) == 1: cur_pieces = cur_pieces[1:] @@ -161,7 +181,7 @@ class XLNetTokenizer(PreTrainedTokenizer): ret_pieces = [] for piece in new_pieces: if isinstance(piece, str): - piece = piece.decode('utf-8') + piece = piece.decode("utf-8") ret_pieces.append(piece) new_pieces = ret_pieces @@ -175,12 +195,12 @@ class XLNetTokenizer(PreTrainedTokenizer): """Converts an index (integer) in a token (string/unicode) using the vocab.""" token = self.sp_model.IdToPiece(index) if six.PY2 and return_unicode and isinstance(token, str): - token = token.decode('utf-8') + token = token.decode("utf-8") return token def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -215,8 +235,10 @@ class XLNetTokenizer(PreTrainedTokenizer): if already_has_special_tokens: if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) if token_ids_1 is not None: @@ -247,7 +269,7 @@ class XLNetTokenizer(PreTrainedTokenizer): if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py index de8cfa9e73..7262dd7201 100644 --- a/utils/download_glue_data.py +++ b/utils/download_glue_data.py @@ -1,4 +1,4 @@ -''' Script for downloading all GLUE data. +""" Script for downloading all GLUE data. Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e Note: for legal reasons, we are unable to host MRPC. @@ -16,7 +16,7 @@ rm MSRParaphraseCorpus.msi 1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now. 2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray! -''' +""" import os import sys @@ -27,20 +27,23 @@ import urllib.request import zipfile TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"] -TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4', - "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8', - "MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc', - "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5', - "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5', - "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce', - "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df', - "QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601', - "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb', - "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf', - "diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'} +TASK2PATH = { + "CoLA": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4", + "SST": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8", + "MRPC": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc", + "QQP": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5", + "STS": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5", + "MNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce", + "SNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df", + "QNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601", + "RTE": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb", + "WNLI": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf", + "diagnostic": "https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D", +} + +MRPC_TRAIN = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt" +MRPC_TEST = "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt" -MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt' -MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt' def download_and_extract(task, data_dir): print("Downloading and extracting %s..." % task) @@ -51,6 +54,7 @@ def download_and_extract(task, data_dir): os.remove(data_file) print("\tCompleted!") + def format_mrpc(data_dir, path_to_data): print("Processing MRPC...") mrpc_dir = os.path.join(data_dir, "MRPC") @@ -72,30 +76,32 @@ def format_mrpc(data_dir, path_to_data): dev_ids = [] with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh: for row in ids_fh: - dev_ids.append(row.strip().split('\t')) + dev_ids.append(row.strip().split("\t")) - with open(mrpc_train_file, encoding="utf8") as data_fh, \ - open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \ - open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh: + with open(mrpc_train_file, encoding="utf8") as data_fh, open( + os.path.join(mrpc_dir, "train.tsv"), "w", encoding="utf8" + ) as train_fh, open(os.path.join(mrpc_dir, "dev.tsv"), "w", encoding="utf8") as dev_fh: header = data_fh.readline() train_fh.write(header) dev_fh.write(header) for row in data_fh: - label, id1, id2, s1, s2 = row.strip().split('\t') + label, id1, id2, s1, s2 = row.strip().split("\t") if [id1, id2] in dev_ids: dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) else: train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) - with open(mrpc_test_file, encoding="utf8") as data_fh, \ - open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh: + with open(mrpc_test_file, encoding="utf8") as data_fh, open( + os.path.join(mrpc_dir, "test.tsv"), "w", encoding="utf8" + ) as test_fh: header = data_fh.readline() test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n") for idx, row in enumerate(data_fh): - label, id1, id2, s1, s2 = row.strip().split('\t') + label, id1, id2, s1, s2 = row.strip().split("\t") test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2)) print("\tCompleted!") + def download_diagnostic(data_dir): print("Downloading and extracting diagnostic...") if not os.path.isdir(os.path.join(data_dir, "diagnostic")): @@ -105,8 +111,9 @@ def download_diagnostic(data_dir): print("\tCompleted!") return + def get_tasks(task_names): - task_names = task_names.split(',') + task_names = task_names.split(",") if "all" in task_names: tasks = TASKS else: @@ -116,13 +123,19 @@ def get_tasks(task_names): tasks.append(task_name) return tasks + def main(arguments): parser = argparse.ArgumentParser() - parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data') - parser.add_argument('--tasks', help='tasks to download data for as a comma separated string', - type=str, default='all') - parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt', - type=str, default='') + parser.add_argument("--data_dir", help="directory to save data to", type=str, default="glue_data") + parser.add_argument( + "--tasks", help="tasks to download data for as a comma separated string", type=str, default="all" + ) + parser.add_argument( + "--path_to_mrpc", + help="path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt", + type=str, + default="", + ) args = parser.parse_args(arguments) if not os.path.isdir(args.data_dir): @@ -130,13 +143,13 @@ def main(arguments): tasks = get_tasks(args.tasks) for task in tasks: - if task == 'MRPC': + if task == "MRPC": format_mrpc(args.data_dir, args.path_to_mrpc) - elif task == 'diagnostic': + elif task == "diagnostic": download_diagnostic(args.data_dir) else: download_and_extract(task, args.data_dir) -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main(sys.argv[1:])) diff --git a/utils/link_tester.py b/utils/link_tester.py index fe3990d28c..0ef165c401 100644 --- a/utils/link_tester.py +++ b/utils/link_tester.py @@ -43,7 +43,7 @@ def scan_code_for_links(source): """ Scans the file to find links using a regular expression. Returns a list of links. """ - with open(source, 'r') as content: + with open(source, "r") as content: content = content.read() raw_links = re.findall(REGEXP_FIND_S3_LINKS, content) links = [prefix + suffix for _, prefix, suffix in raw_links]