diff --git a/examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py b/examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index c245d0eae5..0000000000 --- a/examples/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,161 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Convert BertExtAbs's checkpoints """ - -import argparse -from collections import namedtuple -import logging -import pdb -import torch - -from models.model_builder import AbsSummarizer # The authors' implementation -from model_bertabs import BertAbsSummarizer - -from transformers import BertTokenizer - - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -SAMPLE_TEXT = 'Hello world! cécé herlolip' - - -BertAbsConfig = namedtuple( - "BertAbsConfig", - ["temp_dir", "large", "use_bert_emb", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"], -) - - -def convert_bertabs_checkpoints(path_to_checkpoints, dump_path): - """ Copy/paste and tweak the pre-trained weights provided by the creators - of BertAbs for the internal architecture. - """ - - # Instantiate the authors' model with the pre-trained weights - config = BertAbsConfig( - temp_dir=".", - finetune_bert=False, - large=False, - share_emb=True, - use_bert_emb=False, - encoder="bert", - max_pos=512, - enc_layers=6, - enc_hidden_size=512, - enc_heads=8, - enc_ff_size=512, - enc_dropout=0.2, - dec_layers=6, - dec_hidden_size=768, - dec_heads=8, - dec_ff_size=2048, - dec_dropout=0.2, - ) - checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage) - original = AbsSummarizer(config, torch.device("cpu"), checkpoints) - original.eval() - - new_model = BertAbsSummarizer(config, torch.device("cpu")) - new_model.eval() - - # ------------------- - # Convert the weights - # ------------------- - - logging.info("convert the model") - new_model.encoder.load_state_dict(original.bert.state_dict()) - - new_model.decoder.generator.load_state_dict(original.generator.state_dict()) - new_model.decoder.embeddings.load_state_dict(original.decoder.embeddings.state_dict()) - new_model.decoder.pos_emb.load_state_dict(original.decoder.pos_emb.state_dict()) - new_model.decoder.transformer_layers.load_state_dict(original.decoder.transformer_layers.state_dict()) - new_model.decoder.layer_norm.load_state_dict(original.decoder.layer_norm.state_dict()) - - # ---------------------------------- - # Make sure the outpus are identical - # ---------------------------------- - - logging.info("Make sure that the models' outputs are identical") - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - - # prepare the model inputs - encoder_input_ids = tokenizer.encode("This is sample éàalj'-.") - encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids))) - encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0) - decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.") - decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids))) - decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0) - - # failsafe to make sure the weights reset does not affect the - # loaded weights. - assert torch.max(torch.abs(original.generator[0].weight - new_model.decoder.generator[0].weight)) == 0 - - # forward pass - src = encoder_input_ids - tgt = decoder_input_ids - segs = token_type_ids = None - clss = None - mask_src = encoder_attention_mask = None - mask_tgt = decoder_attention_mask = None - mask_cls = None - - # The original model does not apply the geneator layer immediatly but rather in - # the beam search (where it combines softmax + linear layer). Since we already - # apply the softmax in our generation process we only apply the linear layer here. - # We make sure that the outputs of the full stack are identical - output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0] - output_original_model = original.generator(output_original_model) - - output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids, encoder_attention_mask, decoder_attention_mask)[0] - output_converted_model = torch.nn.functional.log_softmax(output_converted_model, dim=-1) - - maximum_absolute_difference = torch.max(torch.abs(output_converted_model - output_original_model)).item() - print("Maximum absolute difference beween weights: {:.2f}".format(maximum_absolute_difference)) - - are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3) - if are_identical: - logging.info("all weights are equal up to 1e-3") - else: - raise ValueError("the weights are different. The new model is likely different from the original one.") - - # The model has been saved with torch.save(model) and this is bound to the exact - # directory structure. We save the state_dict instead. - logging.info("saving the model's state dictionary") - torch.save(new_model.state_dict(), "bert-ext-abs.pt") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--bertabs_checkpoint_path", - default=None, - type=str, - required=True, - help="Path the official PyTorch dump.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model.", - ) - args = parser.parse_args() - - convert_bertabs_checkpoints( - args.bertabs_checkpoint_path, - args.pytorch_dump_folder_path, - ) diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py index d989e4fd7e..5bf1599ad2 100644 --- a/examples/summarization/modeling_bertabs.py +++ b/examples/summarization/modeling_bertabs.py @@ -1,6 +1,6 @@ # MIT License -# Copyright (c) 2019 Yang Liu +# Copyright (c) 2019 Yang Liu and the HuggingFace team # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/examples/summarization/requirements.txt b/examples/summarization/requirements.txt new file mode 100644 index 0000000000..36d75a5edc --- /dev/null +++ b/examples/summarization/requirements.txt @@ -0,0 +1,9 @@ +# progress bars in model download and training scripts +tqdm +# Accessing files from S3 directly. +boto3 +# Used for downloading models over HTTP +requests +# For ROUGE +nltk +py-rouge diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index c388569869..f58ce3bb43 100644 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -1,3 +1,4 @@ +#! /usr/bin/python3 import argparse from collections import namedtuple import logging @@ -97,6 +98,32 @@ def evaluate(args): print(str_scores) +def save_summaries(summaries, path, original_document_name): + """ Write the summaries in fies that are prefixed by the original + files' name with the `_summary` appended. + + Attributes: + original_document_names: List[string] + Name of the document that was summarized. + path: string + Path were the summaries will be written + summaries: List[string] + The summaries that we produced. + """ + for summary, document_name in zip(summaries, original_document_name): + # Prepare the summary file's name + if "." in document_name: + bare_document_name = ".".join(document_name.split(".")[:-1]) + extension = document_name.split(".")[-1] + name = bare_document_name + "_summary." + extension + else: + name = document_name + "_summary" + + file_path = os.path.join(path, name) + with open(file_path, "w") as output: + output.write(summary) + + def format_summary(translation): """ Transforms the output of the `from_batch` function into nicely formatted summaries. @@ -151,32 +178,6 @@ def save_rouge_scores(str_scores): output.write(str_scores) -def save_summaries(summaries, path, original_document_name): - """ Write the summaries in fies that are prefixed by the original - files' name with the `_summary` appended. - - Attributes: - original_document_names: List[string] - Name of the document that was summarized. - path: string - Path were the summaries will be written - summaries: List[string] - The summaries that we produced. - """ - for summary, document_name in zip(summaries, original_document_name): - # Prepare the summary file's name - if "." in document_name: - bare_document_name = ".".join(document_name.split(".")[:-1]) - extension = document_name.split(".")[-1] - name = bare_document_name + "_summary." + extension - else: - name = document_name + "_summary" - - file_path = os.path.join(path, name) - with open(file_path, "w") as output: - output.write(summary) - - # # LOAD the dataset # @@ -323,7 +324,7 @@ def main(): raise FileNotFoundError( "We could not find the directory you specified for the documents to summarize, or it was empty. Please specify a valid path." ) - maybe_create_output_dir(args.summaries_output_dir) + os.makedirs(args.summaries_output_dir, exist_ok=True) evaluate(args) @@ -339,10 +340,5 @@ def documents_dir_is_valid(path): return True -def maybe_create_output_dir(path): - if not os.path.exists(path): - os.makedirs(path) - - if __name__ == "__main__": main() diff --git a/requirements.txt b/requirements.txt index 2cbcc3809d..4a3162adce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,3 @@ regex sentencepiece # For XLM sacremoses -# For ROUGE -nltk -py-rouge diff --git a/transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 4f158966e1..0000000000 --- a/transformers/convert_bertextabs_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,158 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Convert BertExtAbs's checkpoints """ - -import argparse -from collections import namedtuple -import logging - -import torch - -from models.model_builder import AbsSummarizer # The authors' implementation - -from transformers import BertConfig, Model2Model, BertModel, BertForMaskedLM - - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -BertExtAbsConfig = namedtuple( - "BertExtAbsConfig", - ["temp_dir", "large", "finetune_bert", "encoder", "share_emb", "max_pos", "enc_layers", "enc_hidden_size", "enc_heads", "enc_ff_size", "enc_dropout", "dec_layers", "dec_hidden_size", "dec_heads", "dec_ff_size", "dec_dropout"], -) - - -def convert_bertextabs_checkpoints(path_to_checkpoints, dump_path): - """ Copy/paste and tweak the pre-trained weights provided by the creators - of BertExtAbs for the internal architecture. - """ - - # Load checkpoints in memory - checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage) - - # Instantiate the authors' model with the pre-trained weights - config = BertExtAbsConfig( - temp_dir=".", - finetune_bert=False, - large=False, - share_emb=True, - encoder="bert", - max_pos=512, - enc_layers=6, - enc_hidden_size=512, - enc_heads=8, - enc_ff_size=512, - enc_dropout=0.2, - dec_layers=6, - dec_hidden_size=768, - dec_heads=8, - dec_ff_size=2048, - dec_dropout=0.2, - ) - bertextabs = AbsSummarizer(config, torch.device("cpu"), checkpoints) - bertextabs.eval() - - # Instantiate our version of the model - decoder_config = BertConfig( - hidden_size=config.dec_hidden_size, - num_hidden_layers=config.dec_layers, - num_attention_heads=config.dec_heads, - intermediate_size=config.dec_ff_size, - hidden_dropout_prob=config.dec_dropout, - attention_probs_dropout_prob=config.dec_dropout, - is_decoder=True, - ) - - decoder_model = BertForMaskedLM(decoder_config) - model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder_model) - model.eval() - - # Let us now start the weight copying process - model.encoder.load_state_dict(bertextabs.bert.model.state_dict()) - - # Decoder - - # Embeddings. The positional embeddings are equal to the word embedding plus a modulation - # that is computed at each forward pass. This may be a source of discrepancy. - model.decoder.bert.embeddings.word_embeddings.weight = bertextabs.decoder.embeddings.weight - model.decoder.bert.embeddings.position_embeddings.weight = bertextabs.decoder.embeddings.weight - model.decoder.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like(bertextabs.decoder.embeddings.weight) # not defined for BertExtAbs decoder - - # In the original code the LayerNorms are applied twice in the layers, at the beginning and between the - # attention layers. - model.decoder.bert.embeddings.LayerNorm.weight = bertextabs.decoder.transformer_layers[0].layer_norm_1.weight - - for i in range(config.dec_layers): - - # self attention - model.decoder.bert.encoder.layer[i].attention.self.query.weight = bertextabs.decoder.transformer_layers[i].self_attn.linear_query.weight - model.decoder.bert.encoder.layer[i].attention.self.key.weight = bertextabs.decoder.transformer_layers[i].self_attn.linear_keys.weight - model.decoder.bert.encoder.layer[i].attention.self.value.weight = bertextabs.decoder.transformer_layers[i].self_attn.linear_values.weight - model.decoder.bert.encoder.layer[i].attention.output.dense.weight = bertextabs.decoder.transformer_layers[i].self_attn.final_linear.weight - model.decoder.bert.encoder.layer[i].attention.output.LayerNorm.weight = bertextabs.decoder.transformer_layers[i].layer_norm_2.weight - - # attention - model.decoder.bert.encoder.layer[i].crossattention.self.query.weight = bertextabs.decoder.transformer_layers[i].context_attn.linear_query.weight - model.decoder.bert.encoder.layer[i].crossattention.self.key.weight = bertextabs.decoder.transformer_layers[i].context_attn.linear_keys.weight - model.decoder.bert.encoder.layer[i].crossattention.self.value.weight = bertextabs.decoder.transformer_layers[i].context_attn.linear_values.weight - model.decoder.bert.encoder.layer[i].crossattention.output.dense.weight = bertextabs.decoder.transformer_layers[i].context_attn.final_linear.weight - model.decoder.bert.encoder.layer[i].crossattention.output.LayerNorm.weight = bertextabs.decoder.transformer_layers[i].feed_forward.layer_norm.weight - - # intermediate - model.decoder.bert.encoder.layer[i].intermediate.dense.weight = bertextabs.decoder.transformer_layers[i].feed_forward.w_1.weight - - # output - model.decoder.bert.encoder.layer[i].output.dense.weight = bertextabs.decoder.transformer_layers[i].feed_forward.w_2.weight - - try: - model.decoder.bert.encoder.layer[i].output.LayerNorm.weight = bertextabs.decoder.transformer_layers[i + 1].layer_norm_1.weight - except IndexError: - model.decoder.bert.encoder.layer[i].output.LayerNorm.weight = bertextabs.decoder.layer_norm.weight - - # LM Head - """ - model.decoder.cls.predictions.transform.dense.weight - model.decoder.cls.predictions.transform.dense.biais - model.decoder.cls.predictions.transform.LayerNorm.weight - model.decoder.cls.predictions.transform.LayerNorm.biais - model.decoder.cls.predictions.decoder.weight - model.decoder.cls.predictions.decoder.biais - model.decoder.cls.predictions.biais.data - """ - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--bertextabs_checkpoint_path", - default=None, - type=str, - required=True, - help="Path the official PyTorch dump.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model.", - ) - args = parser.parse_args() - - convert_bertextabs_checkpoints( - args.bertextabs_checkpoint_path, - args.pytorch_dump_folder_path, - ) diff --git a/transformers/generate/__init__.py b/transformers/generate/__init__.py deleted file mode 100644 index 21ac612155..0000000000 --- a/transformers/generate/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .beam_search import BeamSearch diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index 73322101d3..a884abd0a2 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -117,7 +117,8 @@ class PreTrainedEncoderDecoder(nn.Module): kwargs_common = { argument: value for argument, value in kwargs.items() - if not argument.startswith("encoder_") and not argument.startswith("decoder_") + if not argument.startswith("encoder_") + and not argument.startswith("decoder_") } kwargs_decoder = kwargs_common.copy() kwargs_encoder = kwargs_common.copy() @@ -157,27 +158,14 @@ class PreTrainedEncoderDecoder(nn.Module): return model - def save_pretrained(self, save_directory, model_type="bert"): - """ Save an EncoderDecoder model and its configuration file in a format such + def save_pretrained(self, save_directory): + """ Save a Seq2Seq model and its configuration file in a format such that it can be loaded using `:func:`~transformers.PreTrainedEncoderDecoder.from_pretrained` We save the encoder' and decoder's parameters in two separate directories. - - If we want the weight loader to function we need to preprend the model - type to the directories' names. As far as I know there is no simple way - to infer the type of the model (except maybe by parsing the class' - names, which is not very future-proof). For now, we ask the user to - specify the model type explicitly when saving the weights. """ - encoder_path = os.path.join(save_directory, "{}_encoder".format(model_type)) - if not os.path.exists(encoder_path): - os.makedirs(encoder_path) - self.encoder.save_pretrained(encoder_path) - - decoder_path = os.path.join(save_directory, "{}_decoder".format(model_type)) - if not os.path.exists(decoder_path): - os.makedirs(decoder_path) - self.decoder.save_pretrained(decoder_path) + self.encoder.save_pretrained(os.path.join(save_directory, "encoder")) + self.decoder.save_pretrained(os.path.join(save_directory, "decoder")) def forward(self, encoder_input_ids, decoder_input_ids, **kwargs): """ The forward pass on a seq2eq depends what we are performing: @@ -205,7 +193,8 @@ class PreTrainedEncoderDecoder(nn.Module): kwargs_common = { argument: value for argument, value in kwargs.items() - if not argument.startswith("encoder_") and not argument.startswith("decoder_") + if not argument.startswith("encoder_") + and not argument.startswith("decoder_") } kwargs_decoder = kwargs_common.copy() kwargs_encoder = kwargs_common.copy() @@ -228,7 +217,9 @@ class PreTrainedEncoderDecoder(nn.Module): encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) if encoder_hidden_states is None: encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder) - encoder_hidden_states = encoder_outputs[0] # output the last layer hidden state + encoder_hidden_states = encoder_outputs[ + 0 + ] # output the last layer hidden state else: encoder_outputs = ()