From 83a41d39b39f02b6edf91cdb753e1ed36f3982b4 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 15 Jan 2020 18:33:50 -0500 Subject: [PATCH 1/2] :lipstick: super --- examples/mm-imdb/utils_mmimdb.py | 2 +- examples/pplm/pplm_classification_head.py | 2 +- examples/pplm/run_pplm_discrim_train.py | 2 +- .../summarization/configuration_bertabs.py | 2 +- examples/summarization/modeling_bertabs.py | 14 +++--- src/transformers/configuration_albert.py | 2 +- src/transformers/configuration_bert.py | 2 +- src/transformers/configuration_ctrl.py | 2 +- src/transformers/configuration_distilbert.py | 2 +- src/transformers/configuration_gpt2.py | 2 +- src/transformers/configuration_openai.py | 2 +- src/transformers/configuration_t5.py | 2 +- src/transformers/configuration_transfo_xl.py | 2 +- src/transformers/configuration_xlm.py | 2 +- src/transformers/configuration_xlnet.py | 2 +- src/transformers/modeling_albert.py | 20 ++++---- src/transformers/modeling_bert.py | 44 ++++++++--------- src/transformers/modeling_ctrl.py | 8 ++-- src/transformers/modeling_distilbert.py | 20 ++++---- src/transformers/modeling_encoder_decoder.py | 8 ++-- src/transformers/modeling_gpt2.py | 14 +++--- src/transformers/modeling_mmbt.py | 6 +-- src/transformers/modeling_openai.py | 12 ++--- src/transformers/modeling_roberta.py | 20 ++++---- src/transformers/modeling_t5.py | 20 ++++---- src/transformers/modeling_tf_albert.py | 26 +++++----- src/transformers/modeling_tf_bert.py | 48 +++++++++---------- src/transformers/modeling_tf_ctrl.py | 14 +++--- src/transformers/modeling_tf_distilbert.py | 28 +++++------ src/transformers/modeling_tf_gpt2.py | 14 +++--- src/transformers/modeling_tf_openai.py | 14 +++--- src/transformers/modeling_tf_roberta.py | 20 ++++---- src/transformers/modeling_tf_t5.py | 22 ++++----- src/transformers/modeling_tf_transfo_xl.py | 22 ++++----- .../modeling_tf_transfo_xl_utilities.py | 4 +- src/transformers/modeling_tf_utils.py | 10 ++-- src/transformers/modeling_tf_xlm.py | 18 +++---- src/transformers/modeling_tf_xlnet.py | 26 +++++----- src/transformers/modeling_transfo_xl.py | 14 +++--- .../modeling_transfo_xl_utilities.py | 2 +- src/transformers/modeling_utils.py | 16 +++---- src/transformers/modeling_xlm.py | 18 +++---- src/transformers/modeling_xlnet.py | 20 ++++---- src/transformers/optimization.py | 2 +- src/transformers/optimization_tf.py | 16 +++---- src/transformers/tokenization_albert.py | 2 +- src/transformers/tokenization_bert.py | 4 +- .../tokenization_bert_japanese.py | 2 +- src/transformers/tokenization_camembert.py | 2 +- src/transformers/tokenization_ctrl.py | 2 +- src/transformers/tokenization_gpt2.py | 4 +- src/transformers/tokenization_openai.py | 2 +- src/transformers/tokenization_roberta.py | 2 +- src/transformers/tokenization_t5.py | 2 +- src/transformers/tokenization_transfo_xl.py | 2 +- src/transformers/tokenization_utils.py | 2 +- src/transformers/tokenization_xlm.py | 2 +- src/transformers/tokenization_xlm_roberta.py | 2 +- src/transformers/tokenization_xlnet.py | 2 +- .../adding_a_new_model/configuration_xxx.py | 2 +- .../adding_a_new_model/modeling_tf_xxx.py | 14 +++--- templates/adding_a_new_model/modeling_xxx.py | 12 ++--- .../tests/test_tokenization_xxx.py | 2 +- .../adding_a_new_model/tokenization_xxx.py | 2 +- tests/test_tokenization_albert.py | 2 +- tests/test_tokenization_bert.py | 2 +- tests/test_tokenization_bert_japanese.py | 4 +- tests/test_tokenization_ctrl.py | 2 +- tests/test_tokenization_gpt2.py | 2 +- tests/test_tokenization_openai.py | 2 +- tests/test_tokenization_roberta.py | 2 +- tests/test_tokenization_t5.py | 2 +- tests/test_tokenization_transfo_xl.py | 2 +- tests/test_tokenization_xlm.py | 2 +- tests/test_tokenization_xlnet.py | 2 +- 75 files changed, 328 insertions(+), 328 deletions(-) diff --git a/examples/mm-imdb/utils_mmimdb.py b/examples/mm-imdb/utils_mmimdb.py index aa0460639c..5df0a886ec 100644 --- a/examples/mm-imdb/utils_mmimdb.py +++ b/examples/mm-imdb/utils_mmimdb.py @@ -31,7 +31,7 @@ POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: ( class ImageEncoder(nn.Module): def __init__(self, args): - super(ImageEncoder, self).__init__() + super().__init__() model = torchvision.models.resnet152(pretrained=True) modules = list(model.children())[:-2] self.model = nn.Sequential(*modules) diff --git a/examples/pplm/pplm_classification_head.py b/examples/pplm/pplm_classification_head.py index 05621c3bf2..e85ba608b2 100644 --- a/examples/pplm/pplm_classification_head.py +++ b/examples/pplm/pplm_classification_head.py @@ -5,7 +5,7 @@ class ClassificationHead(torch.nn.Module): """Classification Head for transformer encoders""" def __init__(self, class_size, embed_size): - super(ClassificationHead, self).__init__() + super().__init__() self.class_size = class_size self.embed_size = embed_size # self.mlp1 = torch.nn.Linear(embed_size, embed_size) diff --git a/examples/pplm/run_pplm_discrim_train.py b/examples/pplm/run_pplm_discrim_train.py index 44f6b726d8..ce6f583dc6 100644 --- a/examples/pplm/run_pplm_discrim_train.py +++ b/examples/pplm/run_pplm_discrim_train.py @@ -46,7 +46,7 @@ class Discriminator(torch.nn.Module): """Transformer encoder followed by a Classification Head""" def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"): - super(Discriminator, self).__init__() + super().__init__() self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model) self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model) self.embed_size = self.encoder.transformer.config.hidden_size diff --git a/examples/summarization/configuration_bertabs.py b/examples/summarization/configuration_bertabs.py index aa51d63980..c976180b2f 100644 --- a/examples/summarization/configuration_bertabs.py +++ b/examples/summarization/configuration_bertabs.py @@ -80,7 +80,7 @@ class BertAbsConfig(PretrainedConfig): dec_dropout=0.2, **kwargs, ): - super(BertAbsConfig, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.max_pos = max_pos diff --git a/examples/summarization/modeling_bertabs.py b/examples/summarization/modeling_bertabs.py index 4dd89ada88..bad412baac 100644 --- a/examples/summarization/modeling_bertabs.py +++ b/examples/summarization/modeling_bertabs.py @@ -47,7 +47,7 @@ class BertAbsPreTrainedModel(PreTrainedModel): class BertAbs(BertAbsPreTrainedModel): def __init__(self, args, checkpoint=None, bert_extractive_checkpoint=None): - super(BertAbs, self).__init__(args) + super().__init__(args) self.args = args self.bert = Bert() @@ -122,7 +122,7 @@ class Bert(nn.Module): """ def __init__(self): - super(Bert, self).__init__() + super().__init__() config = BertConfig.from_pretrained("bert-base-uncased") self.model = BertModel(config) @@ -151,7 +151,7 @@ class TransformerDecoder(nn.Module): """ def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size): - super(TransformerDecoder, self).__init__() + super().__init__() # Basic attributes. self.decoder_type = "transformer" @@ -261,7 +261,7 @@ class PositionalEncoding(nn.Module): pe[:, 0::2] = torch.sin(position.float() * div_term) pe[:, 1::2] = torch.cos(position.float() * div_term) pe = pe.unsqueeze(0) - super(PositionalEncoding, self).__init__() + super().__init__() self.register_buffer("pe", pe) self.dropout = nn.Dropout(p=dropout) self.dim = dim @@ -293,7 +293,7 @@ class TransformerDecoderLayer(nn.Module): """ def __init__(self, d_model, heads, d_ff, dropout): - super(TransformerDecoderLayer, self).__init__() + super().__init__() self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout) @@ -410,7 +410,7 @@ class MultiHeadedAttention(nn.Module): self.dim_per_head = model_dim // head_count self.model_dim = model_dim - super(MultiHeadedAttention, self).__init__() + super().__init__() self.head_count = head_count self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head) @@ -639,7 +639,7 @@ class PositionwiseFeedForward(nn.Module): """ def __init__(self, d_model, d_ff, dropout=0.1): - super(PositionwiseFeedForward, self).__init__() + super().__init__() self.w_1 = nn.Linear(d_model, d_ff) self.w_2 = nn.Linear(d_ff, d_model) self.layer_norm = nn.LayerNorm(d_model, eps=1e-6) diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py index b210960f14..2e8211f442 100644 --- a/src/transformers/configuration_albert.py +++ b/src/transformers/configuration_albert.py @@ -122,7 +122,7 @@ class AlbertConfig(PretrainedConfig): layer_norm_eps=1e-12, **kwargs ): - super(AlbertConfig, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.embedding_size = embedding_size diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py index b568f7e47d..71d190af8a 100644 --- a/src/transformers/configuration_bert.py +++ b/src/transformers/configuration_bert.py @@ -125,7 +125,7 @@ class BertConfig(PretrainedConfig): layer_norm_eps=1e-12, **kwargs ): - super(BertConfig, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size diff --git a/src/transformers/configuration_ctrl.py b/src/transformers/configuration_ctrl.py index ea5be7eccd..4daba2a97a 100644 --- a/src/transformers/configuration_ctrl.py +++ b/src/transformers/configuration_ctrl.py @@ -106,7 +106,7 @@ class CTRLConfig(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - super(CTRLConfig, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py index 11f14a85c9..b3386e0ab8 100644 --- a/src/transformers/configuration_distilbert.py +++ b/src/transformers/configuration_distilbert.py @@ -113,7 +113,7 @@ class DistilBertConfig(PretrainedConfig): seq_classif_dropout=0.2, **kwargs ): - super(DistilBertConfig, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.sinusoidal_pos_embds = sinusoidal_pos_embds diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py index 1275e56299..7fff0b6c49 100644 --- a/src/transformers/configuration_gpt2.py +++ b/src/transformers/configuration_gpt2.py @@ -136,7 +136,7 @@ class GPT2Config(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - super(GPT2Config, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py index 0ba91689e8..d4a965bde1 100644 --- a/src/transformers/configuration_openai.py +++ b/src/transformers/configuration_openai.py @@ -138,7 +138,7 @@ class OpenAIGPTConfig(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - super(OpenAIGPTConfig, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx diff --git a/src/transformers/configuration_t5.py b/src/transformers/configuration_t5.py index c7016baed5..39dd7b4e24 100644 --- a/src/transformers/configuration_t5.py +++ b/src/transformers/configuration_t5.py @@ -77,7 +77,7 @@ class T5Config(PretrainedConfig): initializer_factor=1.0, **kwargs ): - super(T5Config, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.n_positions = n_positions self.d_model = d_model diff --git a/src/transformers/configuration_transfo_xl.py b/src/transformers/configuration_transfo_xl.py index db210e5a10..ebcc4af4f7 100644 --- a/src/transformers/configuration_transfo_xl.py +++ b/src/transformers/configuration_transfo_xl.py @@ -151,7 +151,7 @@ class TransfoXLConfig(PretrainedConfig): layer_norm_epsilon=1e-5, **kwargs ): - super(TransfoXLConfig, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.cutoffs = [] diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py index f1afc96489..c4d61808d6 100644 --- a/src/transformers/configuration_xlm.py +++ b/src/transformers/configuration_xlm.py @@ -197,7 +197,7 @@ class XLMConfig(PretrainedConfig): ): """Constructs XLMConfig. """ - super(XLMConfig, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.emb_dim = emb_dim self.n_layers = n_layers diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py index 397db1bfbb..42f6a00c5f 100644 --- a/src/transformers/configuration_xlnet.py +++ b/src/transformers/configuration_xlnet.py @@ -159,7 +159,7 @@ class XLNetConfig(PretrainedConfig): ): """Constructs XLNetConfig. """ - super(XLNetConfig, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.d_model = d_model self.n_layer = n_layer diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index 4fae225212..fafa3e6f30 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -167,7 +167,7 @@ class AlbertEmbeddings(BertEmbeddings): """ def __init__(self, config): - super(AlbertEmbeddings, self).__init__(config) + super().__init__(config) self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) @@ -177,7 +177,7 @@ class AlbertEmbeddings(BertEmbeddings): class AlbertAttention(BertSelfAttention): def __init__(self, config): - super(AlbertAttention, self).__init__(config) + super().__init__(config) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads @@ -258,7 +258,7 @@ class AlbertAttention(BertSelfAttention): class AlbertLayer(nn.Module): def __init__(self, config): - super(AlbertLayer, self).__init__() + super().__init__() self.config = config self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -279,7 +279,7 @@ class AlbertLayer(nn.Module): class AlbertLayerGroup(nn.Module): def __init__(self, config): - super(AlbertLayerGroup, self).__init__() + super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -309,7 +309,7 @@ class AlbertLayerGroup(nn.Module): class AlbertTransformer(nn.Module): def __init__(self, config): - super(AlbertTransformer, self).__init__() + super().__init__() self.config = config self.output_attentions = config.output_attentions @@ -471,7 +471,7 @@ class AlbertModel(AlbertPreTrainedModel): base_model_prefix = "albert" def __init__(self, config): - super(AlbertModel, self).__init__(config) + super().__init__(config) self.config = config self.embeddings = AlbertEmbeddings(config) @@ -571,7 +571,7 @@ class AlbertModel(AlbertPreTrainedModel): class AlbertMLMHead(nn.Module): def __init__(self, config): - super(AlbertMLMHead, self).__init__() + super().__init__() self.LayerNorm = nn.LayerNorm(config.embedding_size) self.bias = nn.Parameter(torch.zeros(config.vocab_size)) @@ -619,7 +619,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): """ def __init__(self, config): - super(AlbertForMaskedLM, self).__init__(config) + super().__init__(config) self.albert = AlbertModel(config) self.predictions = AlbertMLMHead(config) @@ -706,7 +706,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): """ def __init__(self, config): - super(AlbertForSequenceClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.albert = AlbertModel(config) @@ -804,7 +804,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): """ def __init__(self, config): - super(AlbertForQuestionAnswering, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.albert = AlbertModel(config) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 48ada95c75..3953437777 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -160,7 +160,7 @@ class BertEmbeddings(nn.Module): """ def __init__(self, config): - super(BertEmbeddings, self).__init__() + super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) @@ -197,7 +197,7 @@ class BertEmbeddings(nn.Module): class BertSelfAttention(nn.Module): def __init__(self, config): - super(BertSelfAttention, self).__init__() + super().__init__() if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " @@ -275,7 +275,7 @@ class BertSelfAttention(nn.Module): class BertSelfOutput(nn.Module): def __init__(self, config): - super(BertSelfOutput, self).__init__() + super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -289,7 +289,7 @@ class BertSelfOutput(nn.Module): class BertAttention(nn.Module): def __init__(self, config): - super(BertAttention, self).__init__() + super().__init__() self.self = BertSelfAttention(config) self.output = BertSelfOutput(config) self.pruned_heads = set() @@ -335,7 +335,7 @@ class BertAttention(nn.Module): class BertIntermediate(nn.Module): def __init__(self, config): - super(BertIntermediate, self).__init__() + super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) if isinstance(config.hidden_act, str): self.intermediate_act_fn = ACT2FN[config.hidden_act] @@ -350,7 +350,7 @@ class BertIntermediate(nn.Module): class BertOutput(nn.Module): def __init__(self, config): - super(BertOutput, self).__init__() + super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -364,7 +364,7 @@ class BertOutput(nn.Module): class BertLayer(nn.Module): def __init__(self, config): - super(BertLayer, self).__init__() + super().__init__() self.attention = BertAttention(config) self.is_decoder = config.is_decoder if self.is_decoder: @@ -399,7 +399,7 @@ class BertLayer(nn.Module): class BertEncoder(nn.Module): def __init__(self, config): - super(BertEncoder, self).__init__() + super().__init__() self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) @@ -440,7 +440,7 @@ class BertEncoder(nn.Module): class BertPooler(nn.Module): def __init__(self, config): - super(BertPooler, self).__init__() + super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() @@ -455,7 +455,7 @@ class BertPooler(nn.Module): class BertPredictionHeadTransform(nn.Module): def __init__(self, config): - super(BertPredictionHeadTransform, self).__init__() + super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) if isinstance(config.hidden_act, str): self.transform_act_fn = ACT2FN[config.hidden_act] @@ -472,7 +472,7 @@ class BertPredictionHeadTransform(nn.Module): class BertLMPredictionHead(nn.Module): def __init__(self, config): - super(BertLMPredictionHead, self).__init__() + super().__init__() self.transform = BertPredictionHeadTransform(config) # The output weights are the same as the input embeddings, but there is @@ -492,7 +492,7 @@ class BertLMPredictionHead(nn.Module): class BertOnlyMLMHead(nn.Module): def __init__(self, config): - super(BertOnlyMLMHead, self).__init__() + super().__init__() self.predictions = BertLMPredictionHead(config) def forward(self, sequence_output): @@ -502,7 +502,7 @@ class BertOnlyMLMHead(nn.Module): class BertOnlyNSPHead(nn.Module): def __init__(self, config): - super(BertOnlyNSPHead, self).__init__() + super().__init__() self.seq_relationship = nn.Linear(config.hidden_size, 2) def forward(self, pooled_output): @@ -512,7 +512,7 @@ class BertOnlyNSPHead(nn.Module): class BertPreTrainingHeads(nn.Module): def __init__(self, config): - super(BertPreTrainingHeads, self).__init__() + super().__init__() self.predictions = BertLMPredictionHead(config) self.seq_relationship = nn.Linear(config.hidden_size, 2) @@ -657,7 +657,7 @@ class BertModel(BertPreTrainedModel): """ def __init__(self, config): - super(BertModel, self).__init__(config) + super().__init__(config) self.config = config self.embeddings = BertEmbeddings(config) @@ -864,7 +864,7 @@ class BertForPreTraining(BertPreTrainedModel): """ def __init__(self, config): - super(BertForPreTraining, self).__init__(config) + super().__init__(config) self.bert = BertModel(config) self.cls = BertPreTrainingHeads(config) @@ -954,7 +954,7 @@ class BertForMaskedLM(BertPreTrainedModel): """ def __init__(self, config): - super(BertForMaskedLM, self).__init__(config) + super().__init__(config) self.bert = BertModel(config) self.cls = BertOnlyMLMHead(config) @@ -1053,7 +1053,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): """ def __init__(self, config): - super(BertForNextSentencePrediction, self).__init__(config) + super().__init__(config) self.bert = BertModel(config) self.cls = BertOnlyNSPHead(config) @@ -1132,7 +1132,7 @@ class BertForSequenceClassification(BertPreTrainedModel): """ def __init__(self, config): - super(BertForSequenceClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) @@ -1221,7 +1221,7 @@ class BertForMultipleChoice(BertPreTrainedModel): """ def __init__(self, config): - super(BertForMultipleChoice, self).__init__(config) + super().__init__(config) self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -1308,7 +1308,7 @@ class BertForTokenClassification(BertPreTrainedModel): """ def __init__(self, config): - super(BertForTokenClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) @@ -1406,7 +1406,7 @@ class BertForQuestionAnswering(BertPreTrainedModel): """ def __init__(self, config): - super(BertForQuestionAnswering, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.bert = BertModel(config) diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py index 03e73c3311..3b1d74c34f 100644 --- a/src/transformers/modeling_ctrl.py +++ b/src/transformers/modeling_ctrl.py @@ -81,7 +81,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N class MultiHeadAttention(torch.nn.Module): def __init__(self, d_model_size, num_heads, output_attentions=False): - super(MultiHeadAttention, self).__init__() + super().__init__() self.output_attentions = output_attentions self.num_heads = num_heads self.d_model_size = d_model_size @@ -132,7 +132,7 @@ def point_wise_feed_forward_network(d_model_size, dff): class EncoderLayer(torch.nn.Module): def __init__(self, d_model_size, num_heads, dff, rate=0.1, output_attentions=False): - super(EncoderLayer, self).__init__() + super().__init__() self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, output_attentions) self.ffn = point_wise_feed_forward_network(d_model_size, dff) @@ -274,7 +274,7 @@ class CTRLModel(CTRLPreTrainedModel): """ def __init__(self, config): - super(CTRLModel, self).__init__(config) + super().__init__(config) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.output_past = config.output_past @@ -481,7 +481,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): """ def __init__(self, config): - super(CTRLLMHeadModel, self).__init__(config) + super().__init__(config) self.transformer = CTRLModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True) diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index b084103f9f..1cd0dd7ba3 100644 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -59,7 +59,7 @@ def create_sinusoidal_embeddings(n_pos, dim, out): class Embeddings(nn.Module): def __init__(self, config): - super(Embeddings, self).__init__() + super().__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0) self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) if config.sinusoidal_pos_embds: @@ -97,7 +97,7 @@ class Embeddings(nn.Module): class MultiHeadSelfAttention(nn.Module): def __init__(self, config): - super(MultiHeadSelfAttention, self).__init__() + super().__init__() self.n_heads = config.n_heads self.dim = config.dim @@ -195,7 +195,7 @@ class MultiHeadSelfAttention(nn.Module): class FFN(nn.Module): def __init__(self, config): - super(FFN, self).__init__() + super().__init__() self.dropout = nn.Dropout(p=config.dropout) self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) @@ -214,7 +214,7 @@ class FFN(nn.Module): class TransformerBlock(nn.Module): def __init__(self, config): - super(TransformerBlock, self).__init__() + super().__init__() self.n_heads = config.n_heads self.dim = config.dim @@ -266,7 +266,7 @@ class TransformerBlock(nn.Module): class Transformer(nn.Module): def __init__(self, config): - super(Transformer, self).__init__() + super().__init__() self.n_layers = config.n_layers self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -424,7 +424,7 @@ class DistilBertModel(DistilBertPreTrainedModel): """ def __init__(self, config): - super(DistilBertModel, self).__init__(config) + super().__init__(config) self.embeddings = Embeddings(config) # Embeddings self.transformer = Transformer(config) # Encoder @@ -525,7 +525,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): """ def __init__(self, config): - super(DistilBertForMaskedLM, self).__init__(config) + super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -600,7 +600,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): """ def __init__(self, config): - super(DistilBertForSequenceClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) @@ -679,7 +679,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): """ def __init__(self, config): - super(DistilBertForQuestionAnswering, self).__init__(config) + super().__init__(config) self.distilbert = DistilBertModel(config) self.qa_outputs = nn.Linear(config.dim, config.num_labels) @@ -766,7 +766,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): """ def __init__(self, config): - super(DistilBertForTokenClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.distilbert = DistilBertModel(config) diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py index 696b0fcad5..c48c6b4b32 100644 --- a/src/transformers/modeling_encoder_decoder.py +++ b/src/transformers/modeling_encoder_decoder.py @@ -37,7 +37,7 @@ class PreTrainedEncoderDecoder(nn.Module): """ def __init__(self, encoder, decoder): - super(PreTrainedEncoderDecoder, self).__init__() + super().__init__() self.encoder = encoder self.decoder = decoder @@ -290,7 +290,7 @@ class Model2Model(PreTrainedEncoderDecoder): """ def __init__(self, *args, **kwargs): - super(Model2Model, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self.tie_weights() def tie_weights(self): @@ -321,7 +321,7 @@ class Model2Model(PreTrainedEncoderDecoder): ): raise ValueError("Only the Bert model is currently supported.") - model = super(Model2Model, cls).from_pretrained( + model = super().from_pretrained( encoder_pretrained_model_name_or_path=pretrained_model_name_or_path, decoder_pretrained_model_name_or_path=pretrained_model_name_or_path, *args, @@ -345,5 +345,5 @@ class Model2LSTM(PreTrainedEncoderDecoder): " E.g. `decoder_config={'input_size': 768, 'hidden_size': 768, 'num_layers': 2}`" ) kwargs["decoder_model"] = torch.nn.LSTM(kwargs.pop("decoder_config")) - model = super(Model2LSTM, cls).from_pretrained(*args, **kwargs) + model = super().from_pretrained(*args, **kwargs) return model diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index 679100b84d..cb5f557267 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -101,7 +101,7 @@ def gelu(x): class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): - super(Attention, self).__init__() + super().__init__() self.output_attentions = config.output_attentions n_state = nx # in Attention: n_state=768 (nx=n_embd) @@ -202,7 +202,7 @@ class Attention(nn.Module): class MLP(nn.Module): def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) - super(MLP, self).__init__() + super().__init__() nx = config.n_embd self.c_fc = Conv1D(n_state, nx) self.c_proj = Conv1D(nx, n_state) @@ -217,7 +217,7 @@ class MLP(nn.Module): class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): - super(Block, self).__init__() + super().__init__() nx = config.n_embd self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.attn = Attention(nx, n_ctx, config, scale) @@ -249,7 +249,7 @@ class GPT2PreTrainedModel(PreTrainedModel): base_model_prefix = "transformer" def __init__(self, *inputs, **kwargs): - super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs) + super().__init__(*inputs, **kwargs) def _init_weights(self, module): """ Initialize the weights. @@ -355,7 +355,7 @@ class GPT2Model(GPT2PreTrainedModel): """ def __init__(self, config): - super(GPT2Model, self).__init__(config) + super().__init__(config) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.output_past = config.output_past @@ -550,7 +550,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): """ def __init__(self, config): - super(GPT2LMHeadModel, self).__init__(config) + super().__init__(config) self.transformer = GPT2Model(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) @@ -678,7 +678,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): """ def __init__(self, config): - super(GPT2DoubleHeadsModel, self).__init__(config) + super().__init__(config) config.num_labels = 1 self.transformer = GPT2Model(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) diff --git a/src/transformers/modeling_mmbt.py b/src/transformers/modeling_mmbt.py index 8f806f84fb..1df20534e0 100644 --- a/src/transformers/modeling_mmbt.py +++ b/src/transformers/modeling_mmbt.py @@ -33,7 +33,7 @@ class ModalEmbeddings(nn.Module): """ def __init__(self, config, encoder, embeddings): - super(ModalEmbeddings, self).__init__() + super().__init__() self.config = config self.encoder = encoder self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size) @@ -175,7 +175,7 @@ class MMBTModel(nn.Module): """ def __init__(self, config, transformer, encoder): - super(MMBTModel, self).__init__() + super().__init__() self.config = config self.transformer = transformer self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings) @@ -359,7 +359,7 @@ class MMBTForClassification(nn.Module): """ def __init__(self, config, transformer, encoder): - super(MMBTForClassification, self).__init__() + super().__init__() self.num_labels = config.num_labels self.mmbt = MMBTModel(config, transformer, encoder) diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index ec0f09d158..b2b3872181 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -127,7 +127,7 @@ ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu} class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): - super(Attention, self).__init__() + super().__init__() n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 @@ -221,7 +221,7 @@ class Attention(nn.Module): class MLP(nn.Module): def __init__(self, n_state, config): # in MLP: n_state=3072 (4 * n_embd) - super(MLP, self).__init__() + super().__init__() nx = config.n_embd self.c_fc = Conv1D(n_state, nx) self.c_proj = Conv1D(nx, n_state) @@ -236,7 +236,7 @@ class MLP(nn.Module): class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): - super(Block, self).__init__() + super().__init__() nx = config.n_embd self.attn = Attention(nx, n_ctx, config, scale) self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) @@ -359,7 +359,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): """ def __init__(self, config): - super(OpenAIGPTModel, self).__init__(config) + super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -518,7 +518,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): """ def __init__(self, config): - super(OpenAIGPTLMHeadModel, self).__init__(config) + super().__init__(config) self.transformer = OpenAIGPTModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) @@ -623,7 +623,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): """ def __init__(self, config): - super(OpenAIGPTDoubleHeadsModel, self).__init__(config) + super().__init__(config) config.num_labels = 1 self.transformer = OpenAIGPTModel(config) diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py index fc066cc7b8..27ef7bf270 100644 --- a/src/transformers/modeling_roberta.py +++ b/src/transformers/modeling_roberta.py @@ -45,7 +45,7 @@ class RobertaEmbeddings(BertEmbeddings): """ def __init__(self, config): - super(RobertaEmbeddings, self).__init__(config) + super().__init__(config) self.padding_idx = 1 self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) self.position_embeddings = nn.Embedding( @@ -60,7 +60,7 @@ class RobertaEmbeddings(BertEmbeddings): else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) - return super(RobertaEmbeddings, self).forward( + return super().forward( input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds ) @@ -204,7 +204,7 @@ class RobertaModel(BertModel): base_model_prefix = "roberta" def __init__(self, config): - super(RobertaModel, self).__init__(config) + super().__init__(config) self.embeddings = RobertaEmbeddings(config) self.init_weights() @@ -254,7 +254,7 @@ class RobertaForMaskedLM(BertPreTrainedModel): base_model_prefix = "roberta" def __init__(self, config): - super(RobertaForMaskedLM, self).__init__(config) + super().__init__(config) self.roberta = RobertaModel(config) self.lm_head = RobertaLMHead(config) @@ -299,7 +299,7 @@ class RobertaLMHead(nn.Module): """Roberta Head for masked language modeling.""" def __init__(self, config): - super(RobertaLMHead, self).__init__() + super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -362,7 +362,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel): base_model_prefix = "roberta" def __init__(self, config): - super(RobertaForSequenceClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.roberta = RobertaModel(config) @@ -484,7 +484,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel): base_model_prefix = "roberta" def __init__(self, config): - super(RobertaForMultipleChoice, self).__init__(config) + super().__init__(config) self.roberta = RobertaModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) @@ -571,7 +571,7 @@ class RobertaForTokenClassification(BertPreTrainedModel): base_model_prefix = "roberta" def __init__(self, config): - super(RobertaForTokenClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.roberta = RobertaModel(config) @@ -625,7 +625,7 @@ class RobertaClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): - super(RobertaClassificationHead, self).__init__() + super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) @@ -684,7 +684,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): base_model_prefix = "roberta" def __init__(self, config): - super(RobertaForQuestionAnswering, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.roberta = RobertaModel(config) diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py index fb37f6fa4e..59c544daea 100644 --- a/src/transformers/modeling_t5.py +++ b/src/transformers/modeling_t5.py @@ -142,7 +142,7 @@ class T5LayerNorm(nn.Module): """ Construct a layernorm module in the T5 style No bias and no substraction of mean. """ - super(T5LayerNorm, self).__init__() + super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps @@ -154,7 +154,7 @@ class T5LayerNorm(nn.Module): class T5DenseReluDense(nn.Module): def __init__(self, config): - super(T5DenseReluDense, self).__init__() + super().__init__() self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) self.dropout = nn.Dropout(config.dropout_rate) @@ -169,7 +169,7 @@ class T5DenseReluDense(nn.Module): class T5LayerFF(nn.Module): def __init__(self, config): - super(T5LayerFF, self).__init__() + super().__init__() self.DenseReluDense = T5DenseReluDense(config) self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -185,7 +185,7 @@ class T5Attention(nn.Module): NEW_ID = itertools.count() def __init__(self, config, has_relative_attention_bias=False): - super(T5Attention, self).__init__() + super().__init__() self.layer_id = next(T5Attention.NEW_ID) self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias @@ -363,7 +363,7 @@ class T5Attention(nn.Module): class T5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): - super(T5LayerSelfAttention, self).__init__() + super().__init__() self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -381,7 +381,7 @@ class T5LayerSelfAttention(nn.Module): class T5LayerCrossAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): - super(T5LayerCrossAttention, self).__init__() + super().__init__() self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -399,7 +399,7 @@ class T5LayerCrossAttention(nn.Module): class T5Block(nn.Module): def __init__(self, config, has_relative_attention_bias=False): - super(T5Block, self).__init__() + super().__init__() self.is_decoder = config.is_decoder self.layer = nn.ModuleList() self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) @@ -501,7 +501,7 @@ class T5PreTrainedModel(PreTrainedModel): class T5Stack(T5PreTrainedModel): def __init__(self, config): - super(T5Stack, self).__init__(config) + super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.is_decoder = config.is_decoder @@ -724,7 +724,7 @@ class T5Model(T5PreTrainedModel): """ def __init__(self, config): - super(T5Model, self).__init__(config) + super().__init__(config) self.shared = nn.Embedding(config.vocab_size, config.d_model) encoder_config = copy.deepcopy(config) @@ -830,7 +830,7 @@ class T5WithLMHeadModel(T5PreTrainedModel): """ def __init__(self, config): - super(T5WithLMHeadModel, self).__init__(config) + super().__init__(config) self.model_dim = config.d_model self.shared = nn.Embedding(config.vocab_size, config.d_model) diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py index a560b5f52e..8b8cb71efd 100644 --- a/src/transformers/modeling_tf_albert.py +++ b/src/transformers/modeling_tf_albert.py @@ -45,7 +45,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): """ def __init__(self, config, **kwargs): - super(TFAlbertEmbeddings, self).__init__(**kwargs) + super().__init__(**kwargs) self.config = config self.position_embeddings = tf.keras.layers.Embedding( @@ -76,7 +76,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): shape=[self.config.vocab_size, self.config.embedding_size], initializer=get_initializer(self.config.initializer_range), ) - super(TFAlbertEmbeddings, self).build(input_shape) + super().build(input_shape) def call(self, inputs, mode="embedding", training=False): """Get token embeddings of inputs. @@ -141,7 +141,7 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): class TFAlbertSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFAlbertSelfAttention, self).__init__(**kwargs) + super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " @@ -217,7 +217,7 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer): class TFAlbertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFAlbertSelfOutput, self).__init__(**kwargs) + super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -235,7 +235,7 @@ class TFAlbertSelfOutput(tf.keras.layers.Layer): class TFAlbertAttention(TFBertSelfAttention): def __init__(self, config, **kwargs): - super(TFAlbertAttention, self).__init__(config, **kwargs) + super().__init__(config, **kwargs) self.hidden_size = config.hidden_size self.dense = tf.keras.layers.Dense( @@ -303,7 +303,7 @@ class TFAlbertAttention(TFBertSelfAttention): class TFAlbertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFAlbertLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.attention = TFAlbertAttention(config, name="attention") self.ffn = tf.keras.layers.Dense( @@ -341,7 +341,7 @@ class TFAlbertLayer(tf.keras.layers.Layer): class TFAlbertLayerGroup(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFAlbertLayerGroup, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -376,7 +376,7 @@ class TFAlbertLayerGroup(tf.keras.layers.Layer): class TFAlbertTransformer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFAlbertTransformer, self).__init__(**kwargs) + super().__init__(**kwargs) self.config = config self.output_attentions = config.output_attentions @@ -445,7 +445,7 @@ class TFAlbertPreTrainedModel(TFPreTrainedModel): class TFAlbertMLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): - super(TFAlbertMLMHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dense = tf.keras.layers.Dense( @@ -467,7 +467,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer): self.decoder_bias = self.add_weight( shape=(self.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" ) - super(TFAlbertMLMHead, self).build(input_shape) + super().build(input_shape) def call(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -596,7 +596,7 @@ class TFAlbertModel(TFAlbertPreTrainedModel): """ def __init__(self, config, **kwargs): - super(TFAlbertModel, self).__init__(config, **kwargs) + super().__init__(config, **kwargs) self.num_hidden_layers = config.num_hidden_layers self.embeddings = TFAlbertEmbeddings(config, name="embeddings") @@ -733,7 +733,7 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFAlbertForMaskedLM, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.albert = TFAlbertModel(config, name="albert") self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions") @@ -786,7 +786,7 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFAlbertForSequenceClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.albert = TFAlbertModel(config, name="albert") diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index 8caac12309..6c748b8978 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -93,7 +93,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): """ def __init__(self, config, **kwargs): - super(TFBertEmbeddings, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = config.vocab_size self.hidden_size = config.hidden_size self.initializer_range = config.initializer_range @@ -126,7 +126,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range), ) - super(TFBertEmbeddings, self).build(input_shape) + super().build(input_shape) def call(self, inputs, mode="embedding", training=False): """Get token embeddings of inputs. @@ -193,7 +193,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): class TFBertSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertSelfAttention, self).__init__(**kwargs) + super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " @@ -269,7 +269,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer): class TFBertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertSelfOutput, self).__init__(**kwargs) + super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -287,7 +287,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer): class TFBertAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertAttention, self).__init__(**kwargs) + super().__init__(**kwargs) self.self_attention = TFBertSelfAttention(config, name="self") self.dense_output = TFBertSelfOutput(config, name="output") @@ -305,7 +305,7 @@ class TFBertAttention(tf.keras.layers.Layer): class TFBertIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertIntermediate, self).__init__(**kwargs) + super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -322,7 +322,7 @@ class TFBertIntermediate(tf.keras.layers.Layer): class TFBertOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertOutput, self).__init__(**kwargs) + super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -340,7 +340,7 @@ class TFBertOutput(tf.keras.layers.Layer): class TFBertLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.attention = TFBertAttention(config, name="attention") self.intermediate = TFBertIntermediate(config, name="intermediate") self.bert_output = TFBertOutput(config, name="output") @@ -358,7 +358,7 @@ class TFBertLayer(tf.keras.layers.Layer): class TFBertEncoder(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertEncoder, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] @@ -392,7 +392,7 @@ class TFBertEncoder(tf.keras.layers.Layer): class TFBertPooler(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertPooler, self).__init__(**kwargs) + super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), @@ -410,7 +410,7 @@ class TFBertPooler(tf.keras.layers.Layer): class TFBertPredictionHeadTransform(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertPredictionHeadTransform, self).__init__(**kwargs) + super().__init__(**kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" ) @@ -429,7 +429,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer): class TFBertLMPredictionHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): - super(TFBertLMPredictionHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = config.vocab_size self.transform = TFBertPredictionHeadTransform(config, name="transform") @@ -439,7 +439,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") - super(TFBertLMPredictionHead, self).build(input_shape) + super().build(input_shape) def call(self, hidden_states): hidden_states = self.transform(hidden_states) @@ -450,7 +450,7 @@ class TFBertLMPredictionHead(tf.keras.layers.Layer): class TFBertMLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): - super(TFBertMLMHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") def call(self, sequence_output): @@ -460,7 +460,7 @@ class TFBertMLMHead(tf.keras.layers.Layer): class TFBertNSPHead(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertNSPHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.seq_relationship = tf.keras.layers.Dense( 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship" ) @@ -472,7 +472,7 @@ class TFBertNSPHead(tf.keras.layers.Layer): class TFBertMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFBertMainLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers self.embeddings = TFBertEmbeddings(config, name="embeddings") @@ -707,7 +707,7 @@ class TFBertModel(TFBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFBertModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") def call(self, inputs, **kwargs): @@ -750,7 +750,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFBertForPreTraining, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.nsp = TFBertNSPHead(config, name="nsp___cls") @@ -803,7 +803,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFBertForMaskedLM, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") @@ -854,7 +854,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFBertForNextSentencePrediction, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.nsp = TFBertNSPHead(config, name="nsp___cls") @@ -903,7 +903,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.bert = TFBertMainLayer(config, name="bert") @@ -960,7 +960,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFBertForMultipleChoice, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.bert = TFBertMainLayer(config, name="bert") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) @@ -1064,7 +1064,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFBertForTokenClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.bert = TFBertMainLayer(config, name="bert") @@ -1121,7 +1121,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.bert = TFBertMainLayer(config, name="bert") diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py index 72f187f68e..ae9a370bec 100644 --- a/src/transformers/modeling_tf_ctrl.py +++ b/src/transformers/modeling_tf_ctrl.py @@ -75,7 +75,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N class TFMultiHeadAttention(tf.keras.layers.Layer): def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs): - super(TFMultiHeadAttention, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = output_attentions self.num_heads = num_heads self.d_model_size = d_model_size @@ -132,7 +132,7 @@ class TFEncoderLayer(tf.keras.layers.Layer): def __init__( self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs ): - super(TFEncoderLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.multi_head_attention = TFMultiHeadAttention( d_model_size, num_heads, output_attentions, name="multi_head_attention" @@ -166,7 +166,7 @@ class TFEncoderLayer(tf.keras.layers.Layer): class TFCTRLMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFCTRLMainLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.output_past = config.output_past @@ -443,7 +443,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFCTRLModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFCTRLMainLayer(config, name="transformer") def call(self, inputs, **kwargs): @@ -453,7 +453,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel): class TFCTRLLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): - super(TFCTRLLMHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is @@ -462,7 +462,7 @@ class TFCTRLLMHead(tf.keras.layers.Layer): def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") - super(TFCTRLLMHead, self).build(input_shape) + super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") @@ -508,7 +508,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFCTRLLMHeadModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFCTRLMainLayer(config, name="transformer") self.lm_head = TFCTRLLMHead(config, self.transformer.w, name="lm_head") diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py index 3118411d89..7792202be0 100644 --- a/src/transformers/modeling_tf_distilbert.py +++ b/src/transformers/modeling_tf_distilbert.py @@ -65,7 +65,7 @@ def gelu_new(x): class TFEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFEmbeddings, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dim = config.dim self.initializer_range = config.initializer_range @@ -92,7 +92,7 @@ class TFEmbeddings(tf.keras.layers.Layer): self.word_embeddings = self.add_weight( "weight", shape=[self.vocab_size, self.dim], initializer=get_initializer(self.initializer_range) ) - super(TFEmbeddings, self).build(input_shape) + super().build(input_shape) def call(self, inputs, inputs_embeds=None, mode="embedding", training=False): """Get token embeddings of inputs. @@ -169,7 +169,7 @@ class TFEmbeddings(tf.keras.layers.Layer): class TFMultiHeadSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFMultiHeadSelfAttention, self).__init__(**kwargs) + super().__init__(**kwargs) self.n_heads = config.n_heads self.dim = config.dim @@ -259,7 +259,7 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer): class TFFFN(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFFFN, self).__init__(**kwargs) + super().__init__(**kwargs) self.dropout = tf.keras.layers.Dropout(config.dropout) self.lin1 = tf.keras.layers.Dense( config.hidden_dim, kernel_initializer=get_initializer(config.initializer_range), name="lin1" @@ -284,7 +284,7 @@ class TFFFN(tf.keras.layers.Layer): class TFTransformerBlock(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFTransformerBlock, self).__init__(**kwargs) + super().__init__(**kwargs) self.n_heads = config.n_heads self.dim = config.dim @@ -338,7 +338,7 @@ class TFTransformerBlock(tf.keras.layers.Layer): class TFTransformer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFTransformer, self).__init__(**kwargs) + super().__init__(**kwargs) self.n_layers = config.n_layers self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -399,7 +399,7 @@ class TFTransformer(tf.keras.layers.Layer): class TFDistilBertMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFDistilBertMainLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.num_hidden_layers = config.num_hidden_layers self.embeddings = TFEmbeddings(config, name="embeddings") # Embeddings @@ -569,7 +569,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFDistilBertModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") # Embeddings def call(self, inputs, **kwargs): @@ -579,7 +579,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): class TFDistilBertLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): - super(TFDistilBertLMHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is @@ -588,7 +588,7 @@ class TFDistilBertLMHead(tf.keras.layers.Layer): def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") - super(TFDistilBertLMHead, self).build(input_shape) + super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") @@ -628,7 +628,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFDistilBertForMaskedLM, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.vocab_size = config.vocab_size @@ -690,7 +690,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFDistilBertForSequenceClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") @@ -747,7 +747,7 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFDistilBertForTokenClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.distilbert = TFDistilBertMainLayer(config, name="distilbert") @@ -804,7 +804,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFDistilBertForQuestionAnswering, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.distilbert = TFDistilBertMainLayer(config, name="distilbert") self.qa_outputs = tf.keras.layers.Dense( diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py index a1b38c6f7d..2110ac7351 100644 --- a/src/transformers/modeling_tf_gpt2.py +++ b/src/transformers/modeling_tf_gpt2.py @@ -58,7 +58,7 @@ def gelu(x): class TFAttention(tf.keras.layers.Layer): def __init__(self, nx, n_ctx, config, scale=False, **kwargs): - super(TFAttention, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = config.output_attentions n_state = nx # in Attention: n_state=768 (nx=n_embd) @@ -157,7 +157,7 @@ class TFAttention(tf.keras.layers.Layer): class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): - super(TFMLP, self).__init__(**kwargs) + super().__init__(**kwargs) nx = config.n_embd self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") @@ -173,7 +173,7 @@ class TFMLP(tf.keras.layers.Layer): class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): - super(TFBlock, self).__init__(**kwargs) + super().__init__(**kwargs) nx = config.n_embd self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") @@ -198,7 +198,7 @@ class TFBlock(tf.keras.layers.Layer): class TFGPT2MainLayer(tf.keras.layers.Layer): def __init__(self, config, *inputs, **kwargs): - super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.num_hidden_layers = config.n_layer @@ -475,7 +475,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFGPT2Model, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name="transformer") def call(self, inputs, **kwargs): @@ -521,7 +521,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name="transformer") def get_output_embeddings(self): @@ -598,7 +598,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) config.num_labels = 1 self.transformer = TFGPT2MainLayer(config, name="transformer") self.multiple_choice_head = TFSequenceSummary( diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py index 4a17aa01bf..286f11f467 100644 --- a/src/transformers/modeling_tf_openai.py +++ b/src/transformers/modeling_tf_openai.py @@ -66,7 +66,7 @@ ACT_FNS = { class TFAttention(tf.keras.layers.Layer): def __init__(self, nx, n_ctx, config, scale=False, **kwargs): - super(TFAttention, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = config.output_attentions n_state = nx # in Attention: n_state=768 (nx=n_embd) @@ -160,7 +160,7 @@ class TFAttention(tf.keras.layers.Layer): class TFMLP(tf.keras.layers.Layer): def __init__(self, n_state, config, **kwargs): - super(TFMLP, self).__init__(**kwargs) + super().__init__(**kwargs) nx = config.n_embd self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc") self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj") @@ -176,7 +176,7 @@ class TFMLP(tf.keras.layers.Layer): class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): - super(TFBlock, self).__init__(**kwargs) + super().__init__(**kwargs) nx = config.n_embd self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") @@ -199,7 +199,7 @@ class TFBlock(tf.keras.layers.Layer): class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): def __init__(self, config, *inputs, **kwargs): - super(TFOpenAIGPTMainLayer, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.output_hidden_states = config.output_hidden_states self.output_attentions = config.output_attentions self.num_hidden_layers = config.n_layer @@ -453,7 +453,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") def call(self, inputs, **kwargs): @@ -494,7 +494,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") def get_output_embeddings(self): @@ -563,7 +563,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) config.num_labels = 1 self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") self.multiple_choice_head = TFSequenceSummary( diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py index 75241c5a35..2821236f57 100644 --- a/src/transformers/modeling_tf_roberta.py +++ b/src/transformers/modeling_tf_roberta.py @@ -42,7 +42,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings): """ def __init__(self, config, **kwargs): - super(TFRobertaEmbeddings, self).__init__(config, **kwargs) + super().__init__(config, **kwargs) self.padding_idx = 1 def create_position_ids_from_input_ids(self, x): @@ -78,7 +78,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings): else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) - return super(TFRobertaEmbeddings, self)._embedding( + return super()._embedding( [input_ids, position_ids, token_type_ids, inputs_embeds], training=training ) @@ -89,7 +89,7 @@ class TFRobertaMainLayer(TFBertMainLayer): """ def __init__(self, config, **kwargs): - super(TFRobertaMainLayer, self).__init__(config, **kwargs) + super().__init__(config, **kwargs) self.embeddings = TFRobertaEmbeddings(config, name="embeddings") def get_input_embeddings(self): @@ -234,7 +234,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFRobertaModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, name="roberta") def call(self, inputs, **kwargs): @@ -246,7 +246,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): """Roberta Head for masked language modeling.""" def __init__(self, config, input_embeddings, **kwargs): - super(TFRobertaLMHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = config.vocab_size self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" @@ -260,7 +260,7 @@ class TFRobertaLMHead(tf.keras.layers.Layer): def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") - super(TFRobertaLMHead, self).build(input_shape) + super().build(input_shape) def call(self, features): x = self.dense(features) @@ -305,7 +305,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.roberta = TFRobertaMainLayer(config, name="roberta") self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head") @@ -328,7 +328,7 @@ class TFRobertaClassificationHead(tf.keras.layers.Layer): """Head for sentence-level classification tasks.""" def __init__(self, config, **kwargs): - super(TFRobertaClassificationHead, self).__init__(config, **kwargs) + super().__init__(config, **kwargs) self.dense = tf.keras.layers.Dense( config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), @@ -383,7 +383,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") @@ -433,7 +433,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.roberta = TFRobertaMainLayer(config, name="roberta") diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py index 89430b8cab..db62e784b1 100644 --- a/src/transformers/modeling_tf_t5.py +++ b/src/transformers/modeling_tf_t5.py @@ -50,13 +50,13 @@ class TFT5LayerNorm(tf.keras.layers.Layer): """ Construct a layernorm module in the T5 style No bias and no substraction of mean. """ - super(TFT5LayerNorm, self).__init__(**kwargs) + super().__init__(**kwargs) self.variance_epsilon = epsilon def build(self, input_shape): """Build shared word embedding layer """ self.weight = self.add_weight("weight", shape=(input_shape[-1],), initializer="ones") - super(TFT5LayerNorm, self).build(input_shape) + super().build(input_shape) def call(self, x): variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True) @@ -66,7 +66,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer): class TFT5DenseReluDense(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFT5DenseReluDense, self).__init__(**kwargs) + super().__init__(**kwargs) self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name="wi") self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name="wo") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) @@ -82,7 +82,7 @@ class TFT5DenseReluDense(tf.keras.layers.Layer): class TFT5LayerFF(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFT5LayerFF, self).__init__(**kwargs) + super().__init__(**kwargs) self.DenseReluDense = TFT5DenseReluDense(config, name="DenseReluDense") self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) @@ -98,7 +98,7 @@ class TFT5Attention(tf.keras.layers.Layer): NEW_ID = itertools.count() def __init__(self, config, has_relative_attention_bias=False, **kwargs): - super(TFT5Attention, self).__init__(**kwargs) + super().__init__(**kwargs) self.layer_id = next(TFT5Attention.NEW_ID) self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias @@ -259,7 +259,7 @@ class TFT5Attention(tf.keras.layers.Layer): class TFT5LayerSelfAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): - super(TFT5LayerSelfAttention, self).__init__(**kwargs) + super().__init__(**kwargs) self.SelfAttention = TFT5Attention( config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention" ) @@ -279,7 +279,7 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer): class TFT5LayerCrossAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): - super(TFT5LayerCrossAttention, self).__init__(**kwargs) + super().__init__(**kwargs) self.EncDecAttention = TFT5Attention( config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention" ) @@ -299,7 +299,7 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer): class TFT5Block(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): - super(TFT5Block, self).__init__(**kwargs) + super().__init__(**kwargs) self.is_decoder = config.is_decoder self.layer = [] self.layer.append( @@ -361,7 +361,7 @@ class TFT5Block(tf.keras.layers.Layer): #################################################### class TFT5MainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFT5MainLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.is_decoder = config.is_decoder @@ -633,7 +633,7 @@ class TFT5Model(TFT5PreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFT5Model, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared") encoder_config = copy.deepcopy(config) @@ -724,7 +724,7 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.model_dim = config.d_model self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, name="shared") diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py index a3530c6ca7..91a4c36296 100644 --- a/src/transformers/modeling_tf_transfo_xl.py +++ b/src/transformers/modeling_tf_transfo_xl.py @@ -36,7 +36,7 @@ TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = { class TFPositionalEmbedding(tf.keras.layers.Layer): def __init__(self, demb, **kwargs): - super(TFPositionalEmbedding, self).__init__(**kwargs) + super().__init__(**kwargs) self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb)) @@ -52,7 +52,7 @@ class TFPositionalEmbedding(tf.keras.layers.Layer): class TFPositionwiseFF(tf.keras.layers.Layer): def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs): - super(TFPositionwiseFF, self).__init__(**kwargs) + super().__init__(**kwargs) self.d_model = d_model self.d_inner = d_inner @@ -112,7 +112,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): init_std=0.02, **kwargs ): - super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = output_attentions self.n_head = n_head @@ -155,7 +155,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): self.r_w_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias" ) - super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape) + super().build(input_shape) def _rel_shift(self, x): x_size = shape_list(x) @@ -267,7 +267,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): init_std=0.02, **kwargs ): - super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.dec_attn = TFRelPartialLearnableMultiHeadAttn( n_head, @@ -308,7 +308,7 @@ class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer): class TFAdaptiveEmbedding(tf.keras.layers.Layer): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs): - super(TFAdaptiveEmbedding, self).__init__(**kwargs) + super().__init__(**kwargs) self.n_token = n_token self.d_embed = d_embed @@ -350,7 +350,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): name="emb_projs_._{}".format(i), ) ) - super(TFAdaptiveEmbedding, self).build(input_shape) + super().build(input_shape) def call(self, inp): if self.div_val == 1: @@ -380,7 +380,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): class TFTransfoXLMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFTransfoXLMainLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -455,7 +455,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): self.r_r_bias = self.add_weight( shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias" ) - super(TFTransfoXLMainLayer, self).build(input_shape) + super().build(input_shape) def get_input_embeddings(self): return self.word_emb @@ -728,7 +728,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFTransfoXLMainLayer(config, name="transformer") def call(self, inputs, **kwargs): @@ -774,7 +774,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): """ def __init__(self, config): - super(TFTransfoXLLMHeadModel, self).__init__(config) + super().__init__(config) self.transformer = TFTransfoXLMainLayer(config, name="transformer") self.sample_softmax = config.sample_softmax # use sampled softmax diff --git a/src/transformers/modeling_tf_transfo_xl_utilities.py b/src/transformers/modeling_tf_transfo_xl_utilities.py index 23ffb639f7..1f6edf3a9b 100644 --- a/src/transformers/modeling_tf_transfo_xl_utilities.py +++ b/src/transformers/modeling_tf_transfo_xl_utilities.py @@ -24,7 +24,7 @@ from .modeling_tf_utils import shape_list class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): - super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.d_embed = d_embed @@ -98,7 +98,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): name="out_layers_._{}_._bias".format(i), ) self.out_layers.append((weight, bias)) - super(TFAdaptiveSoftmaxMask, self).build(input_shape) + super().build(input_shape) @staticmethod def _logit(x, W, b, proj=None): diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index dacf4cd752..490ced3eb4 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -78,7 +78,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin): return {"input_ids": tf.constant(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): - super(TFPreTrainedModel, self).__init__(*inputs, **kwargs) + super().__init__(*inputs, **kwargs) if not isinstance(config, PretrainedConfig): raise ValueError( "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " @@ -385,7 +385,7 @@ class TFConv1D(tf.keras.layers.Layer): """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) Basically works like a Linear layer but the weights are transposed """ - super(TFConv1D, self).__init__(**kwargs) + super().__init__(**kwargs) self.nf = nf self.nx = nx self.initializer_range = initializer_range @@ -412,7 +412,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): """ def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs): - super(TFSharedEmbeddings, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range @@ -425,7 +425,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): self.weight = self.add_weight( "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) ) - super(TFSharedEmbeddings, self).build(input_shape) + super().build(input_shape) def call(self, inputs, mode="embedding"): """Get token embeddings of inputs. @@ -485,7 +485,7 @@ class TFSequenceSummary(tf.keras.layers.Layer): """ def __init__(self, config, initializer_range=0.02, **kwargs): - super(TFSequenceSummary, self).__init__(**kwargs) + super().__init__(**kwargs) self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last" if self.summary_type == "attn": diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py index b7b827cb25..c82f5a69bd 100644 --- a/src/transformers/modeling_tf_xlm.py +++ b/src/transformers/modeling_tf_xlm.py @@ -97,7 +97,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): NEW_ID = itertools.count() def __init__(self, n_heads, dim, config, **kwargs): - super(TFMultiHeadAttention, self).__init__(**kwargs) + super().__init__(**kwargs) self.layer_id = next(TFMultiHeadAttention.NEW_ID) self.output_attentions = config.output_attentions self.dim = dim @@ -182,7 +182,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): class TFTransformerFFN(tf.keras.layers.Layer): def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): - super(TFTransformerFFN, self).__init__(**kwargs) + super().__init__(**kwargs) self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1") self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2") self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu @@ -198,7 +198,7 @@ class TFTransformerFFN(tf.keras.layers.Layer): class TFXLMMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFXLMMainLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -608,7 +608,7 @@ class TFXLMModel(TFXLMPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXLMModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") def call(self, inputs, **kwargs): @@ -622,7 +622,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer): """ def __init__(self, config, input_embeddings, **kwargs): - super(TFXLMPredLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.asm = config.asm self.n_words = config.n_words self.pad_index = config.pad_index @@ -641,7 +641,7 @@ class TFXLMPredLayer(tf.keras.layers.Layer): def build(self, input_shape): # The output weights are the same as the input embeddings, but there is an output-only bias for each token. self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias") - super(TFXLMPredLayer, self).build(input_shape) + super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") @@ -682,7 +682,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj") @@ -733,7 +733,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXLMMainLayer(config, name="transformer") @@ -784,7 +784,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFXLMMainLayer(config, name="transformer") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs" diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py index 1827086f05..bafbc31d87 100644 --- a/src/transformers/modeling_tf_xlnet.py +++ b/src/transformers/modeling_tf_xlnet.py @@ -57,7 +57,7 @@ ACT2FN = { class TFXLNetRelativeAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFXLNetRelativeAttention, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = config.output_attentions if config.d_model % config.n_head != 0: @@ -104,7 +104,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): self.seg_embed = self.add_weight( shape=(2, self.n_head, self.d_head), initializer=initializer, trainable=True, name="seg_embed" ) - super(TFXLNetRelativeAttention, self).build(input_shape) + super().build(input_shape) def prune_heads(self, heads): raise NotImplementedError @@ -280,7 +280,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): class TFXLNetFeedForward(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFXLNetFeedForward, self).__init__(**kwargs) + super().__init__(**kwargs) self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") self.layer_1 = tf.keras.layers.Dense( config.d_inner, kernel_initializer=get_initializer(config.initializer_range), name="layer_1" @@ -307,7 +307,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer): class TFXLNetLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFXLNetLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.rel_attn = TFXLNetRelativeAttention(config, name="rel_attn") self.ff = TFXLNetFeedForward(config, name="ff") self.dropout = tf.keras.layers.Dropout(config.dropout) @@ -326,7 +326,7 @@ class TFXLNetLayer(tf.keras.layers.Layer): class TFXLNetLMHead(tf.keras.layers.Layer): def __init__(self, config, input_embeddings, **kwargs): - super(TFXLNetLMHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = config.vocab_size # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. @@ -334,7 +334,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer): def build(self, input_shape): self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") - super(TFXLNetLMHead, self).build(input_shape) + super().build(input_shape) def call(self, hidden_states): hidden_states = self.input_embeddings(hidden_states, mode="linear") @@ -344,7 +344,7 @@ class TFXLNetLMHead(tf.keras.layers.Layer): class TFXLNetMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFXLNetMainLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.output_past = config.output_past @@ -832,7 +832,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXLNetModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") def call(self, inputs, **kwargs): @@ -885,7 +885,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name="lm_loss") @@ -940,7 +940,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXLNetMainLayer(config, name="transformer") @@ -1001,7 +1001,7 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXLNetMainLayer(config, name="transformer") @@ -1058,7 +1058,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" @@ -1127,7 +1127,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): # """ # def __init__(self, config, *inputs, **kwargs): -# super(TFXLNetForQuestionAnswering, self).__init__(config, *inputs, **kwargs) +# super().__init__(config, *inputs, **kwargs) # self.start_n_top = config.start_n_top # self.end_n_top = config.end_n_top diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py index 3977a4b895..1266c3a355 100644 --- a/src/transformers/modeling_transfo_xl.py +++ b/src/transformers/modeling_transfo_xl.py @@ -165,7 +165,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): class PositionalEmbedding(nn.Module): def __init__(self, demb): - super(PositionalEmbedding, self).__init__() + super().__init__() self.demb = demb @@ -184,7 +184,7 @@ class PositionalEmbedding(nn.Module): class PositionwiseFF(nn.Module): def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5): - super(PositionwiseFF, self).__init__() + super().__init__() self.d_model = d_model self.d_inner = d_inner @@ -236,7 +236,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): output_attentions=False, layer_norm_epsilon=1e-5, ): - super(RelPartialLearnableMultiHeadAttn, self).__init__() + super().__init__() self.output_attentions = output_attentions self.n_head = n_head @@ -368,7 +368,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): class RelPartialLearnableDecoderLayer(nn.Module): def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs): - super(RelPartialLearnableDecoderLayer, self).__init__() + super().__init__() self.dec_attn = RelPartialLearnableMultiHeadAttn( n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs @@ -389,7 +389,7 @@ class RelPartialLearnableDecoderLayer(nn.Module): class AdaptiveEmbedding(nn.Module): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False): - super(AdaptiveEmbedding, self).__init__() + super().__init__() self.n_token = n_token self.d_embed = d_embed @@ -587,7 +587,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): """ def __init__(self, config): - super(TransfoXLModel, self).__init__(config) + super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -845,7 +845,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): """ def __init__(self, config): - super(TransfoXLLMHeadModel, self).__init__(config) + super().__init__(config) self.transformer = TransfoXLModel(config) self.sample_softmax = config.sample_softmax # use sampled softmax diff --git a/src/transformers/modeling_transfo_xl_utilities.py b/src/transformers/modeling_transfo_xl_utilities.py index 63900c7b80..ef12316673 100644 --- a/src/transformers/modeling_transfo_xl_utilities.py +++ b/src/transformers/modeling_transfo_xl_utilities.py @@ -29,7 +29,7 @@ import torch.nn.functional as F class ProjectedAdaptiveLogSoftmax(nn.Module): def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False): - super(ProjectedAdaptiveLogSoftmax, self).__init__() + super().__init__() self.n_token = n_token self.d_embed = d_embed diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 9543f3bdc0..b2483407e3 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -47,7 +47,7 @@ except ImportError: """ def __init__(self, *args, **kwargs): - super(Identity, self).__init__() + super().__init__() def forward(self, input): return input @@ -97,7 +97,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin): return {"input_ids": torch.tensor(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): - super(PreTrainedModel, self).__init__() + super().__init__() if not isinstance(config, PretrainedConfig): raise ValueError( "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " @@ -1102,7 +1102,7 @@ class Conv1D(nn.Module): """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) Basically works like a Linear layer but the weights are transposed """ - super(Conv1D, self).__init__() + super().__init__() self.nf = nf w = torch.empty(nx, nf) nn.init.normal_(w, std=0.02) @@ -1120,7 +1120,7 @@ class PoolerStartLogits(nn.Module): """ Compute SQuAD start_logits from sequence hidden states. """ def __init__(self, config): - super(PoolerStartLogits, self).__init__() + super().__init__() self.dense = nn.Linear(config.hidden_size, 1) def forward(self, hidden_states, p_mask=None): @@ -1145,7 +1145,7 @@ class PoolerEndLogits(nn.Module): """ def __init__(self, config): - super(PoolerEndLogits, self).__init__() + super().__init__() self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) self.activation = nn.Tanh() self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -1191,7 +1191,7 @@ class PoolerAnswerClass(nn.Module): """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ def __init__(self, config): - super(PoolerAnswerClass, self).__init__() + super().__init__() self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) self.activation = nn.Tanh() self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) @@ -1276,7 +1276,7 @@ class SQuADHead(nn.Module): """ def __init__(self, config): - super(SQuADHead, self).__init__() + super().__init__() self.start_n_top = config.start_n_top self.end_n_top = config.end_n_top @@ -1368,7 +1368,7 @@ class SequenceSummary(nn.Module): """ def __init__(self, config): - super(SequenceSummary, self).__init__() + super().__init__() self.summary_type = config.summary_type if hasattr(config, "summary_type") else "last" if self.summary_type == "attn": diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py index 8d1836b7ba..74f9574e42 100644 --- a/src/transformers/modeling_xlm.py +++ b/src/transformers/modeling_xlm.py @@ -96,7 +96,7 @@ class MultiHeadAttention(nn.Module): NEW_ID = itertools.count() def __init__(self, n_heads, dim, config): - super(MultiHeadAttention, self).__init__() + super().__init__() self.layer_id = next(MultiHeadAttention.NEW_ID) self.output_attentions = config.output_attentions self.dim = dim @@ -197,7 +197,7 @@ class MultiHeadAttention(nn.Module): class TransformerFFN(nn.Module): def __init__(self, in_dim, dim_hidden, out_dim, config): - super(TransformerFFN, self).__init__() + super().__init__() self.dropout = config.dropout self.lin1 = nn.Linear(in_dim, dim_hidden) self.lin2 = nn.Linear(dim_hidden, out_dim) @@ -222,7 +222,7 @@ class XLMPreTrainedModel(PreTrainedModel): base_model_prefix = "transformer" def __init__(self, *inputs, **kwargs): - super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs) + super().__init__(*inputs, **kwargs) @property def dummy_inputs(self): @@ -354,7 +354,7 @@ class XLMModel(XLMPreTrainedModel): """ def __init__(self, config): # , dico, is_encoder, with_output): - super(XLMModel, self).__init__(config) + super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states @@ -585,7 +585,7 @@ class XLMPredLayer(nn.Module): """ def __init__(self, config): - super(XLMPredLayer, self).__init__() + super().__init__() self.asm = config.asm self.n_words = config.n_words self.pad_index = config.pad_index @@ -661,7 +661,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): """ def __init__(self, config): - super(XLMWithLMHeadModel, self).__init__(config) + super().__init__(config) self.transformer = XLMModel(config) self.pred_layer = XLMPredLayer(config) @@ -754,7 +754,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel): """ def __init__(self, config): - super(XLMForSequenceClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.transformer = XLMModel(config) @@ -856,7 +856,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): """ def __init__(self, config): - super(XLMForQuestionAnsweringSimple, self).__init__(config) + super().__init__(config) self.transformer = XLMModel(config) self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) @@ -973,7 +973,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): """ def __init__(self, config): - super(XLMForQuestionAnswering, self).__init__(config) + super().__init__(config) self.transformer = XLMModel(config) self.qa_outputs = SQuADHead(config) diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index f6e76dc197..87147338d3 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -204,7 +204,7 @@ XLNetLayerNorm = nn.LayerNorm class XLNetRelativeAttention(nn.Module): def __init__(self, config): - super(XLNetRelativeAttention, self).__init__() + super().__init__() self.output_attentions = config.output_attentions if config.d_model % config.n_head != 0: @@ -414,7 +414,7 @@ class XLNetRelativeAttention(nn.Module): class XLNetFeedForward(nn.Module): def __init__(self, config): - super(XLNetFeedForward, self).__init__() + super().__init__() self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps) self.layer_1 = nn.Linear(config.d_model, config.d_inner) self.layer_2 = nn.Linear(config.d_inner, config.d_model) @@ -437,7 +437,7 @@ class XLNetFeedForward(nn.Module): class XLNetLayer(nn.Module): def __init__(self, config): - super(XLNetLayer, self).__init__() + super().__init__() self.rel_attn = XLNetRelativeAttention(config) self.ff = XLNetFeedForward(config) self.dropout = nn.Dropout(config.dropout) @@ -631,7 +631,7 @@ class XLNetModel(XLNetPreTrainedModel): """ def __init__(self, config): - super(XLNetModel, self).__init__(config) + super().__init__(config) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states self.output_past = config.output_past @@ -996,7 +996,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): """ def __init__(self, config): - super(XLNetLMHeadModel, self).__init__(config) + super().__init__(config) self.attn_type = config.attn_type self.same_length = config.same_length @@ -1119,7 +1119,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): """ def __init__(self, config): - super(XLNetForSequenceClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.transformer = XLNetModel(config) @@ -1234,7 +1234,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): """ def __init__(self, config): - super(XLNetForTokenClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.transformer = XLNetModel(config) @@ -1355,7 +1355,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): """ def __init__(self, config): - super(XLNetForMultipleChoice, self).__init__(config) + super().__init__(config) self.transformer = XLNetModel(config) self.sequence_summary = SequenceSummary(config) @@ -1463,7 +1463,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): """ def __init__(self, config): - super(XLNetForQuestionAnsweringSimple, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.transformer = XLNetModel(config) @@ -1595,7 +1595,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): """ def __init__(self, config): - super(XLNetForQuestionAnswering, self).__init__(config) + super().__init__(config) self.start_n_top = config.start_n_top self.end_n_top = config.end_n_top diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py index 814a0c5ba1..5ab7647638 100644 --- a/src/transformers/optimization.py +++ b/src/transformers/optimization.py @@ -114,7 +114,7 @@ class AdamW(Optimizer): if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias) - super(AdamW, self).__init__(params, defaults) + super().__init__(params, defaults) def step(self, closure=None): """Performs a single optimization step. diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index d4e0635719..d232370905 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -24,7 +24,7 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): """Applys a warmup schedule on a given learning rate decay schedule.""" def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None): - super(WarmUp, self).__init__() + super().__init__() self.initial_learning_rate = initial_learning_rate self.warmup_steps = warmup_steps self.power = power @@ -102,7 +102,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): name="AdamWeightDecay", **kwargs ): - super(AdamWeightDecay, self).__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) + super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) self.weight_decay_rate = weight_decay_rate self._include_in_weight_decay = include_in_weight_decay self._exclude_from_weight_decay = exclude_from_weight_decay @@ -111,10 +111,10 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): def from_config(cls, config): """Creates an optimizer from its config with WarmUp custom object.""" custom_objects = {"WarmUp": WarmUp} - return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects) + return super().from_config(config, custom_objects=custom_objects) def _prepare_local(self, var_device, var_dtype, apply_state): - super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state) + super()._prepare_local(var_device, var_dtype, apply_state) apply_state["weight_decay_rate"] = tf.constant(self.weight_decay_rate, name="adam_weight_decay_rate") def _decay_weights_op(self, var, learning_rate, apply_state): @@ -128,7 +128,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): def apply_gradients(self, grads_and_vars, clip_norm, name=None): grads, tvars = list(zip(*grads_and_vars)) (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) - return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars)) + return super().apply_gradients(zip(grads, tvars)) def _get_lr(self, var_device, var_dtype, apply_state): """Retrieves the learning rate with the given state.""" @@ -147,16 +147,16 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) decay = self._decay_weights_op(var, lr_t, apply_state) with tf.control_dependencies([decay]): - return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs) + return super()._resource_apply_dense(grad, var, **kwargs) def _resource_apply_sparse(self, grad, var, indices, apply_state=None): lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) decay = self._decay_weights_op(var, lr_t, apply_state) with tf.control_dependencies([decay]): - return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs) + return super()._resource_apply_sparse(grad, var, indices, **kwargs) def get_config(self): - config = super(AdamWeightDecay, self).get_config() + config = super().get_config() config.update({"weight_decay_rate": self.weight_decay_rate}) return config diff --git a/src/transformers/tokenization_albert.py b/src/transformers/tokenization_albert.py index f8d058a725..985f82c6fd 100644 --- a/src/transformers/tokenization_albert.py +++ b/src/transformers/tokenization_albert.py @@ -79,7 +79,7 @@ class AlbertTokenizer(PreTrainedTokenizer): mask_token="[MASK]", **kwargs ): - super(AlbertTokenizer, self).__init__( + super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py index 1e817c54a3..ac14370a18 100644 --- a/src/transformers/tokenization_bert.py +++ b/src/transformers/tokenization_bert.py @@ -163,7 +163,7 @@ class BertTokenizer(PreTrainedTokenizer): This should likely be deactivated for Japanese: see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 """ - super(BertTokenizer, self).__init__( + super().__init__( unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, @@ -554,7 +554,7 @@ class BertTokenizerFast(PreTrainedTokenizerFast): add_special_tokens=True, **kwargs ): - super(BertTokenizerFast, self).__init__( + super().__init__( unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, diff --git a/src/transformers/tokenization_bert_japanese.py b/src/transformers/tokenization_bert_japanese.py index 346d12360c..40a5d535c0 100644 --- a/src/transformers/tokenization_bert_japanese.py +++ b/src/transformers/tokenization_bert_japanese.py @@ -107,7 +107,7 @@ class BertJapaneseTokenizer(BertTokenizer): **subword_tokenizer_type**: (`optional`) string (default "wordpiece") Type of subword tokenizer. """ - super(BertTokenizer, self).__init__( + super().__init__( unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, diff --git a/src/transformers/tokenization_camembert.py b/src/transformers/tokenization_camembert.py index 038856be8b..ce0001a743 100644 --- a/src/transformers/tokenization_camembert.py +++ b/src/transformers/tokenization_camembert.py @@ -66,7 +66,7 @@ class CamembertTokenizer(PreTrainedTokenizer): additional_special_tokens=["NOTUSED", "NOTUSED"], **kwargs ): - super(CamembertTokenizer, self).__init__( + super().__init__( max_len=512, bos_token=bos_token, eos_token=eos_token, diff --git a/src/transformers/tokenization_ctrl.py b/src/transformers/tokenization_ctrl.py index 99aa59d59d..1f2184f0a1 100644 --- a/src/transformers/tokenization_ctrl.py +++ b/src/transformers/tokenization_ctrl.py @@ -126,7 +126,7 @@ class CTRLTokenizer(PreTrainedTokenizer): control_codes = CONTROL_CODES def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): - super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs) + super().__init__(unk_token=unk_token, **kwargs) self.max_len_single_sentence = ( self.max_len ) # no default special tokens - you can update this value if you add special tokens diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py index c99ec08ffc..d733f6645a 100644 --- a/src/transformers/tokenization_gpt2.py +++ b/src/transformers/tokenization_gpt2.py @@ -122,7 +122,7 @@ class GPT2Tokenizer(PreTrainedTokenizer): eos_token="<|endoftext|>", **kwargs ): - super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) + super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) self.max_len_single_sentence = ( self.max_len ) # no default special tokens - you can update this value if you add special tokens @@ -268,7 +268,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast): truncation_strategy="longest_first", **kwargs ): - super(GPT2TokenizerFast, self).__init__( + super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs ) diff --git a/src/transformers/tokenization_openai.py b/src/transformers/tokenization_openai.py index 25ea0f88ef..eca9f81c3e 100644 --- a/src/transformers/tokenization_openai.py +++ b/src/transformers/tokenization_openai.py @@ -82,7 +82,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): - super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs) + super().__init__(unk_token=unk_token, **kwargs) self.max_len_single_sentence = ( self.max_len diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py index e14f12f449..caaaf98cd0 100644 --- a/src/transformers/tokenization_roberta.py +++ b/src/transformers/tokenization_roberta.py @@ -84,7 +84,7 @@ class RobertaTokenizer(GPT2Tokenizer): mask_token="", **kwargs ): - super(RobertaTokenizer, self).__init__( + super().__init__( vocab_file=vocab_file, merges_file=merges_file, errors=errors, diff --git a/src/transformers/tokenization_t5.py b/src/transformers/tokenization_t5.py index b010843d42..2196cc82e7 100644 --- a/src/transformers/tokenization_t5.py +++ b/src/transformers/tokenization_t5.py @@ -91,7 +91,7 @@ class T5Tokenizer(PreTrainedTokenizer): additional_special_tokens = [] additional_special_tokens.extend(["".format(i) for i in range(extra_ids)]) - super(T5Tokenizer, self).__init__( + super().__init__( eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py index b27544249e..9d847e6f8c 100644 --- a/src/transformers/tokenization_transfo_xl.py +++ b/src/transformers/tokenization_transfo_xl.py @@ -78,7 +78,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer): additional_special_tokens=[""], **kwargs ): - super(TransfoXLTokenizer, self).__init__( + super().__init__( unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs ) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index c697cfa969..f05ad23654 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -1425,7 +1425,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer): _decoder = None def __init__(self, **kwargs): - super(PreTrainedTokenizerFast, self).__init__(**kwargs) + super().__init__(**kwargs) @property def tokenizer(self): diff --git a/src/transformers/tokenization_xlm.py b/src/transformers/tokenization_xlm.py index 676321a164..518f3dd7ff 100644 --- a/src/transformers/tokenization_xlm.py +++ b/src/transformers/tokenization_xlm.py @@ -578,7 +578,7 @@ class XLMTokenizer(PreTrainedTokenizer): do_lowercase_and_remove_accent=True, **kwargs ): - super(XLMTokenizer, self).__init__( + super().__init__( unk_token=unk_token, bos_token=bos_token, sep_token=sep_token, diff --git a/src/transformers/tokenization_xlm_roberta.py b/src/transformers/tokenization_xlm_roberta.py index 5fe624e6bd..0b373b1eea 100644 --- a/src/transformers/tokenization_xlm_roberta.py +++ b/src/transformers/tokenization_xlm_roberta.py @@ -75,7 +75,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): mask_token="", **kwargs ): - super(XLMRobertaTokenizer, self).__init__( + super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, diff --git a/src/transformers/tokenization_xlnet.py b/src/transformers/tokenization_xlnet.py index 14f0d26b72..fe5036fb43 100644 --- a/src/transformers/tokenization_xlnet.py +++ b/src/transformers/tokenization_xlnet.py @@ -77,7 +77,7 @@ class XLNetTokenizer(PreTrainedTokenizer): additional_special_tokens=["", ""], **kwargs ): - super(XLNetTokenizer, self).__init__( + super().__init__( bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py index 3c3c28a8c0..d23bce43d2 100644 --- a/templates/adding_a_new_model/configuration_xxx.py +++ b/templates/adding_a_new_model/configuration_xxx.py @@ -80,7 +80,7 @@ class XxxConfig(PretrainedConfig): summary_first_dropout=0.1, **kwargs ): - super(XxxConfig, self).__init__(**kwargs) + super().__init__(**kwargs) self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index 520007ff85..4e3791e481 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -69,7 +69,7 @@ TFXxxOutput = tf.keras.layers.Layer class TFXxxLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFXxxLayer, self).__init__(**kwargs) + super().__init__(**kwargs) self.attention = TFXxxAttention(config, name="attention") self.intermediate = TFXxxIntermediate(config, name="intermediate") self.transformer_output = TFXxxOutput(config, name="output") @@ -91,7 +91,7 @@ class TFXxxLayer(tf.keras.layers.Layer): #################################################### class TFXxxMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFXxxMainLayer, self).__init__(**kwargs) + super().__init__(**kwargs) def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models @@ -307,7 +307,7 @@ class TFXxxModel(TFXxxPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXxxModel, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFXxxMainLayer(config, name="transformer") def call(self, inputs, **kwargs): @@ -348,7 +348,7 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.transformer = TFXxxMainLayer(config, name="transformer") self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm") @@ -397,7 +397,7 @@ class TFXxxForSequenceClassification(TFXxxPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXxxMainLayer(config, name="transformer") @@ -452,7 +452,7 @@ class TFXxxForTokenClassification(TFXxxPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXxxMainLayer(config, name="transformer") @@ -509,7 +509,7 @@ class TFXxxForQuestionAnswering(TFXxxPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): - super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs) + super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels self.transformer = TFXxxMainLayer(config, name="transformer") diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index 0431f3936f..f9f4daa950 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -138,7 +138,7 @@ XxxOutput = nn.Module class XxxLayer(nn.Module): def __init__(self, config): - super(XxxLayer, self).__init__() + super().__init__() self.attention = XxxAttention(config) self.intermediate = XxxIntermediate(config) self.output = XxxOutput(config) @@ -298,7 +298,7 @@ class XxxModel(XxxPreTrainedModel): """ def __init__(self, config): - super(XxxModel, self).__init__(config) + super().__init__(config) self.embeddings = XxxEmbeddings(config) self.encoder = XxxEncoder(config) @@ -426,7 +426,7 @@ class XxxForMaskedLM(XxxPreTrainedModel): """ def __init__(self, config): - super(XxxForMaskedLM, self).__init__(config) + super().__init__(config) self.transformer = XxxModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size) @@ -507,7 +507,7 @@ class XxxForSequenceClassification(XxxPreTrainedModel): """ def __init__(self, config): - super(XxxForSequenceClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.transformer = XxxModel(config) @@ -593,7 +593,7 @@ class XxxForTokenClassification(XxxPreTrainedModel): """ def __init__(self, config): - super(XxxForTokenClassification, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.transformer = XxxModel(config) @@ -692,7 +692,7 @@ class XxxForQuestionAnswering(XxxPreTrainedModel): """ def __init__(self, config): - super(XxxForQuestionAnswering, self).__init__(config) + super().__init__(config) self.num_labels = config.num_labels self.transformer = XxxModel(config) diff --git a/templates/adding_a_new_model/tests/test_tokenization_xxx.py b/templates/adding_a_new_model/tests/test_tokenization_xxx.py index cca0a1a15b..1a24f76b0f 100644 --- a/templates/adding_a_new_model/tests/test_tokenization_xxx.py +++ b/templates/adding_a_new_model/tests/test_tokenization_xxx.py @@ -27,7 +27,7 @@ class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XxxTokenizer def setUp(self): - super(XxxTokenizationTest, self).setUp() + super().setUp() vocab_tokens = [ "[UNK]", diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py index dabb14be9a..667a130a9b 100644 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ b/templates/adding_a_new_model/tokenization_xxx.py @@ -109,7 +109,7 @@ class XxxTokenizer(PreTrainedTokenizer): Whether to lower case the input Only has an effect when do_basic_tokenize=True """ - super(XxxTokenizer, self).__init__( + super().__init__( unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, diff --git a/tests/test_tokenization_albert.py b/tests/test_tokenization_albert.py index 3b325fb7bf..c190d8ed82 100644 --- a/tests/test_tokenization_albert.py +++ b/tests/test_tokenization_albert.py @@ -30,7 +30,7 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = AlbertTokenizer def setUp(self): - super(AlbertTokenizationTest, self).setUp() + super().setUp() # We have a SentencePiece fixture for testing tokenizer = AlbertTokenizer(SAMPLE_VOCAB) diff --git a/tests/test_tokenization_bert.py b/tests/test_tokenization_bert.py index c83611206e..793bb8fa54 100644 --- a/tests/test_tokenization_bert.py +++ b/tests/test_tokenization_bert.py @@ -38,7 +38,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True def setUp(self): - super(BertTokenizationTest, self).setUp() + super().setUp() vocab_tokens = [ "[UNK]", diff --git a/tests/test_tokenization_bert_japanese.py b/tests/test_tokenization_bert_japanese.py index 8680296d6b..4900ff49da 100644 --- a/tests/test_tokenization_bert_japanese.py +++ b/tests/test_tokenization_bert_japanese.py @@ -35,7 +35,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BertJapaneseTokenizer def setUp(self): - super(BertJapaneseTokenizationTest, self).setUp() + super().setUp() vocab_tokens = [ "[UNK]", @@ -135,7 +135,7 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC tokenizer_class = BertJapaneseTokenizer def setUp(self): - super(BertJapaneseCharacterTokenizationTest, self).setUp() + super().setUp() vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"] diff --git a/tests/test_tokenization_ctrl.py b/tests/test_tokenization_ctrl.py index 1cea3f17e3..8b57dc49d3 100644 --- a/tests/test_tokenization_ctrl.py +++ b/tests/test_tokenization_ctrl.py @@ -27,7 +27,7 @@ class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = CTRLTokenizer def setUp(self): - super(CTRLTokenizationTest, self).setUp() + super().setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", ""] diff --git a/tests/test_tokenization_gpt2.py b/tests/test_tokenization_gpt2.py index 1967b7a758..12b7b0eeb1 100644 --- a/tests/test_tokenization_gpt2.py +++ b/tests/test_tokenization_gpt2.py @@ -29,7 +29,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True def setUp(self): - super(GPT2TokenizationTest, self).setUp() + super().setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = [ diff --git a/tests/test_tokenization_openai.py b/tests/test_tokenization_openai.py index a972ebe24f..f89ec61ff6 100644 --- a/tests/test_tokenization_openai.py +++ b/tests/test_tokenization_openai.py @@ -28,7 +28,7 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = OpenAIGPTTokenizer def setUp(self): - super(OpenAIGPTTokenizationTest, self).setUp() + super().setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = [ diff --git a/tests/test_tokenization_roberta.py b/tests/test_tokenization_roberta.py index 26f71b3275..f9abdea666 100644 --- a/tests/test_tokenization_roberta.py +++ b/tests/test_tokenization_roberta.py @@ -28,7 +28,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = RobertaTokenizer def setUp(self): - super(RobertaTokenizationTest, self).setUp() + super().setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = [ diff --git a/tests/test_tokenization_t5.py b/tests/test_tokenization_t5.py index 061bc88598..793d80ac64 100644 --- a/tests/test_tokenization_t5.py +++ b/tests/test_tokenization_t5.py @@ -31,7 +31,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = T5Tokenizer def setUp(self): - super(T5TokenizationTest, self).setUp() + super().setUp() # We have a SentencePiece fixture for testing tokenizer = T5Tokenizer(SAMPLE_VOCAB) diff --git a/tests/test_tokenization_transfo_xl.py b/tests/test_tokenization_transfo_xl.py index b108db8aea..8d4814699e 100644 --- a/tests/test_tokenization_transfo_xl.py +++ b/tests/test_tokenization_transfo_xl.py @@ -33,7 +33,7 @@ class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = TransfoXLTokenizer if is_torch_available() else None def setUp(self): - super(TransfoXLTokenizationTest, self).setUp() + super().setUp() vocab_tokens = [ "", diff --git a/tests/test_tokenization_xlm.py b/tests/test_tokenization_xlm.py index ff7a005b30..5fd7379388 100644 --- a/tests/test_tokenization_xlm.py +++ b/tests/test_tokenization_xlm.py @@ -29,7 +29,7 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XLMTokenizer def setUp(self): - super(XLMTokenizationTest, self).setUp() + super().setUp() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = [ diff --git a/tests/test_tokenization_xlnet.py b/tests/test_tokenization_xlnet.py index d970f37f69..2fa94bfbc9 100644 --- a/tests/test_tokenization_xlnet.py +++ b/tests/test_tokenization_xlnet.py @@ -31,7 +31,7 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XLNetTokenizer def setUp(self): - super(XLNetTokenizationTest, self).setUp() + super().setUp() # We have a SentencePiece fixture for testing tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) From a98b2ca8c03125af900a62f591681140de329fdd Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 15 Jan 2020 19:05:51 -0500 Subject: [PATCH 2/2] Style + fixup BertJapaneseTokenizer --- src/transformers/modeling_tf_roberta.py | 4 +--- src/transformers/tokenization_bert_japanese.py | 3 ++- src/transformers/tokenization_gpt2.py | 4 +--- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py index 2821236f57..0efe3e0cda 100644 --- a/src/transformers/modeling_tf_roberta.py +++ b/src/transformers/modeling_tf_roberta.py @@ -78,9 +78,7 @@ class TFRobertaEmbeddings(TFBertEmbeddings): else: position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) - return super()._embedding( - [input_ids, position_ids, token_type_ids, inputs_embeds], training=training - ) + return super()._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) class TFRobertaMainLayer(TFBertMainLayer): diff --git a/src/transformers/tokenization_bert_japanese.py b/src/transformers/tokenization_bert_japanese.py index 40a5d535c0..aaf82c54b3 100644 --- a/src/transformers/tokenization_bert_japanese.py +++ b/src/transformers/tokenization_bert_japanese.py @@ -107,7 +107,7 @@ class BertJapaneseTokenizer(BertTokenizer): **subword_tokenizer_type**: (`optional`) string (default "wordpiece") Type of subword tokenizer. """ - super().__init__( + super(BertTokenizer, self).__init__( unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, @@ -115,6 +115,7 @@ class BertJapaneseTokenizer(BertTokenizer): mask_token=mask_token, **kwargs, ) + # ^^ We call the grandparent's init, not the parent's. self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py index d733f6645a..4f2de845b5 100644 --- a/src/transformers/tokenization_gpt2.py +++ b/src/transformers/tokenization_gpt2.py @@ -268,9 +268,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast): truncation_strategy="longest_first", **kwargs ): - super().__init__( - bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs - ) + super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) self._tokenizer = tk.Tokenizer(tk.models.BPE.from_files(vocab_file, merges_file)) self._update_special_tokens()