From 7732d0fe7a759c9844215920e9f1c5540eafb1a6 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Wed, 9 Feb 2022 09:28:57 -0500 Subject: [PATCH] Upgrade black to version ~=22.0 (#15565) * Upgrade black to version ~=22.0 * Check copies * Fix code --- .../run_wav2vec2_pretraining_no_trainer.py | 4 +- .../distillation/run_squad_w_distillation.py | 22 ++++----- .../wav2vec2/run_wav2vec2_pretrain_flax.py | 2 +- .../lxmert/modeling_frcnn.py | 4 +- .../movement-pruning/masked_run_glue.py | 15 +++--- .../movement-pruning/masked_run_squad.py | 28 +++++------ .../bart_onnx/generation_onnx.py | 2 +- examples/research_projects/pplm/run_pplm.py | 2 +- .../visual_bert/modeling_frcnn.py | 4 +- .../wav2vec2/run_pretrain.py | 4 +- setup.py | 4 +- .../commands/add_new_model_like.py | 12 ++--- src/transformers/dependency_versions_table.py | 2 +- src/transformers/generation_beam_search.py | 2 +- src/transformers/generation_flax_utils.py | 4 +- src/transformers/generation_tf_utils.py | 2 +- src/transformers/modeling_tf_utils.py | 2 +- src/transformers/models/bart/modeling_bart.py | 2 +- .../models/bart/modeling_tf_bart.py | 2 +- .../models/bart/tokenization_bart.py | 4 +- .../models/big_bird/modeling_big_bird.py | 2 +- .../models/big_bird/modeling_flax_big_bird.py | 2 +- .../modeling_bigbird_pegasus.py | 2 +- .../models/blenderbot/modeling_blenderbot.py | 2 +- .../blenderbot/modeling_tf_blenderbot.py | 2 +- .../modeling_blenderbot_small.py | 2 +- .../modeling_tf_blenderbot_small.py | 2 +- .../models/byt5/tokenization_byt5.py | 2 +- src/transformers/models/clip/modeling_clip.py | 14 +++--- .../models/clip/modeling_flax_clip.py | 2 +- .../models/clip/modeling_tf_clip.py | 12 ++--- .../models/clip/tokenization_clip.py | 4 +- src/transformers/models/detr/modeling_detr.py | 2 +- src/transformers/models/fsmt/modeling_fsmt.py | 2 +- .../models/funnel/modeling_funnel.py | 6 +-- .../models/funnel/modeling_tf_funnel.py | 6 +-- .../models/gpt2/tokenization_gpt2.py | 4 +- .../models/hubert/modeling_hubert.py | 2 +- .../models/hubert/modeling_tf_hubert.py | 2 +- .../models/ibert/quant_modules.py | 48 +++++++++---------- src/transformers/models/led/modeling_led.py | 2 +- .../models/led/modeling_tf_led.py | 2 +- .../longformer/modeling_tf_longformer.py | 11 ++--- .../models/m2m_100/modeling_m2m_100.py | 2 +- .../models/marian/modeling_marian.py | 2 +- .../models/marian/modeling_tf_marian.py | 2 +- .../models/mbart/modeling_mbart.py | 2 +- .../models/mbart/modeling_tf_mbart.py | 2 +- .../models/pegasus/modeling_pegasus.py | 2 +- .../models/pegasus/modeling_tf_pegasus.py | 2 +- .../models/perceiver/modeling_perceiver.py | 10 ++-- .../perceiver/tokenization_perceiver.py | 2 +- .../models/prophetnet/modeling_prophetnet.py | 4 +- .../models/reformer/modeling_reformer.py | 4 +- .../models/roberta/tokenization_roberta.py | 4 +- src/transformers/models/sew/modeling_sew.py | 2 +- .../feature_extraction_speech_to_text.py | 2 +- .../speech_to_text/modeling_speech_to_text.py | 2 +- .../modeling_tf_speech_to_text.py | 2 +- .../modeling_speech_to_text_2.py | 2 +- src/transformers/models/swin/modeling_swin.py | 4 +- .../models/t5/modeling_flax_t5.py | 16 +++---- src/transformers/models/t5/modeling_t5.py | 6 +-- src/transformers/models/t5/modeling_tf_t5.py | 18 +++---- .../models/tapas/modeling_tapas.py | 2 +- .../transfo_xl/modeling_tf_transfo_xl.py | 8 ++-- .../modeling_tf_transfo_xl_utilities.py | 2 +- .../models/transfo_xl/modeling_transfo_xl.py | 6 +-- .../modeling_transfo_xl_utilities.py | 2 +- .../models/trocr/modeling_trocr.py | 2 +- .../models/unispeech/modeling_unispeech.py | 2 +- .../unispeech_sat/modeling_unispeech_sat.py | 2 +- .../models/vit_mae/modeling_vit_mae.py | 10 ++-- .../models/wav2vec2/modeling_tf_wav2vec2.py | 2 +- .../models/wav2vec2/modeling_wav2vec2.py | 2 +- .../models/wavlm/modeling_wavlm.py | 2 +- src/transformers/models/xglm/modeling_xglm.py | 2 +- .../models/xlm/configuration_xlm.py | 2 +- .../models/xlnet/modeling_tf_xlnet.py | 2 +- .../models/xlnet/modeling_xlnet.py | 2 +- src/transformers/models/yoso/modeling_yoso.py | 4 +- src/transformers/optimization.py | 4 +- src/transformers/pipelines/audio_utils.py | 2 +- tests/test_generation_beam_search.py | 2 +- tests/test_modeling_ibert.py | 2 +- tests/test_modeling_swin.py | 2 +- tests/test_modeling_vit_mae.py | 2 +- tests/test_utils_check_copies.py | 3 +- utils/check_copies.py | 5 +- utils/get_modified_files.py | 2 +- utils/style_doc.py | 5 +- 91 files changed, 208 insertions(+), 225 deletions(-) diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index c7134cb9ea..fdce3a22f4 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -350,7 +350,7 @@ def get_grad_norm(params, scale=1): if p.grad is not None: param_norm = (p.grad.detach().data / scale).norm(2) total_norm += param_norm.item() ** 2 - total_norm = total_norm ** 0.5 + total_norm = total_norm**0.5 return total_norm @@ -619,7 +619,7 @@ def main(): # update gumbel temperature gumbel_temperature = max( - args.max_gumbel_temperature * args.gumbel_temperature_decay ** completed_steps, + args.max_gumbel_temperature * args.gumbel_temperature_decay**completed_steps, args.min_gumbel_temperature, ) if hasattr(model, "module"): diff --git a/examples/research_projects/distillation/run_squad_w_distillation.py b/examples/research_projects/distillation/run_squad_w_distillation.py index e73f8f1c43..ea1f2f46a9 100644 --- a/examples/research_projects/distillation/run_squad_w_distillation.py +++ b/examples/research_projects/distillation/run_squad_w_distillation.py @@ -229,20 +229,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None): assert end_logits_tea.size() == end_logits_stu.size() loss_fct = nn.KLDivLoss(reduction="batchmean") - loss_start = ( - loss_fct( - nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1), - nn.functional.softmax(start_logits_tea / args.temperature, dim=-1), - ) - * (args.temperature ** 2) - ) - loss_end = ( - loss_fct( - nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1), - nn.functional.softmax(end_logits_tea / args.temperature, dim=-1), - ) - * (args.temperature ** 2) - ) + loss_start = loss_fct( + nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1), + nn.functional.softmax(start_logits_tea / args.temperature, dim=-1), + ) * (args.temperature**2) + loss_end = loss_fct( + nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1), + nn.functional.softmax(end_logits_tea / args.temperature, dim=-1), + ) * (args.temperature**2) loss_ce = (loss_start + loss_end) / 2.0 loss = args.alpha_ce * loss_ce + args.alpha_squad * loss diff --git a/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py b/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py index 4911ecb571..e2bcd7861b 100755 --- a/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py +++ b/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py @@ -450,7 +450,7 @@ def main(): negative_indices = batch.pop("sampled_negative_indices") gumbel_temperature = jnp.clip( - model_args.max_gumbel_temperature * model_args.gumbel_temperature_decay ** state.step, + model_args.max_gumbel_temperature * model_args.gumbel_temperature_decay**state.step, a_min=model_args.min_gumbel_temperature, ) diff --git a/examples/research_projects/lxmert/modeling_frcnn.py b/examples/research_projects/lxmert/modeling_frcnn.py index 89f01f4fca..39a0c6aea8 100644 --- a/examples/research_projects/lxmert/modeling_frcnn.py +++ b/examples/research_projects/lxmert/modeling_frcnn.py @@ -1264,7 +1264,7 @@ class Res5ROIHeads(nn.Module): self.feature_strides = {k: v.stride for k, v in input_shape.items()} self.feature_channels = {k: v.channels for k, v in input_shape.items()} self.cls_agnostic_bbox_reg = cfg.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG - self.stage_channel_factor = 2 ** 3 # res5 is 8x res2 + self.stage_channel_factor = 2**3 # res5 is 8x res2 self.out_channels = cfg.RESNETS.RES2_OUT_CHANNELS * self.stage_channel_factor # self.proposal_matcher = Matcher( @@ -1419,7 +1419,7 @@ class AnchorGenerator(nn.Module): anchors = [] for size in sizes: - area = size ** 2.0 + area = size**2.0 for aspect_ratio in aspect_ratios: w = math.sqrt(area / aspect_ratio) h = aspect_ratio * w diff --git a/examples/research_projects/movement-pruning/masked_run_glue.py b/examples/research_projects/movement-pruning/masked_run_glue.py index 85a832a02d..57f795945b 100644 --- a/examples/research_projects/movement-pruning/masked_run_glue.py +++ b/examples/research_projects/movement-pruning/masked_run_glue.py @@ -84,7 +84,7 @@ def schedule_threshold( spars_warmup_steps = initial_warmup * warmup_steps spars_schedu_steps = (final_warmup + initial_warmup) * warmup_steps mul_coeff = 1 - (step - spars_warmup_steps) / (total_step - spars_schedu_steps) - threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff ** 3) + threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff**3) regu_lambda = final_lambda * threshold / final_threshold return threshold, regu_lambda @@ -285,14 +285,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None): attention_mask=inputs["attention_mask"], ) - loss_logits = ( - nn.functional.kl_div( - input=nn.functional.log_softmax(logits_stu / args.temperature, dim=-1), - target=nn.functional.softmax(logits_tea / args.temperature, dim=-1), - reduction="batchmean", - ) - * (args.temperature ** 2) - ) + loss_logits = nn.functional.kl_div( + input=nn.functional.log_softmax(logits_stu / args.temperature, dim=-1), + target=nn.functional.softmax(logits_tea / args.temperature, dim=-1), + reduction="batchmean", + ) * (args.temperature**2) loss = args.alpha_distil * loss_logits + args.alpha_ce * loss diff --git a/examples/research_projects/movement-pruning/masked_run_squad.py b/examples/research_projects/movement-pruning/masked_run_squad.py index 28b963f25d..f1d065f1f4 100644 --- a/examples/research_projects/movement-pruning/masked_run_squad.py +++ b/examples/research_projects/movement-pruning/masked_run_squad.py @@ -88,7 +88,7 @@ def schedule_threshold( spars_warmup_steps = initial_warmup * warmup_steps spars_schedu_steps = (final_warmup + initial_warmup) * warmup_steps mul_coeff = 1 - (step - spars_warmup_steps) / (total_step - spars_schedu_steps) - threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff ** 3) + threshold = final_threshold + (initial_threshold - final_threshold) * (mul_coeff**3) regu_lambda = final_lambda * threshold / final_threshold return threshold, regu_lambda @@ -306,22 +306,16 @@ def train(args, train_dataset, model, tokenizer, teacher=None): attention_mask=inputs["attention_mask"], ) - loss_start = ( - nn.functional.kl_div( - input=nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1), - target=nn.functional.softmax(start_logits_tea / args.temperature, dim=-1), - reduction="batchmean", - ) - * (args.temperature ** 2) - ) - loss_end = ( - nn.functional.kl_div( - input=nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1), - target=nn.functional.softmax(end_logits_tea / args.temperature, dim=-1), - reduction="batchmean", - ) - * (args.temperature ** 2) - ) + loss_start = nn.functional.kl_div( + input=nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1), + target=nn.functional.softmax(start_logits_tea / args.temperature, dim=-1), + reduction="batchmean", + ) * (args.temperature**2) + loss_end = nn.functional.kl_div( + input=nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1), + target=nn.functional.softmax(end_logits_tea / args.temperature, dim=-1), + reduction="batchmean", + ) * (args.temperature**2) loss_logits = (loss_start + loss_end) / 2.0 loss = args.alpha_distil * loss_logits + args.alpha_ce * loss diff --git a/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py b/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py index 59d9c6c092..0ccab4ff10 100644 --- a/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py +++ b/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py @@ -442,7 +442,7 @@ class BeamSearchScorerTS(torch.nn.Module): elif self.do_early_stopping: return True else: - cur_score = best_sum_logprobs / cur_len ** self.length_penalty + cur_score = best_sum_logprobs / cur_len**self.length_penalty ret = self._beam_hyps_worst_scores[hypo_idx].item() >= cur_score return ret diff --git a/examples/research_projects/pplm/run_pplm.py b/examples/research_projects/pplm/run_pplm.py index 4872118433..fdbad60720 100644 --- a/examples/research_projects/pplm/run_pplm.py +++ b/examples/research_projects/pplm/run_pplm.py @@ -550,7 +550,7 @@ def generate_text_pplm( unpert_probs = nn.functional.softmax(unpert_logits[:, -1, :], dim=-1) - pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale)) # + SMALL_CONST + pert_probs = (pert_probs**gm_scale) * (unpert_probs ** (1 - gm_scale)) # + SMALL_CONST pert_probs = top_k_filter(pert_probs, k=top_k, probs=True) # + SMALL_CONST # rescale diff --git a/examples/research_projects/visual_bert/modeling_frcnn.py b/examples/research_projects/visual_bert/modeling_frcnn.py index 89f01f4fca..39a0c6aea8 100644 --- a/examples/research_projects/visual_bert/modeling_frcnn.py +++ b/examples/research_projects/visual_bert/modeling_frcnn.py @@ -1264,7 +1264,7 @@ class Res5ROIHeads(nn.Module): self.feature_strides = {k: v.stride for k, v in input_shape.items()} self.feature_channels = {k: v.channels for k, v in input_shape.items()} self.cls_agnostic_bbox_reg = cfg.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG - self.stage_channel_factor = 2 ** 3 # res5 is 8x res2 + self.stage_channel_factor = 2**3 # res5 is 8x res2 self.out_channels = cfg.RESNETS.RES2_OUT_CHANNELS * self.stage_channel_factor # self.proposal_matcher = Matcher( @@ -1419,7 +1419,7 @@ class AnchorGenerator(nn.Module): anchors = [] for size in sizes: - area = size ** 2.0 + area = size**2.0 for aspect_ratio in aspect_ratios: w = math.sqrt(area / aspect_ratio) h = aspect_ratio * w diff --git a/examples/research_projects/wav2vec2/run_pretrain.py b/examples/research_projects/wav2vec2/run_pretrain.py index 3b570ec156..248f32443f 100755 --- a/examples/research_projects/wav2vec2/run_pretrain.py +++ b/examples/research_projects/wav2vec2/run_pretrain.py @@ -273,11 +273,11 @@ class Wav2Vec2PreTrainer(Trainer): # make sure gumbel softmax temperature is decayed if self.args.n_gpu > 1 or self.deepspeed: model.module.set_gumbel_temperature( - max(self.max_gumbel_temp * self.gumbel_temp_decay ** self.num_update_step, self.min_gumbel_temp) + max(self.max_gumbel_temp * self.gumbel_temp_decay**self.num_update_step, self.min_gumbel_temp) ) else: model.set_gumbel_temperature( - max(self.max_gumbel_temp * self.gumbel_temp_decay ** self.num_update_step, self.min_gumbel_temp) + max(self.max_gumbel_temp * self.gumbel_temp_decay**self.num_update_step, self.min_gumbel_temp) ) return loss.detach() diff --git a/setup.py b/setup.py index dd5401fb71..587ad72451 100644 --- a/setup.py +++ b/setup.py @@ -93,7 +93,7 @@ if stale_egg_info.exists(): # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py _deps = [ "Pillow", - "black==21.4b0", + "black~=22.0", "codecarbon==1.2.0", "cookiecutter==1.7.2", "dataclasses", @@ -166,7 +166,7 @@ _deps = [ # packaging: "packaging" # # some of the values are versioned whereas others aren't. -deps = {b: a for a, b in (re.findall(r"^(([^!=<>]+)(?:[!=<>].*)?$)", x)[0] for x in _deps)} +deps = {b: a for a, b in (re.findall(r"^(([^!=<>~]+)(?:[!=<>~].*)?$)", x)[0] for x in _deps)} # since we save this data in src/transformers/dependency_versions_table.py it can be easily accessed from # anywhere. If you need to quickly access the data from this table in a shell, you can do so easily with: diff --git a/src/transformers/commands/add_new_model_like.py b/src/transformers/commands/add_new_model_like.py index 3ba5d71099..17a958664c 100644 --- a/src/transformers/commands/add_new_model_like.py +++ b/src/transformers/commands/add_new_model_like.py @@ -292,7 +292,7 @@ def replace_model_patterns( attributes_to_check.append("model_type") else: text = re.sub( - fr'(\s*)model_type = "{old_model_patterns.model_type}"', + rf'(\s*)model_type = "{old_model_patterns.model_type}"', r'\1model_type = "[MODEL_TYPE]"', text, ) @@ -301,8 +301,8 @@ def replace_model_patterns( # not the new one. We can't just do a replace in all the text and will need a special regex if old_model_patterns.model_upper_cased == old_model_patterns.model_camel_cased: old_model_value = old_model_patterns.model_upper_cased - if re.search(fr"{old_model_value}_[A-Z_]*[^A-Z_]", text) is not None: - text = re.sub(fr"{old_model_value}([A-Z_]*)([^a-zA-Z_])", r"[MODEL_UPPER_CASED]\1\2", text) + if re.search(rf"{old_model_value}_[A-Z_]*[^A-Z_]", text) is not None: + text = re.sub(rf"{old_model_value}([A-Z_]*)([^a-zA-Z_])", r"[MODEL_UPPER_CASED]\1\2", text) else: attributes_to_check.append("model_upper_cased") @@ -750,8 +750,8 @@ def clean_frameworks_in_init( return remove_pattern = "|".join(to_remove) - re_conditional_imports = re.compile(fr"^\s*if is_({remove_pattern})_available\(\):\s*$") - re_is_xxx_available = re.compile(fr"is_({remove_pattern})_available") + re_conditional_imports = re.compile(rf"^\s*if is_({remove_pattern})_available\(\):\s*$") + re_is_xxx_available = re.compile(rf"is_({remove_pattern})_available") with open(init_file, "r", encoding="utf-8") as f: content = f.read() @@ -831,7 +831,7 @@ def add_model_to_main_init( if framework is not None and frameworks is not None and framework not in frameworks: new_lines.append(lines[idx]) idx += 1 - elif re.search(fr'models.{old_model_patterns.model_lower_cased}( |")', lines[idx]) is not None: + elif re.search(rf'models.{old_model_patterns.model_lower_cased}( |")', lines[idx]) is not None: block = [lines[idx]] indent = find_indent(lines[idx]) idx += 1 diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index b1e08fcd64..d62d8d6701 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -3,7 +3,7 @@ # 2. run `make deps_table_update`` deps = { "Pillow": "Pillow", - "black": "black==21.4b0", + "black": "black~=22.0", "codecarbon": "codecarbon==1.2.0", "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", diff --git a/src/transformers/generation_beam_search.py b/src/transformers/generation_beam_search.py index 3c4f259b00..bf84df2c09 100644 --- a/src/transformers/generation_beam_search.py +++ b/src/transformers/generation_beam_search.py @@ -392,6 +392,6 @@ class BeamHypotheses: elif self.early_stopping: return True else: - cur_score = best_sum_logprobs / cur_len ** self.length_penalty + cur_score = best_sum_logprobs / cur_len**self.length_penalty ret = self.worst_score >= cur_score return ret diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py index a63c7257db..37015f7610 100644 --- a/src/transformers/generation_flax_utils.py +++ b/src/transformers/generation_flax_utils.py @@ -679,7 +679,7 @@ class FlaxGenerationMixin: not_max_length_yet = state.cur_len < max_length # 2. can the new beams still improve? - best_running_score = state.running_scores[:, -1:] / (max_length ** length_penalty) + best_running_score = state.running_scores[:, -1:] / (max_length**length_penalty) worst_finished_score = jnp.where( state.is_sent_finished, jnp.min(state.scores, axis=1, keepdims=True), np.array(-1.0e7) ) @@ -769,7 +769,7 @@ class FlaxGenerationMixin: # - add length penalty # - make sure no scores can be added anymore if beam is full # - make sure still running sequences cannot be chosen as finalized beam - topk_log_probs = topk_log_probs / (state.cur_len ** length_penalty) + topk_log_probs = topk_log_probs / (state.cur_len**length_penalty) beams_in_batch_are_full = ( jnp.broadcast_to(state.is_sent_finished.all(axis=-1, keepdims=True), did_topk_just_finished.shape) & early_stopping diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py index d5de492bda..d10b5817a4 100644 --- a/src/transformers/generation_tf_utils.py +++ b/src/transformers/generation_tf_utils.py @@ -1694,6 +1694,6 @@ class BeamHypotheses(object): elif self.early_stopping: return True else: - cur_score = best_sum_logprobs / cur_len ** self.length_penalty + cur_score = best_sum_logprobs / cur_len**self.length_penalty ret = self.worst_score >= cur_score return ret diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 7a53055634..54f465215f 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -1827,7 +1827,7 @@ class TFSharedEmbeddings(tf.keras.layers.Layer): super().__init__(**kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size - self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range + self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range def build(self, input_shape): """ diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 70edf96cd2..ca6ea2a7e4 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -146,7 +146,7 @@ class BartAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index bfe27588c5..b9abc647ab 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -152,7 +152,7 @@ class TFBartAttention(tf.keras.layers.Layer): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index b41d92d999..a0450fbc20 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -75,10 +75,10 @@ def bytes_to_unicode(): ) cs = bs[:] n = 0 - for b in range(2 ** 8): + for b in range(2**8): if b not in bs: bs.append(b) - cs.append(2 ** 8 + n) + cs.append(2**8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index af05b7adc4..2a7e86aa80 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -297,7 +297,7 @@ class BigBirdEmbeddings(nn.Module): inputs_embeds = self.word_embeddings(input_ids) if self.rescale_embeddings: - inputs_embeds = inputs_embeds * (self.hidden_size ** 0.5) + inputs_embeds = inputs_embeds * (self.hidden_size**0.5) token_type_embeddings = self.token_type_embeddings(token_type_ids) diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py index d03244f404..c926c58b60 100644 --- a/src/transformers/models/big_bird/modeling_flax_big_bird.py +++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py @@ -220,7 +220,7 @@ class FlaxBigBirdEmbeddings(nn.Module): token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4")) if self.config.rescale_embeddings: - inputs_embeds *= self.config.hidden_size ** 0.5 + inputs_embeds *= self.config.hidden_size**0.5 # Sum all embeddings hidden_states = inputs_embeds + token_type_embeddings + position_embeds diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index aa961e0f59..b1e0052b2f 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1219,7 +1219,7 @@ class BigBirdPegasusDecoderAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 7751a74f96..5706c5a4c3 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -148,7 +148,7 @@ class BlenderbotAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index fc70c7c5ab..6d50492062 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -155,7 +155,7 @@ class TFBlenderbotAttention(tf.keras.layers.Layer): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index a22c4d0ce6..8ba0012737 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -146,7 +146,7 @@ class BlenderbotSmallAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index ffb057c6ad..fdf0c63c0a 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -154,7 +154,7 @@ class TFBlenderbotSmallAttention(tf.keras.layers.Layer): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py index 4846b58aa2..77eb34f929 100644 --- a/src/transformers/models/byt5/tokenization_byt5.py +++ b/src/transformers/models/byt5/tokenization_byt5.py @@ -96,7 +96,7 @@ class ByT5Tokenizer(PreTrainedTokenizer): self._extra_ids = extra_ids - self._utf_vocab_size = 2 ** 8 # utf is 8 bits + self._utf_vocab_size = 2**8 # utf is 8 bits # define special tokens dict self.special_tokens_encoder: Dict[int, str] = { diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 27a3cc859b..079676079c 100755 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -177,7 +177,7 @@ class CLIPAttention(nn.Module): assert ( self.head_dim * self.num_heads == self.embed_dim ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." - self.scale = self.head_dim ** -0.5 + self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) @@ -348,13 +348,13 @@ class CLIPPreTrainedModel(PreTrainedModel): module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) elif isinstance(module, CLIPVisionEmbeddings): factor = self.config.initializer_factor - nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim ** -0.5 * factor) + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) elif isinstance(module, CLIPAttention): factor = self.config.initializer_factor - in_proj_std = (module.embed_dim ** -0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor - out_proj_std = (module.embed_dim ** -0.5) * factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor nn.init.normal_(module.q_proj.weight, std=in_proj_std) nn.init.normal_(module.k_proj.weight, std=in_proj_std) nn.init.normal_(module.v_proj.weight, std=in_proj_std) @@ -362,7 +362,7 @@ class CLIPPreTrainedModel(PreTrainedModel): elif isinstance(module, CLIPMLP): factor = self.config.initializer_factor in_proj_std = ( - (module.config.hidden_size ** -0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor ) fc_std = (2 * module.config.hidden_size) ** -0.5 * factor nn.init.normal_(module.fc1.weight, std=fc_std) @@ -370,11 +370,11 @@ class CLIPPreTrainedModel(PreTrainedModel): elif isinstance(module, CLIPModel): nn.init.normal_( module.text_projection.weight, - std=module.text_embed_dim ** -0.5 * self.config.initializer_factor, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, ) nn.init.normal_( module.visual_projection.weight, - std=module.vision_embed_dim ** -0.5 * self.config.initializer_factor, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, ) if isinstance(module, nn.LayerNorm): diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py index 0a0dbeb433..237d29fe71 100644 --- a/src/transformers/models/clip/modeling_flax_clip.py +++ b/src/transformers/models/clip/modeling_flax_clip.py @@ -263,7 +263,7 @@ class FlaxCLIPAttention(nn.Module): assert ( self.head_dim * self.num_heads == self.embed_dim ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." - self.scale = self.head_dim ** -0.5 + self.scale = self.head_dim**-0.5 self.dropout = self.config.attention_dropout self.k_proj = nn.Dense(self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(0.01)) diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index 36ca1bea9e..3a1621ba9d 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -156,7 +156,7 @@ class TFCLIPVisionEmbeddings(tf.keras.layers.Layer): self.class_embedding = self.add_weight( shape=(self.embed_dim,), - initializer=get_initializer(self.embed_dim ** -0.5 * factor), + initializer=get_initializer(self.embed_dim**-0.5 * factor), trainable=True, name="class_embedding", ) @@ -270,8 +270,8 @@ class TFCLIPAttention(tf.keras.layers.Layer): ) factor = config.initializer_factor - in_proj_std = (self.embed_dim ** -0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor - out_proj_std = (self.embed_dim ** -0.5) * factor + in_proj_std = (self.embed_dim**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (self.embed_dim**-0.5) * factor self.sqrt_att_head_size = math.sqrt(self.attention_head_size) @@ -360,7 +360,7 @@ class TFCLIPMLP(tf.keras.layers.Layer): self.activation_fn = get_tf_activation(config.hidden_act) factor = config.initializer_factor - in_proj_std = (config.hidden_size ** -0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor + in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) * factor fc_std = (2 * config.hidden_size) ** -0.5 * factor self.fc1 = tf.keras.layers.Dense( @@ -753,14 +753,14 @@ class TFCLIPMainLayer(tf.keras.layers.Layer): self.visual_projection = tf.keras.layers.Dense( units=self.projection_dim, - kernel_initializer=get_initializer(vision_config.hidden_size ** -0.5 * self.config.initializer_factor), + kernel_initializer=get_initializer(vision_config.hidden_size**-0.5 * self.config.initializer_factor), use_bias=False, name="visual_projection", ) self.text_projection = tf.keras.layers.Dense( units=self.projection_dim, - kernel_initializer=get_initializer(text_config.hidden_size ** -0.5 * self.config.initializer_factor), + kernel_initializer=get_initializer(text_config.hidden_size**-0.5 * self.config.initializer_factor), use_bias=False, name="text_projection", ) diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py index c81e5aa4ba..01a2a75500 100644 --- a/src/transformers/models/clip/tokenization_clip.py +++ b/src/transformers/models/clip/tokenization_clip.py @@ -68,10 +68,10 @@ def bytes_to_unicode(): ) cs = bs[:] n = 0 - for b in range(2 ** 8): + for b in range(2**8): if b not in bs: bs.append(b) - cs.append(2 ** 8 + n) + cs.append(2**8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 334f07d382..e337d401aa 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -488,7 +488,7 @@ class DetrAttention(nn.Module): assert ( self.head_dim * num_heads == self.embed_dim ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index bb7ba9bd3c..2efc46e6d1 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -823,7 +823,7 @@ class Attention(nn.Module): self.dropout = dropout self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.encoder_decoder_attention = encoder_decoder_attention self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index 580ed0752e..f62bff03fc 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -278,7 +278,7 @@ class FunnelAttentionStructure(nn.Module): # Second type pos = pooled_pos - stride = 2 ** block_index + stride = 2**block_index rel_pos = self.relative_pos(pos, stride) rel_pos = rel_pos[:, None] + zero_offset @@ -297,7 +297,7 @@ class FunnelAttentionStructure(nn.Module): # the previous block of the 1st real block. Since the 1st real # block always has position 1, the position of the previous block # will be at `1 - 2 ** block_index`. - cls_pos = pos_id.new_tensor([-(2 ** block_index) + 1]) + cls_pos = pos_id.new_tensor([-(2**block_index) + 1]) pooled_pos_id = pos_id[1:-1] if self.config.truncate_seq else pos_id[1:] return torch.cat([cls_pos, pooled_pos_id[::2]], 0) else: @@ -454,7 +454,7 @@ class FunnelRelMultiheadAttention(nn.Module): self.post_proj = nn.Linear(n_head * d_head, d_model) self.layer_norm = nn.LayerNorm(d_model, eps=config.layer_norm_eps) - self.scale = 1.0 / (d_head ** 0.5) + self.scale = 1.0 / (d_head**0.5) def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None): """Relative attention score for the positional encodings""" diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index b72040801e..b3d9a8506e 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -231,7 +231,7 @@ class TFFunnelAttentionStructure: # Second type pos = pooled_pos - stride = 2 ** block_index + stride = 2**block_index rel_pos = self.relative_pos(pos, stride) # rel_pos = tf.expand_dims(rel_pos,1) + zero_offset @@ -252,7 +252,7 @@ class TFFunnelAttentionStructure: # the previous block of the 1st real block. Since the 1st real # block always has position 1, the position of the previous block # will be at `1 - 2 ** block_index`. - cls_pos = tf.constant([-(2 ** block_index) + 1], dtype=pos_id.dtype) + cls_pos = tf.constant([-(2**block_index) + 1], dtype=pos_id.dtype) pooled_pos_id = pos_id[1:-1] if self.truncate_seq else pos_id[1:] return tf.concat([cls_pos, pooled_pos_id[::2]], 0) else: @@ -400,7 +400,7 @@ class TFFunnelRelMultiheadAttention(tf.keras.layers.Layer): self.post_proj = tf.keras.layers.Dense(d_model, kernel_initializer=initializer, name="post_proj") self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") - self.scale = 1.0 / (d_head ** 0.5) + self.scale = 1.0 / (d_head**0.5) def build(self, input_shape): n_head, d_head, d_model = self.n_head, self.d_head, self.d_model diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py index 0c4cf69f77..6a6f49b1f9 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2.py +++ b/src/transformers/models/gpt2/tokenization_gpt2.py @@ -78,10 +78,10 @@ def bytes_to_unicode(): ) cs = bs[:] n = 0 - for b in range(2 ** 8): + for b in range(2**8): if b not in bs: bs.append(b) - cs.append(2 ** 8 + n) + cs.append(2**8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 928a742dee..9714ddbf5f 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -418,7 +418,7 @@ class HubertAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index a3a34b5e7f..4261054ef5 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -741,7 +741,7 @@ class TFHubertAttention(tf.keras.layers.Layer): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") diff --git a/src/transformers/models/ibert/quant_modules.py b/src/transformers/models/ibert/quant_modules.py index 66990d5b11..e6eab6ce62 100644 --- a/src/transformers/models/ibert/quant_modules.py +++ b/src/transformers/models/ibert/quant_modules.py @@ -327,16 +327,16 @@ class IntGELU(nn.Module): def int_erf(self, x_int, scaling_factor): b_int = torch.floor(self.coeff[1] / scaling_factor) - c_int = torch.floor(self.coeff[2] / scaling_factor ** 2) + c_int = torch.floor(self.coeff[2] / scaling_factor**2) sign = torch.sign(x_int) abs_int = torch.min(torch.abs(x_int), -b_int) y_int = sign * ((abs_int + b_int) ** 2 + c_int) - scaling_factor = scaling_factor ** 2 * self.coeff[0] + scaling_factor = scaling_factor**2 * self.coeff[0] # avoid overflow - y_int = floor_ste.apply(y_int / 2 ** self.const) - scaling_factor = scaling_factor * 2 ** self.const + y_int = floor_ste.apply(y_int / 2**self.const) + scaling_factor = scaling_factor * 2**self.const return y_int, scaling_factor @@ -388,9 +388,9 @@ class IntSoftmax(nn.Module): def int_polynomial(self, x_int, scaling_factor): with torch.no_grad(): b_int = torch.floor(self.coef[1] / scaling_factor) - c_int = torch.floor(self.coef[2] / scaling_factor ** 2) + c_int = torch.floor(self.coef[2] / scaling_factor**2) z = (x_int + b_int) * x_int + c_int - scaling_factor = self.coef[0] * scaling_factor ** 2 + scaling_factor = self.coef[0] * scaling_factor**2 return z, scaling_factor def int_exp(self, x_int, scaling_factor): @@ -402,7 +402,7 @@ class IntSoftmax(nn.Module): r = x_int - x0_int * q exp_int, exp_scaling_factor = self.int_polynomial(r, scaling_factor) exp_int = torch.clamp(floor_ste.apply(exp_int * 2 ** (self.const - q)), min=0) - scaling_factor = exp_scaling_factor / 2 ** self.const + scaling_factor = exp_scaling_factor / 2**self.const return exp_int, scaling_factor def forward(self, x, scaling_factor): @@ -420,9 +420,9 @@ class IntSoftmax(nn.Module): exp_int = exp / exp_scaling_factor exp_int_sum = exp_int.sum(dim=-1, keepdim=True) - factor = floor_ste.apply(2 ** self.max_bit / exp_int_sum) + factor = floor_ste.apply(2**self.max_bit / exp_int_sum) exp_int = floor_ste.apply(exp_int * factor / 2 ** (self.max_bit - self.output_bit)) - scaling_factor = 1 / 2 ** self.output_bit + scaling_factor = 1 / 2**self.output_bit return exp_int * scaling_factor, scaling_factor @@ -460,9 +460,9 @@ class IntLayerNorm(nn.Module): def set_shift(self, y_int): with torch.no_grad(): - y_sq_int = y_int ** 2 + y_sq_int = y_int**2 var_int = torch.sum(y_sq_int, axis=2, keepdim=True) - shift = (torch.log2(torch.sqrt(var_int / 2 ** self.max_bit)).ceil()).max() + shift = (torch.log2(torch.sqrt(var_int / 2**self.max_bit)).ceil()).max() shift_old = self.shift self.shift = torch.max(self.shift, shift) logger.info(f"Dynamic shift adjustment: {int(shift_old)} -> {int(self.shift)}") @@ -473,8 +473,8 @@ class IntLayerNorm(nn.Module): to avoid overflow in the subsequent runs. """ self.set_shift(y_int) # adjusts `self.shift` - y_int_shifted = floor_ste.apply(y_int / 2 ** self.shift) - y_sq_int = y_int_shifted ** 2 + y_int_shifted = floor_ste.apply(y_int / 2**self.shift) + y_sq_int = y_int_shifted**2 var_int = torch.sum(y_sq_int, axis=2, keepdim=True) return var_int @@ -482,7 +482,7 @@ class IntLayerNorm(nn.Module): if not self.quant_mode: mean = x.mean(axis=2, keepdim=True) y = x - mean - var = torch.mean(y ** 2, axis=2, keepdim=True) + var = torch.mean(y**2, axis=2, keepdim=True) x = y / torch.sqrt(self.eps + var) x = x * self.weight + self.bias return x, None @@ -496,25 +496,25 @@ class IntLayerNorm(nn.Module): x_int = x / scaling_factor mean_int = round_ste.apply(x_int.mean(axis=2, keepdim=True)) y_int = x_int - mean_int - y_int_shifted = floor_ste.apply(y_int / 2 ** self.shift) - y_sq_int = y_int_shifted ** 2 + y_int_shifted = floor_ste.apply(y_int / 2**self.shift) + y_sq_int = y_int_shifted**2 var_int = torch.sum(y_sq_int, axis=2, keepdim=True) # overflow handling in training time if self.training: # if overflow is detected - if var_int.max() >= 2 ** self.max_bit: + if var_int.max() >= 2**self.max_bit: var_int = self.overflow_fallback(y_int) - assert var_int.max() < 2 ** self.max_bit + 0.1, ( + assert var_int.max() < 2**self.max_bit + 0.1, ( "Error detected in overflow handling: " "`var_int` exceeds `self.max_bit` (the maximum possible bit width)" ) # To be replaced with integer-sqrt kernel that produces the same output - std_int = floor_ste.apply(torch.sqrt(var_int)) * 2 ** self.shift - factor = floor_ste.apply(2 ** 31 / std_int) + std_int = floor_ste.apply(torch.sqrt(var_int)) * 2**self.shift + factor = floor_ste.apply(2**31 / std_int) y_int = floor_ste.apply(y_int * factor / 2) - scaling_factor = self.dim_sqrt / 2 ** 30 + scaling_factor = self.dim_sqrt / 2**30 # scaling and shifting bias = self.bias.data.detach() / (self.weight.data.detach()) @@ -725,7 +725,7 @@ def batch_frexp(inputs, max_bit=31): tmp_m = [] for m in output_m: int_m_shifted = int( - decimal.Decimal(m * (2 ** max_bit)).quantize(decimal.Decimal("1"), rounding=decimal.ROUND_HALF_UP) + decimal.Decimal(m * (2**max_bit)).quantize(decimal.Decimal("1"), rounding=decimal.ROUND_HALF_UP) ) tmp_m.append(int_m_shifted) output_m = np.array(tmp_m) @@ -796,7 +796,7 @@ class FixedPointMul(Function): m, e = batch_frexp(new_scale) output = z_int.type(torch.double) * m.type(torch.double) - output = torch.round(output / (2.0 ** e)) + output = torch.round(output / (2.0**e)) if identity is not None: # needs addition of identity activation @@ -809,7 +809,7 @@ class FixedPointMul(Function): m1, e1 = batch_frexp(new_scale) output1 = wx_int.type(torch.double) * m1.type(torch.double) - output1 = torch.round(output1 / (2.0 ** e1)) + output1 = torch.round(output1 / (2.0**e1)) output = output1 + output diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index e775fd35c9..7cc926275b 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -766,7 +766,7 @@ class LEDDecoderAttention(nn.Module): assert ( self.head_dim * num_heads == self.embed_dim ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index f273148ac9..2064a50431 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -998,7 +998,7 @@ class TFLEDDecoderAttention(tf.keras.layers.Layer): self.dropout = tf.keras.layers.Dropout(dropout) self.head_dim = embed_dim // num_heads assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index cf661a6081..da34d11b80 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -405,13 +405,10 @@ def _compute_global_attention_mask(input_ids_shape, sep_token_indices, before_se else: # last token is separation token and should not be counted and in the middle are two separation tokens question_end_index = tf.tile(question_end_index + 1, (1, input_ids_shape[1])) - attention_mask = ( - tf.cast( - attention_mask > question_end_index, - dtype=question_end_index.dtype, - ) - * tf.cast(attention_mask < input_ids_shape[-1], dtype=question_end_index.dtype) - ) + attention_mask = tf.cast( + attention_mask > question_end_index, + dtype=question_end_index.dtype, + ) * tf.cast(attention_mask < input_ids_shape[-1], dtype=question_end_index.dtype) return attention_mask diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index cb5bbaea9a..569d5f2dc0 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -217,7 +217,7 @@ class M2M100Attention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 20cbd21f76..3d1e5bd85a 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -163,7 +163,7 @@ class MarianAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index dad78bb3f0..be9be08fb1 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -194,7 +194,7 @@ class TFMarianAttention(tf.keras.layers.Layer): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 3e747b4b1e..91a0289a6c 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -152,7 +152,7 @@ class MBartAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index 91f9c003de..f98408f8e1 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -154,7 +154,7 @@ class TFMBartAttention(tf.keras.layers.Layer): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 5eed41254e..ad6f74e00f 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -163,7 +163,7 @@ class PegasusAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index 8539546836..86f922e7bb 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -195,7 +195,7 @@ class TFPegasusAttention(tf.keras.layers.Layer): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py index f372d09a3e..8668315141 100755 --- a/src/transformers/models/perceiver/modeling_perceiver.py +++ b/src/transformers/models/perceiver/modeling_perceiver.py @@ -824,7 +824,7 @@ class PerceiverModel(PerceiverPreTrainedModel): ... project_pos_dim=256, ... trainable_position_encoding_kwargs=dict( ... num_channels=256, - ... index_dims=config.image_size ** 2, + ... index_dims=config.image_size**2, ... ), ... ) @@ -1205,7 +1205,7 @@ class PerceiverForImageClassificationLearned(PerceiverPreTrainedModel): def __init__(self, config): super().__init__(config) - trainable_position_encoding_kwargs_preprocessor = dict(num_channels=256, index_dims=config.image_size ** 2) + trainable_position_encoding_kwargs_preprocessor = dict(num_channels=256, index_dims=config.image_size**2) trainable_position_encoding_kwargs_decoder = dict(num_channels=config.d_latents, index_dims=1) self.num_labels = config.num_labels @@ -2485,7 +2485,7 @@ def space_to_depth(frames: torch.Tensor, temporal_block_size: int = 1, spatial_b batch_size, height // spatial_block_size, width // spatial_block_size, - (spatial_block_size ** 2) * num_channels, + (spatial_block_size**2) * num_channels, ) return frames elif len(frames.shape) == 5: @@ -2509,7 +2509,7 @@ def space_to_depth(frames: torch.Tensor, temporal_block_size: int = 1, spatial_b time // temporal_block_size, height // spatial_block_size, width // spatial_block_size, - temporal_block_size * (spatial_block_size ** 2) * num_channels, + temporal_block_size * (spatial_block_size**2) * num_channels, ) return frames else: @@ -3059,7 +3059,7 @@ class PerceiverImagePreprocessor(AbstractPreprocessor): if self.conv_after_patching: inp_dim = self.out_channels else: - inp_dim = self.in_channels * self.spatial_downsample ** 2 + inp_dim = self.in_channels * self.spatial_downsample**2 if is_temporal: inp_dim *= self.temporal_downsample diff --git a/src/transformers/models/perceiver/tokenization_perceiver.py b/src/transformers/models/perceiver/tokenization_perceiver.py index 644ec58ef7..958d8a9c1d 100644 --- a/src/transformers/models/perceiver/tokenization_perceiver.py +++ b/src/transformers/models/perceiver/tokenization_perceiver.py @@ -87,7 +87,7 @@ class PerceiverTokenizer(PreTrainedTokenizer): **kwargs, ) - self._utf_vocab_size = 2 ** 8 # utf is 8 bits + self._utf_vocab_size = 2**8 # utf is 8 bits # define special tokens dict self.special_tokens_encoder: Dict[str, int] = { diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 3ca88952c5..ee3edebe06 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -674,7 +674,7 @@ class ProphetNetAttention(nn.Module): ], f"Size of hidden states should be {batch_size, tgt_len, hidden_size}, but is {hidden_states.size()}" # previous time steps are cached - no need to recompute key and value if they are static - query_states = self.query_proj(hidden_states) / (self.head_dim ** 0.5) + query_states = self.query_proj(hidden_states) / (self.head_dim**0.5) if is_cross_attention and past_key_value is not None: # reuse k,v, cross_attentions @@ -855,7 +855,7 @@ class ProphetNetNgramSelfAttention(nn.Module): value_states = self.value_proj(hidden_states) # normalize - query_states = query_states / (self.head_dim ** 0.5) + query_states = query_states / (self.head_dim**0.5) # reshape query_states = self._shape(query_states, ngram_sequence_length, batch_size) diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index 3f42d74ca2..56083bfd21 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -700,7 +700,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin): # `num_buckets` should be set to 2 * sequence_length // chunk_length as recommended in paper num_buckets_pow_2 = (2 * (sequence_length // self.chunk_length)).bit_length() - 1 # make sure buckets are power of 2 - num_buckets = 2 ** num_buckets_pow_2 + num_buckets = 2**num_buckets_pow_2 # factorize `num_buckets` if `num_buckets` becomes too large num_buckets_limit = 2 * max( @@ -966,7 +966,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin): """ length normalization """ - variance = torch.mean(x ** 2, -1, keepdim=True) + variance = torch.mean(x**2, -1, keepdim=True) norm_x = x * torch.rsqrt(variance + epsilon) return norm_x diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py index a48e1520c8..0d87615c15 100644 --- a/src/transformers/models/roberta/tokenization_roberta.py +++ b/src/transformers/models/roberta/tokenization_roberta.py @@ -77,10 +77,10 @@ def bytes_to_unicode(): ) cs = bs[:] n = 0 - for b in range(2 ** 8): + for b in range(2**8): if b not in bs: bs.append(b) - cs.append(2 ** 8 + n) + cs.append(2**8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 6fd0a1861f..b0a0a1d7d7 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -420,7 +420,7 @@ class SEWAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py index 8b621af685..0f0dd8a123 100644 --- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py +++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py @@ -86,7 +86,7 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor): Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs and hence the waveform should not be normalized before feature extraction. """ - waveform = waveform * (2 ** 15) # Kaldi compliance: 16-bit signed integers + waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers waveform = torch.from_numpy(waveform).unsqueeze(0) features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate) return features.numpy() diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 85ef5651df..fe09834cc6 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -230,7 +230,7 @@ class Speech2TextAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index a6876bf9ee..7c69684e06 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -256,7 +256,7 @@ class TFSpeech2TextAttention(tf.keras.layers.Layer): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index 4effd1354a..0757099699 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -170,7 +170,7 @@ class Speech2Text2Attention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 8655371525..2266750647 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -544,8 +544,8 @@ class SwinEncoder(nn.Module): [ SwinLayer( config=config, - dim=int(config.embed_dim * 2 ** i_layer), - input_resolution=(grid_size[0] // (2 ** i_layer), grid_size[1] // (2 ** i_layer)), + dim=int(config.embed_dim * 2**i_layer), + input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)), depth=config.depths[i_layer], num_heads=config.num_heads[i_layer], drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], diff --git a/src/transformers/models/t5/modeling_flax_t5.py b/src/transformers/models/t5/modeling_flax_t5.py index 015c698f42..632c41b319 100644 --- a/src/transformers/models/t5/modeling_flax_t5.py +++ b/src/transformers/models/t5/modeling_flax_t5.py @@ -92,8 +92,8 @@ class FlaxT5DenseReluDense(nn.Module): dtype: jnp.dtype = jnp.float32 def setup(self): - wi_init_std = self.config.initializer_factor * (self.config.d_model ** -0.5) - wo_init_std = self.config.initializer_factor * (self.config.d_ff ** -0.5) + wi_init_std = self.config.initializer_factor * (self.config.d_model**-0.5) + wo_init_std = self.config.initializer_factor * (self.config.d_ff**-0.5) self.wi = nn.Dense( self.config.d_ff, @@ -122,8 +122,8 @@ class FlaxT5DenseGatedGeluDense(nn.Module): dtype: jnp.dtype = jnp.float32 # the dtype of the computation def setup(self): - wi_init_std = self.config.initializer_factor * (self.config.d_model ** -0.5) - wo_init_std = self.config.initializer_factor * (self.config.d_ff ** -0.5) + wi_init_std = self.config.initializer_factor * (self.config.d_model**-0.5) + wo_init_std = self.config.initializer_factor * (self.config.d_ff**-0.5) self.wi_0 = nn.Dense( self.config.d_ff, @@ -194,8 +194,8 @@ class FlaxT5Attention(nn.Module): self.inner_dim = self.n_heads * self.key_value_proj_dim q_init_std = self.config.initializer_factor * ((self.inner_dim * self.key_value_proj_dim) ** -0.5) - kv_init_std = self.config.initializer_factor * (self.inner_dim ** -0.5) - o_init_std = self.config.initializer_factor * (self.inner_dim ** -0.5) + kv_init_std = self.config.initializer_factor * (self.inner_dim**-0.5) + o_init_std = self.config.initializer_factor * (self.inner_dim**-0.5) self.q = nn.Dense( self.inner_dim, @@ -1434,7 +1434,7 @@ class FlaxT5ForConditionalGenerationModule(nn.Module): if self.config.tie_word_embeddings: # Rescale output before projecting on vocab # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - sequence_output = sequence_output * (self.model_dim ** -0.5) + sequence_output = sequence_output * (self.model_dim**-0.5) if self.config.tie_word_embeddings: shared_embedding = self.shared.variables["params"]["embedding"] @@ -1542,7 +1542,7 @@ class FlaxT5ForConditionalGeneration(FlaxT5PreTrainedModel): if self.config.tie_word_embeddings: # Rescale output before projecting on vocab # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - sequence_output = sequence_output * (self.config.d_model ** -0.5) + sequence_output = sequence_output * (self.config.d_model**-0.5) if self.config.tie_word_embeddings: shared_embedding = module.shared.variables["params"]["embedding"] diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 58fed6e453..3af2a53de2 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -771,8 +771,8 @@ class T5PreTrainedModel(PreTrainedModel): key_value_proj_dim = self.config.d_kv n_heads = self.config.num_heads module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5)) - module.k.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) - module.v.weight.data.normal_(mean=0.0, std=factor * (d_model ** -0.5)) + module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5)) module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5)) if module.has_relative_attention_bias: module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) @@ -1639,7 +1639,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): if self.config.tie_word_embeddings: # Rescale output before projecting on vocab # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 - sequence_output = sequence_output * (self.model_dim ** -0.5) + sequence_output = sequence_output * (self.model_dim**-0.5) lm_logits = self.lm_head(sequence_output) diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index 372f6cf132..c50c8d13bc 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -94,10 +94,10 @@ class TFT5DenseReluDense(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) wi_initializer = tf.keras.initializers.RandomNormal( - mean=0, stddev=config.initializer_factor * (config.d_model ** -0.5) + mean=0, stddev=config.initializer_factor * (config.d_model**-0.5) ) wo_initializer = tf.keras.initializers.RandomNormal( - mean=0, stddev=config.initializer_factor * (config.d_ff ** -0.5) + mean=0, stddev=config.initializer_factor * (config.d_ff**-0.5) ) self.wi = tf.keras.layers.Dense( config.d_ff, use_bias=False, name="wi", kernel_initializer=wi_initializer @@ -120,10 +120,10 @@ class TFT5GatedGeluDense(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) wi_initializer = tf.keras.initializers.RandomNormal( - mean=0, stddev=config.initializer_factor * (config.d_model ** -0.5) + mean=0, stddev=config.initializer_factor * (config.d_model**-0.5) ) wo_initializer = tf.keras.initializers.RandomNormal( - mean=0, stddev=config.initializer_factor * (config.d_ff ** -0.5) + mean=0, stddev=config.initializer_factor * (config.d_ff**-0.5) ) self.wi_0 = tf.keras.layers.Dense( config.d_ff, use_bias=False, name="wi_0", kernel_initializer=wi_initializer @@ -189,16 +189,16 @@ class TFT5Attention(tf.keras.layers.Layer): mean=0, stddev=config.initializer_factor * ((self.inner_dim * self.key_value_proj_dim) ** -0.5) ) k_initializer = tf.keras.initializers.RandomNormal( - mean=0, stddev=config.initializer_factor * (self.inner_dim ** -0.5) + mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5) ) v_initializer = tf.keras.initializers.RandomNormal( - mean=0, stddev=config.initializer_factor * (self.inner_dim ** -0.5) + mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5) ) o_initializer = tf.keras.initializers.RandomNormal( - mean=0, stddev=config.initializer_factor * (self.inner_dim ** -0.5) + mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5) ) self.relative_attention_bias_initializer = tf.keras.initializers.RandomNormal( - mean=0, stddev=config.initializer_factor * (self.inner_dim ** -0.5) + mean=0, stddev=config.initializer_factor * (self.inner_dim**-0.5) ) self.q = tf.keras.layers.Dense( @@ -1472,7 +1472,7 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling # T5v1.1 does not tie output word embeddings and thus does not require downscaling if self.config.tie_word_embeddings: - sequence_output = sequence_output * (self.model_dim ** -0.5) + sequence_output = sequence_output * (self.model_dim**-0.5) logits = self.shared(sequence_output, mode="linear") else: logits = self.lm_head(sequence_output) diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 3d8ec1bc5e..27d4cf1784 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -2365,7 +2365,7 @@ def _calculate_expected_result( # PyTorch does not currently support Huber loss with custom delta so we define it ourself def huber_loss(input, target, delta: float = 1.0): errors = torch.abs(input - target) # shape (batch_size,) - return torch.where(errors < delta, 0.5 * errors ** 2, errors * delta - (0.5 * delta ** 2)) + return torch.where(errors < delta, 0.5 * errors**2, errors * delta - (0.5 * delta**2)) def _calculate_regression_loss( diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py index f3230f03aa..ab8fb6f11b 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py @@ -149,7 +149,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm") - self.scale = 1 / (d_head ** 0.5) + self.scale = 1 / (d_head**0.5) self.pre_lnorm = pre_lnorm @@ -350,7 +350,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): self.div_val = div_val self.d_proj = d_proj - self.emb_scale = d_proj ** 0.5 + self.emb_scale = d_proj**0.5 self.cutoff_ends = [0] + self.cutoffs @@ -362,7 +362,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] - d_emb_i = d_embed // (div_val ** i) + d_emb_i = d_embed // (div_val**i) self.emb_layers.append( TFTransfoEmbeddings( r_idx - l_idx, @@ -374,7 +374,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): def build(self, input_shape): for i in range(len(self.cutoffs)): - d_emb_i = self.d_embed // (self.div_val ** i) + d_emb_i = self.d_embed // (self.div_val**i) self.emb_projs.append( self.add_weight( shape=(d_emb_i, self.d_proj), diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py index 699e278583..53eb8239a5 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py @@ -80,7 +80,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] - d_emb_i = self.d_embed // (self.div_val ** i) + d_emb_i = self.d_embed // (self.div_val**i) weight = self.add_weight( shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name=f"out_projs_._{i}" diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py index 25823dafac..6ba9903b9c 100644 --- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py @@ -259,7 +259,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) - self.scale = 1 / (d_head ** 0.5) + self.scale = 1 / (d_head**0.5) self.pre_lnorm = pre_lnorm @@ -412,7 +412,7 @@ class AdaptiveEmbedding(nn.Module): self.div_val = div_val self.d_proj = d_proj - self.emb_scale = d_proj ** 0.5 + self.emb_scale = d_proj**0.5 self.cutoff_ends = [0] + self.cutoffs @@ -425,7 +425,7 @@ class AdaptiveEmbedding(nn.Module): else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] - d_emb_i = d_embed // (div_val ** i) + d_emb_i = d_embed // (div_val**i) self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i)) self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py b/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py index dad3183a9b..b25dc2d707 100644 --- a/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py @@ -60,7 +60,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module): else: for i in range(len(self.cutoffs)): l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] - d_emb_i = d_embed // (div_val ** i) + d_emb_i = d_embed // (div_val**i) self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index 830fd14638..9b63cf393a 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -185,7 +185,7 @@ class TrOCRAttention(nn.Module): raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias) diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 768fbfc609..33ba5d9685 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -484,7 +484,7 @@ class UniSpeechAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index ddc98c8001..028387fec0 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -523,7 +523,7 @@ class UniSpeechSatAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py index 1eef6111b8..53085ac55b 100755 --- a/src/transformers/models/vit_mae/modeling_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_vit_mae.py @@ -192,7 +192,7 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): omega = np.arange(embed_dim // 2, dtype=np.float) omega /= embed_dim / 2.0 - omega = 1.0 / 10000 ** omega # (D/2,) + omega = 1.0 / 10000**omega # (D/2,) pos = pos.reshape(-1) # (M,) out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product @@ -231,7 +231,7 @@ class ViTMAEEmbeddings(nn.Module): def initialize_weights(self): # initialize (and freeze) position embeddings by sin-cos embedding pos_embed = get_2d_sincos_pos_embed( - self.position_embeddings.shape[-1], int(self.patch_embeddings.num_patches ** 0.5), add_cls_token=True + self.position_embeddings.shape[-1], int(self.patch_embeddings.num_patches**0.5), add_cls_token=True ) self.position_embeddings.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) @@ -741,7 +741,7 @@ class ViTMAEDecoder(nn.Module): self.decoder_norm = nn.LayerNorm(config.decoder_hidden_size) self.decoder_pred = nn.Linear( - config.decoder_hidden_size, config.patch_size ** 2 * config.num_channels, bias=True + config.decoder_hidden_size, config.patch_size**2 * config.num_channels, bias=True ) # encoder to decoder self.gradient_checkpointing = False self.config = config @@ -750,7 +750,7 @@ class ViTMAEDecoder(nn.Module): def initialize_weights(self, num_patches): # initialize (and freeze) position embeddings by sin-cos embedding decoder_pos_embed = get_2d_sincos_pos_embed( - self.decoder_pos_embed.shape[-1], int(num_patches ** 0.5), add_cls_token=True + self.decoder_pos_embed.shape[-1], int(num_patches**0.5), add_cls_token=True ) self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0)) @@ -861,7 +861,7 @@ class ViTMAEForPreTraining(ViTMAEPreTrainedModel): h = w = imgs.shape[2] // p x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p)) x = torch.einsum("nchpwq->nhwpqc", x) - x = x.reshape(shape=(imgs.shape[0], h * w, p ** 2 * 3)) + x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3)) return x def unpatchify(self, x): diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index a493ee1ebf..f31e0e0134 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -770,7 +770,7 @@ class TFWav2Vec2Attention(tf.keras.layers.Layer): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = tf.keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index f45c0a8e68..cdbe9fd13c 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -566,7 +566,7 @@ class Wav2Vec2Attention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index 2618ec9bbc..08e789b700 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -486,7 +486,7 @@ class WavLMAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.k_proj = nn.Linear(embed_dim, embed_dim) self.v_proj = nn.Linear(embed_dim, embed_dim) diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index c7f14ed9d4..2d7f748b4e 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -261,7 +261,7 @@ class XGLMAttention(nn.Module): f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" f" and `num_heads`: {num_heads})." ) - self.scaling = self.head_dim ** -0.5 + self.scaling = self.head_dim**-0.5 self.is_decoder = is_decoder self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) diff --git a/src/transformers/models/xlm/configuration_xlm.py b/src/transformers/models/xlm/configuration_xlm.py index a84a11002f..d6f70c6671 100644 --- a/src/transformers/models/xlm/configuration_xlm.py +++ b/src/transformers/models/xlm/configuration_xlm.py @@ -169,7 +169,7 @@ class XLMConfig(PretrainedConfig): n_langs=1, use_lang_emb=True, max_position_embeddings=512, - embed_init_std=2048 ** -0.5, + embed_init_std=2048**-0.5, layer_norm_eps=1e-12, init_std=0.02, bos_index=0, diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py index d680427d7a..c31b82d786 100644 --- a/src/transformers/models/xlnet/modeling_tf_xlnet.py +++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py @@ -76,7 +76,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): self.n_head = config.n_head self.d_head = config.d_head self.d_model = config.d_model - self.scale = 1 / (config.d_head ** 0.5) + self.scale = 1 / (config.d_head**0.5) self.initializer_range = config.initializer_range self.output_attentions = config.output_attentions diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index 97c86e1eb1..c212e2979d 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -220,7 +220,7 @@ class XLNetRelativeAttention(nn.Module): self.n_head = config.n_head self.d_head = config.d_head self.d_model = config.d_model - self.scale = 1 / (config.d_head ** 0.5) + self.scale = 1 / (config.d_head**0.5) self.q = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) self.k = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index d9fd5def82..a4d907a945 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -175,7 +175,7 @@ class YosoLSHCumulation(torch.autograd.Function): use_cuda = query_mask.is_cuda num_hash = config["num_hash"] hash_code_len = config["hash_code_len"] - hashtable_capacity = int(2 ** hash_code_len) + hashtable_capacity = int(2**hash_code_len) if config["use_fast_hash"]: query_hash_code, key_hash_code = lsh_cumulation.fast_hash( @@ -202,7 +202,7 @@ class YosoLSHCumulation(torch.autograd.Function): use_cuda = grad.is_cuda hash_code_len = config["hash_code_len"] - hashtable_capacity = int(2 ** hash_code_len) + hashtable_capacity = int(2**hash_code_len) if config["lsh_backward"]: grad_value = lsh_cumulation.lsh_cumulation( diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py index 269e767e93..4d368cabf0 100644 --- a/src/transformers/optimization.py +++ b/src/transformers/optimization.py @@ -214,7 +214,7 @@ def get_polynomial_decay_schedule_with_warmup( lr_range = lr_init - lr_end decay_steps = num_training_steps - num_warmup_steps pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps - decay = lr_range * pct_remaining ** power + lr_end + decay = lr_range * pct_remaining**power + lr_end return decay / lr_init # as LambdaLR multiplies by lr_init return LambdaLR(optimizer, lr_lambda, last_epoch) @@ -586,7 +586,7 @@ class Adafactor(Optimizer): lr = self._get_lr(group, state) beta2t = 1.0 - math.pow(state["step"], group["decay_rate"]) - update = (grad ** 2) + group["eps"][0] + update = (grad**2) + group["eps"][0] if factored: exp_avg_sq_row = state["exp_avg_sq_row"] exp_avg_sq_col = state["exp_avg_sq_col"] diff --git a/src/transformers/pipelines/audio_utils.py b/src/transformers/pipelines/audio_utils.py index 9673705847..d967256bb6 100644 --- a/src/transformers/pipelines/audio_utils.py +++ b/src/transformers/pipelines/audio_utils.py @@ -204,7 +204,7 @@ def _ffmpeg_stream(ffmpeg_command, buflen: int): """ Internal function to create the generator of data through ffmpeg """ - bufsize = 2 ** 24 # 16Mo + bufsize = 2**24 # 16Mo try: with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=bufsize) as ffmpeg_process: while True: diff --git a/tests/test_generation_beam_search.py b/tests/test_generation_beam_search.py index fdbe35eafa..11cb8fadeb 100644 --- a/tests/test_generation_beam_search.py +++ b/tests/test_generation_beam_search.py @@ -102,7 +102,7 @@ class BeamSearchTester: beam_hyp.add(input_ids[beam_idx], -10.0 + float(beam_idx)) # -10.0 is removed => -9.0 is worst score - self.parent.assertAlmostEqual(beam_hyp.worst_score, -9.0 / (self.sequence_length ** beam_hyp.length_penalty)) + self.parent.assertAlmostEqual(beam_hyp.worst_score, -9.0 / (self.sequence_length**beam_hyp.length_penalty)) # -5.0 is better than worst score => should not be finished self.parent.assertFalse(beam_hyp.is_done(-5.0, self.sequence_length)) diff --git a/tests/test_modeling_ibert.py b/tests/test_modeling_ibert.py index 5143090069..fed2a80d65 100755 --- a/tests/test_modeling_ibert.py +++ b/tests/test_modeling_ibert.py @@ -544,7 +544,7 @@ class IBertModelIntegrationTest(unittest.TestCase): self.assertTrue(torch.allclose(q_int, q_int.round(), atol=1e-4)) # Output of the quantize Softmax should not exceed the output_bit - self.assertTrue(q.abs().max() < 2 ** output_bit) + self.assertTrue(q.abs().max() < 2**output_bit) array = [[i + j for j in range(10)] for i in range(-10, 10)] _test(array) diff --git a/tests/test_modeling_swin.py b/tests/test_modeling_swin.py index 29eddb7f7c..173f7ffc3a 100644 --- a/tests/test_modeling_swin.py +++ b/tests/test_modeling_swin.py @@ -252,7 +252,7 @@ class SwinModelTest(ModelTesterMixin, unittest.TestCase): # check that output_attentions also work using config del inputs_dict["output_attentions"] config.output_attentions = True - window_size_squared = config.window_size ** 2 + window_size_squared = config.window_size**2 model = model_class(config) model.to(torch_device) model.eval() diff --git a/tests/test_modeling_vit_mae.py b/tests/test_modeling_vit_mae.py index 9cf9fa2759..3c428938b7 100644 --- a/tests/test_modeling_vit_mae.py +++ b/tests/test_modeling_vit_mae.py @@ -134,7 +134,7 @@ class ViTMAEModelTester: patch_size = to_2tuple(self.patch_size) num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) expected_seq_len = num_patches - expected_num_channels = self.patch_size ** 2 * self.num_channels + expected_num_channels = self.patch_size**2 * self.num_channels self.parent.assertEqual(result.logits.shape, (self.batch_size, expected_seq_len, expected_num_channels)) def prepare_config_and_inputs_for_common(self): diff --git a/tests/test_utils_check_copies.py b/tests/test_utils_check_copies.py index 4082335ad2..5151c3a490 100644 --- a/tests/test_utils_check_copies.py +++ b/tests/test_utils_check_copies.py @@ -68,7 +68,8 @@ class CopyCheckTester(unittest.TestCase): code = comment + f"\nclass {class_name}(nn.Module):\n" + class_code if overwrite_result is not None: expected = comment + f"\nclass {class_name}(nn.Module):\n" + overwrite_result - code = black.format_str(code, mode=black.FileMode([black.TargetVersion.PY35], line_length=119)) + mode = black.Mode(target_versions={black.TargetVersion.PY35}, line_length=119) + code = black.format_str(code, mode=mode) fname = os.path.join(self.transformer_dir, "new_code.py") with open(fname, "w", newline="\n") as f: f.write(code) diff --git a/utils/check_copies.py b/utils/check_copies.py index ac9ab15973..dffb887d2e 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -88,7 +88,7 @@ def find_code_in_transformers(object_name): line_index = 0 for name in parts[i + 1 :]: while ( - line_index < len(lines) and re.search(fr"^{indent}(class|def)\s+{name}(\(|\:)", lines[line_index]) is None + line_index < len(lines) and re.search(rf"^{indent}(class|def)\s+{name}(\(|\:)", lines[line_index]) is None ): line_index += 1 indent += " " @@ -130,7 +130,8 @@ def blackify(code): has_indent = len(get_indent(code)) > 0 if has_indent: code = f"class Bla:\n{code}" - result = black.format_str(code, mode=black.FileMode([black.TargetVersion.PY35], line_length=119)) + mode = black.Mode(target_versions={black.TargetVersion.PY35}, line_length=119) + result = black.format_str(code, mode=mode) result, _ = style_docstrings_in_code(result) return result[len("class Bla:\n") :] if has_indent else result diff --git a/utils/get_modified_files.py b/utils/get_modified_files.py index c3d9327549..4227d1d5d2 100644 --- a/utils/get_modified_files.py +++ b/utils/get_modified_files.py @@ -28,7 +28,7 @@ fork_point_sha = subprocess.check_output("git merge-base master HEAD".split()).d modified_files = subprocess.check_output(f"git diff --name-only {fork_point_sha}".split()).decode("utf-8").split() joined_dirs = "|".join(sys.argv[1:]) -regex = re.compile(fr"^({joined_dirs}).*?\.py$") +regex = re.compile(rf"^({joined_dirs}).*?\.py$") relevant_modified_files = [x for x in modified_files if regex.match(x)] print(" ".join(relevant_modified_files), end="") diff --git a/utils/style_doc.py b/utils/style_doc.py index 1e8aecddc6..dc021446f5 100644 --- a/utils/style_doc.py +++ b/utils/style_doc.py @@ -147,9 +147,8 @@ def format_code_example(code: str, max_len: int, in_docstring: bool = False): for k, v in BLACK_AVOID_PATTERNS.items(): full_code = full_code.replace(k, v) try: - formatted_code = black.format_str( - full_code, mode=black.FileMode([black.TargetVersion.PY37], line_length=line_length) - ) + mode = black.Mode(target_versions={black.TargetVersion.PY37}, line_length=line_length) + formatted_code = black.format_str(full_code, mode=mode) error = "" except Exception as e: formatted_code = full_code