From 113424bcd53b92600f77d82f48add0a60fb41556 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 20 May 2025 18:11:22 +0200 Subject: [PATCH] Release: v4.52.0 --- examples/flax/question-answering/run_qa.py | 2 +- .../run_flax_speech_recognition_seq2seq.py | 2 +- .../flax/text-classification/run_flax_glue.py | 2 +- .../flax/token-classification/run_flax_ner.py | 2 +- .../run_audio_classification.py | 2 +- .../contrastive-image-text/run_clip.py | 2 +- .../run_image_classification.py | 2 +- .../run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- .../image-pretraining/run_mim_no_trainer.py | 2 +- .../run_instance_segmentation.py | 2 +- .../run_instance_segmentation_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- .../language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- .../language-modeling/run_fim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- .../language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- .../multiple-choice/run_swag_no_trainer.py | 2 +- .../object-detection/run_object_detection.py | 2 +- .../run_object_detection_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- .../question-answering/run_qa_beam_search.py | 2 +- .../run_qa_beam_search_no_trainer.py | 2 +- .../question-answering/run_qa_no_trainer.py | 2 +- .../question-answering/run_seq2seq_qa.py | 2 +- .../run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../run_speech_recognition_ctc.py | 2 +- .../run_speech_recognition_ctc_adapter.py | 2 +- .../run_speech_recognition_seq2seq.py | 2 +- .../summarization/run_summarization.py | 2 +- .../run_summarization_no_trainer.py | 2 +- .../text-classification/run_classification.py | 2 +- .../pytorch/text-classification/run_glue.py | 2 +- .../run_glue_no_trainer.py | 2 +- .../pytorch/text-classification/run_xnli.py | 2 +- .../pytorch/token-classification/run_ner.py | 2 +- .../run_ner_no_trainer.py | 2 +- .../pytorch/translation/run_translation.py | 2 +- .../translation/run_translation_no_trainer.py | 2 +- .../contrastive-image-text/run_clip.py | 2 +- .../run_image_classification.py | 2 +- .../tensorflow/multiple-choice/run_swag.py | 2 +- .../tensorflow/question-answering/run_qa.py | 2 +- .../summarization/run_summarization.py | 2 +- .../text-classification/run_glue.py | 2 +- .../tensorflow/translation/run_translation.py | 2 +- setup.py | 2 +- src/transformers/__init__.py | 2 +- ...lbert_original_tf_checkpoint_to_pytorch.py | 62 - .../models/align/convert_align_tf_to_hf.py | 389 ----- .../models/aria/convert_aria_weights_to_hf.py | 162 -- ...trogram_transformer_original_to_pytorch.py | 279 ---- .../bamba/convert_mamba_ssm_checkpoint.py | 273 ---- .../models/bark/convert_suno_to_hf.py | 263 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 156 -- .../beit/convert_beit_unilm_to_pytorch.py | 373 ----- ...bert_original_tf2_checkpoint_to_pytorch.py | 246 --- ..._bert_original_tf_checkpoint_to_pytorch.py | 62 - ..._bert_pytorch_checkpoint_to_original_tf.py | 112 -- ...ping_original_tf2_checkpoint_to_pytorch.py | 188 --- ...gbird_original_tf_checkpoint_to_pytorch.py | 69 - .../convert_bigbird_pegasus_tf_to_pytorch.py | 170 --- ..._original_pytorch_checkpoint_to_pytorch.py | 292 ---- .../models/bit/convert_bit_to_pytorch.py | 177 --- ..._original_pytorch_checkpoint_to_pytorch.py | 114 -- .../convert_blip_original_pytorch_to_hf.py | 191 --- .../convert_blip_2_original_to_pytorch.py | 390 ----- ...rt_bloom_original_checkpoint_to_pytorch.py | 254 ---- .../models/bros/convert_bros_to_pytorch.py | 145 -- ..._byt5_original_tf_checkpoint_to_pytorch.py | 59 - ...anine_original_tf_checkpoint_to_pytorch.py | 65 - .../convert_chameleon_weights_to_hf.py | 478 ------ ...ert_chinese_clip_original_pytorch_to_hf.py | 134 -- .../convert_clap_original_pytorch_to_hf.py | 133 -- .../convert_clip_original_pytorch_to_hf.py | 156 -- .../convert_clipseg_original_pytorch_to_hf.py | 264 ---- .../models/clvp/convert_clvp_to_hf.py | 234 --- .../colpali/convert_colpali_weights_to_hf.py | 214 --- ..._original_pytorch_checkpoint_to_pytorch.py | 324 ---- ...ginal_tf1_checkpoint_to_pytorch_and_tf2.py | 57 - .../convnext/convert_convnext_to_pytorch.py | 242 --- .../convert_convnextv2_to_pytorch.py | 286 ---- src/transformers/models/csm/convert_csm.py | 339 ----- ..._original_pytorch_checkpoint_to_pytorch.py | 362 ----- ..._fine_original_pytorch_checkpoint_to_hf.py | 689 --------- ..._original_pytorch_checkpoint_to_pytorch.py | 234 --- .../models/dac/convert_dac_checkpoint.py | 261 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 285 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 207 --- ..._original_pytorch_checkpoint_to_pytorch.py | 374 ----- .../convert_deformable_detr_to_pytorch.py | 236 --- .../deit/convert_deit_timm_to_pytorch.py | 218 --- ...original_gluonnlp_checkpoint_to_pytorch.py | 318 ---- .../deta/convert_deta_resnet_to_pytorch.py | 319 ---- .../deta/convert_deta_swin_to_pytorch.py | 326 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 252 ---- ...convert_gptsan_tf_checkpoint_to_pytorch.py | 181 --- .../deprecated/jukebox/convert_jukebox.py | 279 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 298 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 70 - ...fo_xl_original_tf_checkpoint_to_pytorch.py | 121 -- .../deprecated/van/convert_van_to_pytorch.py | 290 ---- .../convert_vit_hybrid_timm_to_pytorch.py | 282 ---- .../convert_depth_anything_to_hf.py | 368 ----- .../convert_distill_any_depth_to_hf.py | 246 --- .../convert_depth_pro_weights_to_hf.py | 255 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 277 ---- .../models/detr/convert_detr_to_pytorch.py | 385 ----- ..._original_pytorch_checkpoint_to_pytorch.py | 46 - .../models/dinov2/convert_dinov2_to_hf.py | 285 ---- .../convert_dinov2_with_registers_to_hf.py | 291 ---- .../dit/convert_dit_unilm_to_pytorch.py | 230 --- .../models/donut/convert_donut_to_pytorch.py | 234 --- ...vert_dpr_original_checkpoint_to_pytorch.py | 145 -- .../models/dpt/convert_dinov2_depth_to_hf.py | 383 ----- .../models/dpt/convert_dpt_beit_to_hf.py | 305 ---- .../dpt/convert_dpt_hybrid_to_pytorch.py | 315 ---- .../models/dpt/convert_dpt_swinv2_to_hf.py | 321 ---- .../models/dpt/convert_dpt_to_pytorch.py | 285 ---- .../convert_efficientnet_to_pytorch.py | 339 ----- ...ectra_original_tf_checkpoint_to_pytorch.py | 79 - .../models/emu3/convert_emu3_weights_to_hf.py | 448 ------ .../convert_encodec_checkpoint_to_pytorch.py | 365 ----- src/transformers/models/esm/convert_esm.py | 399 ----- .../falcon/convert_custom_code_checkpoint.py | 74 - ..._original_pytorch_checkpoint_to_pytorch.py | 210 --- .../fastspeech2_conformer/convert_hifigan.py | 134 -- .../convert_model_with_hifigan.py | 102 -- .../flava/convert_dalle_to_flava_codebook.py | 102 -- .../convert_flava_original_pytorch_to_hf.py | 99 -- ...net_original_flax_checkpoint_to_pytorch.py | 156 -- .../focalnet/convert_focalnet_to_hf_format.py | 237 --- ..._original_pytorch_checkpoint_to_pytorch.py | 280 ---- ...unnel_original_tf_checkpoint_to_pytorch.py | 64 - .../fuyu/convert_fuyu_model_weights_to_hf.py | 134 -- .../gemma/convert_gemma_weights_to_hf.py | 206 --- .../gemma2/convert_gemma2_weights_to_hf.py | 239 --- .../convert_gemma3_weights_orbax_to_hf.py | 594 -------- .../models/git/convert_git_to_pytorch.py | 448 ------ .../models/glm/convert_glm_weights_to_hf.py | 195 --- .../models/glm4/convert_glm4_weights_to_hf.py | 199 --- .../models/glpn/convert_glpn_to_pytorch.py | 218 --- .../convert_got_ocr2_weights_to_hf.py | 274 ---- ..._gpt2_original_tf_checkpoint_to_pytorch.py | 68 - .../convert_gpt_neo_mesh_tf_to_pytorch.py | 71 - .../gpt_sw3/convert_megatron_to_pytorch.py | 197 --- .../convert_grounding_dino_to_hf.py | 491 ------ .../groupvit/convert_groupvit_nvlab_to_hf.py | 217 --- .../models/hiera/convert_hiera_to_hf.py | 369 ----- ...rt_original_s3prl_checkpoint_to_pytorch.py | 222 --- ..._original_pytorch_checkpoint_to_pytorch.py | 261 ---- ...rt_original_s3prl_checkpoint_to_pytorch.py | 68 - .../convert_idefics2_weights_to_hf.py | 185 --- .../convert_idefics3_weights_to_hf.py | 214 --- .../models/ijepa/convert_ijepa_to_hf.py | 268 ---- ...onvert_imagegpt_original_tf2_to_pytorch.py | 71 - ...onvert_instructblip_original_to_pytorch.py | 303 ---- ...t_instructblipvideo_original_to_pytorch.py | 305 ---- .../convert_internvl_weights_to_hf.py | 421 ------ .../janus/convert_janus_weights_to_hf.py | 504 ------- ..._original_pytorch_checkpoint_to_pytorch.py | 77 - .../levit/convert_levit_timm_to_pytorch.py | 181 --- .../llama/convert_llama_weights_to_hf.py | 606 -------- .../llama4/convert_llama4_weights_to_hf.py | 743 --------- .../llava/convert_llava_weights_to_hf.py | 204 --- .../convert_llava_next_weights_to_hf.py | 397 ----- .../convert_llava_next_video_weights_to_hf.py | 276 ---- .../convert_llava_onevision_weights_to_hf.py | 388 ----- ...r_original_pytorch_lightning_to_pytorch.py | 85 -- .../convert_longt5x_checkpoint_to_flax.py | 215 --- ..._original_pytorch_checkpoint_to_pytorch.py | 170 --- ...xmert_original_tf_checkpoint_to_pytorch.py | 59 - ...t_m2m100_original_checkpoint_to_pytorch.py | 85 -- ...convert_mamba_ssm_checkpoint_to_pytorch.py | 153 -- ...onvert_mamba2_ssm_checkpoint_to_pytorch.py | 193 --- .../convert_marian_tatoeba_to_pytorch.py | 1327 ----------------- .../marian/convert_marian_to_pytorch.py | 717 --------- ..._original_pytorch_checkpoint_to_pytorch.py | 1019 ------------- ..._original_pytorch_checkpoint_to_pytorch.py | 731 --------- .../convert_maskformer_resnet_to_pytorch.py | 390 ----- .../convert_maskformer_swin_to_pytorch.py | 333 ----- ...rt_mbart_original_checkpoint_to_pytorch.py | 83 -- .../convert_megatron_bert_checkpoint.py | 334 ----- .../convert_megatron_gpt2_checkpoint.py | 358 ----- .../convert_mimi_checkpoint_to_pytorch.py | 198 --- .../mistral/convert_mistral_weights_to_hf.py | 284 ---- .../convert_mistral3_weights_to_hf.py | 241 --- .../mixtral/convert_mixtral_weights_to_hf.py | 245 --- .../models/mlcd/convert_mlcd_weights_to_hf.py | 336 ----- .../mllama/convert_mllama_weights_to_hf.py | 644 -------- ..._original_pytorch_checkpoint_to_pytorch.py | 229 --- ...ebert_original_tf_checkpoint_to_pytorch.py | 58 - ...nvert_original_tf_checkpoint_to_pytorch.py | 141 -- ...nvert_original_tf_checkpoint_to_pytorch.py | 177 --- .../mobilevit/convert_mlcvnets_to_pytorch.py | 311 ---- .../convert_mlcvnets_to_pytorch.py | 332 ----- .../moonshine/convert_usefulsensors_to_hf.py | 169 --- .../moshi/convert_moshi_transformers.py | 311 ---- .../mra/convert_mra_pytorch_to_pytorch.py | 110 -- .../musicgen/convert_musicgen_transformers.py | 236 --- .../convert_musicgen_melody_transformers.py | 267 ---- ..._myt5_original_tf_checkpoint_to_pytorch.py | 60 - .../nemotron/convert_nemotron_nemo_to_hf.py | 346 ----- ..._sharded_original_checkpoint_to_pytorch.py | 161 -- .../models/nougat/convert_nougat_to_hf.py | 282 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 111 -- .../models/olmo/convert_olmo_weights_to_hf.py | 248 --- .../olmo2/convert_olmo2_weights_to_hf.py | 306 ---- .../olmoe/convert_olmoe_weights_to_hf.py | 281 ---- .../omdet_turbo/convert_omdet_turbo_to_hf.py | 349 ----- .../oneformer/convert_to_hf_oneformer.py | 1191 --------------- ...penai_original_tf_checkpoint_to_pytorch.py | 74 - ..._original_pytorch_checkpoint_to_pytorch.py | 113 -- .../models/owlv2/convert_owlv2_to_hf.py | 422 ------ .../convert_owlvit_original_flax_to_hf.py | 406 ----- .../convert_paligemma2_weights_to_hf.py | 415 ------ .../convert_paligemma_weights_to_hf.py | 347 ----- .../pegasus/convert_pegasus_tf_to_pytorch.py | 131 -- .../convert_perceiver_haiku_to_pytorch.py | 468 ------ .../convert_persimmon_weights_to_hf.py | 129 -- .../models/phi/convert_phi_weights_to_hf.py | 207 --- .../convert_phi4_multimodal_weights_to_hf.py | 271 ---- ...nvert_pix2struct_original_pytorch_to_hf.py | 155 -- .../pixtral/convert_pixtral_weights_to_hf.py | 245 --- ...ert_plbart_original_checkpoint_to_torch.py | 94 -- .../convert_poolformer_original_to_pytorch.py | 214 --- .../convert_pop2piano_weights_to_hf.py | 190 --- .../convert_prompt_depth_anything_to_hf.py | 293 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 159 -- .../models/pvt/convert_pvt_to_pytorch.py | 226 --- .../pvt_v2/convert_pvt_v2_to_pytorch.py | 294 ---- .../convert_recurrent_gemma_to_hf.py | 222 --- ...ert_reformer_trax_checkpoint_to_pytorch.py | 226 --- .../convert_regnet_seer_10b_to_pytorch.py | 308 ---- .../regnet/convert_regnet_to_pytorch.py | 458 ------ ...onvert_rembert_tf_checkpoint_to_pytorch.py | 62 - .../resnet/convert_resnet_to_pytorch.py | 199 --- ..._original_pytorch_checkpoint_to_pytorch.py | 177 --- ..._original_pytorch_checkpoint_to_pytorch.py | 79 - ...ormer_original_tf_checkpoint_to_pytorch.py | 62 - ..._detr_original_pytorch_checkpoint_to_hf.py | 782 ---------- .../convert_rt_detr_v2_weights_to_hf.py | 364 ----- .../rwkv/convert_rwkv_checkpoint_to_hf.py | 209 --- .../models/sam/convert_sam_to_hf.py | 251 ---- .../models/sam_hq/convert_samhq_to_hf.py | 277 ---- .../seamless_m4t/convert_fairseq2_to_hf.py | 396 ----- .../seamless_m4t_v2/convert_fairseq2_to_hf.py | 404 ----- .../convert_segformer_original_to_pytorch.py | 387 ----- .../models/seggpt/convert_seggpt_to_hf.py | 221 --- ..._original_pytorch_checkpoint_to_pytorch.py | 305 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 317 ---- ...onvert_shieldgemma2_weights_orbax_to_hf.py | 470 ------ .../models/siglip/convert_siglip_to_hf.py | 533 ------- .../models/siglip2/convert_siglip2_to_hf.py | 438 ------ ...rt_wav2vec2_seq2seq_original_to_pytorch.py | 357 ----- ...xt_wav2vec2_seq2seq_original_to_pytorch.py | 316 ---- .../convert_s2t_fairseq_to_tfms.py | 121 -- .../models/speecht5/convert_hifigan.py | 108 -- ..._original_pytorch_checkpoint_to_pytorch.py | 401 ----- .../superglue/convert_superglue_to_hf.py | 342 ----- .../convert_superpoint_to_pytorch.py | 175 --- .../convert_swiftformer_original_to_hf.py | 175 --- .../swin/convert_swin_simmim_to_pytorch.py | 182 --- .../swin/convert_swin_timm_to_pytorch.py | 173 --- .../convert_swin2sr_original_to_pytorch.py | 278 ---- .../swinv2/convert_swinv2_timm_to_pytorch.py | 219 --- .../switch_transformers/convert_big_switch.py | 194 --- ...ers_original_flax_checkpoint_to_pytorch.py | 203 --- ...rt_t5_original_tf_checkpoint_to_pytorch.py | 59 - .../t5/convert_t5x_checkpoint_to_flax.py | 235 --- .../t5/convert_t5x_checkpoint_to_pytorch.py | 238 --- .../convert_table_transformer_to_hf.py | 317 ---- ...convert_table_transformer_to_hf_no_timm.py | 434 ------ ...tapas_original_tf_checkpoint_to_pytorch.py | 137 -- .../models/textnet/convert_textnet_to_hf.py | 208 --- .../timesfm/convert_timesfm_orignal_to_hf.py | 277 ---- .../convert_timesformer_to_pytorch.py | 253 ---- .../trocr/convert_trocr_unilm_to_pytorch.py | 237 --- .../models/udop/convert_udop_to_hf.py | 224 --- .../convert_umt5_checkpoint_to_pytorch.py | 274 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 273 ---- ...ch_original_s3prl_checkpoint_to_pytorch.py | 109 -- ..._original_pytorch_checkpoint_to_pytorch.py | 224 --- .../models/univnet/convert_univnet.py | 162 -- .../convert_convnext_upernet_to_pytorch.py | 214 --- .../convert_swin_upernet_to_pytorch.py | 297 ---- .../convert_video_llava_weights_to_hf.py | 159 -- .../videomae/convert_videomae_to_pytorch.py | 324 ---- .../vilt/convert_vilt_original_to_pytorch.py | 299 ---- .../convert_vipllava_weights_to_hf.py | 132 -- ..._original_pytorch_checkpoint_to_pytorch.py | 149 -- .../models/vit/convert_dino_to_pytorch.py | 218 --- .../models/vit/convert_vit_timm_to_pytorch.py | 254 ---- .../vit_mae/convert_vit_mae_to_pytorch.py | 178 --- .../models/vit_msn/convert_msn_to_pytorch.py | 245 --- .../models/vitmatte/convert_vitmatte_to_hf.py | 170 --- .../models/vitpose/convert_vitpose_to_hf.py | 429 ------ .../vits/convert_original_checkpoint.py | 390 ----- .../vivit/convert_vivit_flax_to_pytorch.py | 231 --- ..._original_pytorch_checkpoint_to_pytorch.py | 385 ----- ...c2_original_s3prl_checkpoint_to_pytorch.py | 109 -- .../convert_wav2vec2_seamless_checkpoint.py | 217 --- ..._original_pytorch_checkpoint_to_pytorch.py | 309 ---- ..._original_pytorch_checkpoint_to_pytorch.py | 206 --- ...lm_original_s3prl_checkpoint_to_pytorch.py | 109 -- .../models/whisper/convert_openai_to_hf.py | 370 ----- .../convert_x_clip_original_pytorch_to_hf.py | 386 ----- .../convert_xglm_original_ckpt_to_trfms.py | 68 - ..._original_pytorch_checkpoint_to_pytorch.py | 77 - ..._original_pytorch_checkpoint_to_pytorch.py | 183 --- ...xlnet_original_tf_checkpoint_to_pytorch.py | 113 -- ..._original_pytorch_checkpoint_to_pytorch.py | 212 --- .../models/yolos/convert_yolos_to_pytorch.py | 267 ---- .../yoso/convert_yoso_pytorch_to_pytorch.py | 108 -- .../models/zoedepth/convert_zoedepth_to_hf.py | 426 ------ 320 files changed, 53 insertions(+), 70095 deletions(-) delete mode 100644 src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/align/convert_align_tf_to_hf.py delete mode 100644 src/transformers/models/aria/convert_aria_weights_to_hf.py delete mode 100644 src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py delete mode 100644 src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py delete mode 100644 src/transformers/models/bark/convert_suno_to_hf.py delete mode 100644 src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/beit/convert_beit_unilm_to_pytorch.py delete mode 100644 src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py delete mode 100644 src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py delete mode 100755 src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bit/convert_bit_to_pytorch.py delete mode 100644 src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py delete mode 100644 src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/bros/convert_bros_to_pytorch.py delete mode 100755 src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py delete mode 100644 src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/clvp/convert_clvp_to_hf.py delete mode 100644 src/transformers/models/colpali/convert_colpali_weights_to_hf.py delete mode 100644 src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py delete mode 100644 src/transformers/models/convnext/convert_convnext_to_pytorch.py delete mode 100644 src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py delete mode 100644 src/transformers/models/csm/convert_csm.py delete mode 100644 src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py delete mode 100644 src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/dac/convert_dac_checkpoint.py delete mode 100644 src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py delete mode 100644 src/transformers/models/deit/convert_deit_timm_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/jukebox/convert_jukebox.py delete mode 100644 src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/van/convert_van_to_pytorch.py delete mode 100644 src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py delete mode 100644 src/transformers/models/depth_anything/convert_depth_anything_to_hf.py delete mode 100644 src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py delete mode 100644 src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py delete mode 100644 src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/detr/convert_detr_to_pytorch.py delete mode 100644 src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/dinov2/convert_dinov2_to_hf.py delete mode 100644 src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py delete mode 100644 src/transformers/models/dit/convert_dit_unilm_to_pytorch.py delete mode 100644 src/transformers/models/donut/convert_donut_to_pytorch.py delete mode 100644 src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/dpt/convert_dinov2_depth_to_hf.py delete mode 100644 src/transformers/models/dpt/convert_dpt_beit_to_hf.py delete mode 100644 src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py delete mode 100644 src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py delete mode 100644 src/transformers/models/dpt/convert_dpt_to_pytorch.py delete mode 100644 src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py delete mode 100644 src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/emu3/convert_emu3_weights_to_hf.py delete mode 100644 src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/esm/convert_esm.py delete mode 100644 src/transformers/models/falcon/convert_custom_code_checkpoint.py delete mode 100644 src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/fastspeech2_conformer/convert_hifigan.py delete mode 100644 src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py delete mode 100644 src/transformers/models/flava/convert_dalle_to_flava_codebook.py delete mode 100644 src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/focalnet/convert_focalnet_to_hf_format.py delete mode 100755 src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py delete mode 100644 src/transformers/models/gemma/convert_gemma_weights_to_hf.py delete mode 100644 src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py delete mode 100644 src/transformers/models/gemma3/convert_gemma3_weights_orbax_to_hf.py delete mode 100644 src/transformers/models/git/convert_git_to_pytorch.py delete mode 100644 src/transformers/models/glm/convert_glm_weights_to_hf.py delete mode 100644 src/transformers/models/glm4/convert_glm4_weights_to_hf.py delete mode 100644 src/transformers/models/glpn/convert_glpn_to_pytorch.py delete mode 100644 src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py delete mode 100755 src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py delete mode 100644 src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py delete mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py delete mode 100644 src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py delete mode 100644 src/transformers/models/hiera/convert_hiera_to_hf.py delete mode 100644 src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py delete mode 100644 src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py delete mode 100644 src/transformers/models/ijepa/convert_ijepa_to_hf.py delete mode 100644 src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py delete mode 100644 src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py delete mode 100644 src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py delete mode 100644 src/transformers/models/internvl/convert_internvl_weights_to_hf.py delete mode 100644 src/transformers/models/janus/convert_janus_weights_to_hf.py delete mode 100644 src/transformers/models/kosmos2/convert_kosmos2_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/levit/convert_levit_timm_to_pytorch.py delete mode 100644 src/transformers/models/llama/convert_llama_weights_to_hf.py delete mode 100644 src/transformers/models/llama4/convert_llama4_weights_to_hf.py delete mode 100644 src/transformers/models/llava/convert_llava_weights_to_hf.py delete mode 100644 src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py delete mode 100644 src/transformers/models/llava_next_video/convert_llava_next_video_weights_to_hf.py delete mode 100644 src/transformers/models/llava_onevision/convert_llava_onevision_weights_to_hf.py delete mode 100644 src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py delete mode 100644 src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py delete mode 100644 src/transformers/models/luke/convert_luke_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/m2m_100/convert_m2m100_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mamba2/convert_mamba2_ssm_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py delete mode 100644 src/transformers/models/marian/convert_marian_to_pytorch.py delete mode 100644 src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py delete mode 100644 src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py delete mode 100644 src/transformers/models/mbart/convert_mbart_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py delete mode 100644 src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py delete mode 100644 src/transformers/models/mimi/convert_mimi_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mistral/convert_mistral_weights_to_hf.py delete mode 100644 src/transformers/models/mistral3/convert_mistral3_weights_to_hf.py delete mode 100644 src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py delete mode 100644 src/transformers/models/mlcd/convert_mlcd_weights_to_hf.py delete mode 100644 src/transformers/models/mllama/convert_mllama_weights_to_hf.py delete mode 100644 src/transformers/models/mluke/convert_mluke_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mobilebert/convert_mobilebert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py delete mode 100644 src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py delete mode 100644 src/transformers/models/moonshine/convert_usefulsensors_to_hf.py delete mode 100644 src/transformers/models/moshi/convert_moshi_transformers.py delete mode 100644 src/transformers/models/mra/convert_mra_pytorch_to_pytorch.py delete mode 100644 src/transformers/models/musicgen/convert_musicgen_transformers.py delete mode 100644 src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py delete mode 100644 src/transformers/models/myt5/convert_myt5_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/nemotron/convert_nemotron_nemo_to_hf.py delete mode 100644 src/transformers/models/nllb_moe/convert_nllb_moe_sharded_original_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/nougat/convert_nougat_to_hf.py delete mode 100644 src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/olmo/convert_olmo_weights_to_hf.py delete mode 100644 src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py delete mode 100644 src/transformers/models/olmoe/convert_olmoe_weights_to_hf.py delete mode 100644 src/transformers/models/omdet_turbo/convert_omdet_turbo_to_hf.py delete mode 100644 src/transformers/models/oneformer/convert_to_hf_oneformer.py delete mode 100755 src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/owlv2/convert_owlv2_to_hf.py delete mode 100644 src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py delete mode 100644 src/transformers/models/paligemma/convert_paligemma2_weights_to_hf.py delete mode 100644 src/transformers/models/paligemma/convert_paligemma_weights_to_hf.py delete mode 100644 src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py delete mode 100644 src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py delete mode 100644 src/transformers/models/persimmon/convert_persimmon_weights_to_hf.py delete mode 100644 src/transformers/models/phi/convert_phi_weights_to_hf.py delete mode 100644 src/transformers/models/phi4_multimodal/convert_phi4_multimodal_weights_to_hf.py delete mode 100644 src/transformers/models/pix2struct/convert_pix2struct_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py delete mode 100644 src/transformers/models/plbart/convert_plbart_original_checkpoint_to_torch.py delete mode 100644 src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py delete mode 100644 src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py delete mode 100644 src/transformers/models/prompt_depth_anything/convert_prompt_depth_anything_to_hf.py delete mode 100644 src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/pvt/convert_pvt_to_pytorch.py delete mode 100644 src/transformers/models/pvt_v2/convert_pvt_v2_to_pytorch.py delete mode 100644 src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py delete mode 100755 src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py delete mode 100644 src/transformers/models/regnet/convert_regnet_to_pytorch.py delete mode 100755 src/transformers/models/rembert/convert_rembert_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/resnet/convert_resnet_to_pytorch.py delete mode 100644 src/transformers/models/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/roformer/convert_roformer_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/rt_detr/convert_rt_detr_original_pytorch_checkpoint_to_hf.py delete mode 100644 src/transformers/models/rt_detr_v2/convert_rt_detr_v2_weights_to_hf.py delete mode 100644 src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py delete mode 100644 src/transformers/models/sam/convert_sam_to_hf.py delete mode 100644 src/transformers/models/sam_hq/convert_samhq_to_hf.py delete mode 100644 src/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py delete mode 100644 src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py delete mode 100644 src/transformers/models/segformer/convert_segformer_original_to_pytorch.py delete mode 100644 src/transformers/models/seggpt/convert_seggpt_to_hf.py delete mode 100644 src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/shieldgemma2/convert_shieldgemma2_weights_orbax_to_hf.py delete mode 100644 src/transformers/models/siglip/convert_siglip_to_hf.py delete mode 100644 src/transformers/models/siglip2/convert_siglip2_to_hf.py delete mode 100644 src/transformers/models/speech_encoder_decoder/convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py delete mode 100644 src/transformers/models/speech_encoder_decoder/convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py delete mode 100644 src/transformers/models/speech_to_text/convert_s2t_fairseq_to_tfms.py delete mode 100644 src/transformers/models/speecht5/convert_hifigan.py delete mode 100644 src/transformers/models/speecht5/convert_speecht5_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/superglue/convert_superglue_to_hf.py delete mode 100644 src/transformers/models/superpoint/convert_superpoint_to_pytorch.py delete mode 100644 src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py delete mode 100644 src/transformers/models/swin/convert_swin_simmim_to_pytorch.py delete mode 100644 src/transformers/models/swin/convert_swin_timm_to_pytorch.py delete mode 100644 src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py delete mode 100644 src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py delete mode 100644 src/transformers/models/switch_transformers/convert_big_switch.py delete mode 100644 src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/t5/convert_t5_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py delete mode 100755 src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/table_transformer/convert_table_transformer_to_hf.py delete mode 100644 src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py delete mode 100644 src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/textnet/convert_textnet_to_hf.py delete mode 100644 src/transformers/models/timesfm/convert_timesfm_orignal_to_hf.py delete mode 100644 src/transformers/models/timesformer/convert_timesformer_to_pytorch.py delete mode 100644 src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py delete mode 100644 src/transformers/models/udop/convert_udop_to_hf.py delete mode 100644 src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/unispeech/convert_unispeech_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/unispeech_sat/convert_unispeech_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/univnet/convert_univnet.py delete mode 100644 src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py delete mode 100644 src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py delete mode 100644 src/transformers/models/video_llava/convert_video_llava_weights_to_hf.py delete mode 100644 src/transformers/models/videomae/convert_videomae_to_pytorch.py delete mode 100644 src/transformers/models/vilt/convert_vilt_original_to_pytorch.py delete mode 100644 src/transformers/models/vipllava/convert_vipllava_weights_to_hf.py delete mode 100644 src/transformers/models/visual_bert/convert_visual_bert_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/vit/convert_dino_to_pytorch.py delete mode 100644 src/transformers/models/vit/convert_vit_timm_to_pytorch.py delete mode 100644 src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py delete mode 100644 src/transformers/models/vit_msn/convert_msn_to_pytorch.py delete mode 100644 src/transformers/models/vitmatte/convert_vitmatte_to_hf.py delete mode 100644 src/transformers/models/vitpose/convert_vitpose_to_hf.py delete mode 100644 src/transformers/models/vits/convert_original_checkpoint.py delete mode 100644 src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py delete mode 100644 src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py delete mode 100644 src/transformers/models/wav2vec2_conformer/convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/wavlm/convert_wavlm_original_s3prl_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/whisper/convert_openai_to_hf.py delete mode 100644 src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py delete mode 100644 src/transformers/models/xglm/convert_xglm_original_ckpt_to_trfms.py delete mode 100755 src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/xlm_roberta_xl/convert_xlm_roberta_xl_original_pytorch_checkpoint_to_pytorch.py delete mode 100755 src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/xmod/convert_xmod_original_pytorch_checkpoint_to_pytorch.py delete mode 100644 src/transformers/models/yolos/convert_yolos_to_pytorch.py delete mode 100644 src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py delete mode 100644 src/transformers/models/zoedepth/convert_zoedepth_to_hf.py diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index eb506fff04..bcac34d662 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -60,7 +60,7 @@ from transformers.utils import check_min_version, send_example_telemetry logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 31c1e84e14..33a7107322 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -59,7 +59,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index b5378b8c11..5c4c158d85 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -55,7 +55,7 @@ from transformers.utils import check_min_version, send_example_telemetry logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 854d7c7136..a52f881494 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -56,7 +56,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 8ea627dbdd..e78fa91662 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -44,7 +44,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 2f5d8eef7c..c25696c6bc 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -53,7 +53,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 9d70c80ea6..bf590c8f16 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -56,7 +56,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 45259b1b0a..6bef954be6 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -48,7 +48,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index f39fcd17c0..2a56148d6f 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -42,7 +42,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 842e7c8d60..b44ec54cbf 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -47,7 +47,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used. logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 13a545d4cb..bfd1d25144 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -52,7 +52,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used. logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 6c4087120d..529c04a32c 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index fb33681168..bb2706e28e 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 44869b004b..3f224ffd6c 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -54,7 +54,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 2a5a0d5d72..b80db9a9d2 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -56,7 +56,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index ac97a3c10d..742ba901bc 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -57,7 +57,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index a53cd5740a..fa86db28c0 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -59,7 +59,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 7a9abff0e3..91fafcb976 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -53,7 +53,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index d8285bdc0d..1e0c0ae629 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -56,7 +56,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 0eda261d17..901b14f6d3 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -46,7 +46,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 3817a2d55b..c09bf1bfa0 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -45,7 +45,7 @@ from transformers.utils import check_min_version, send_example_telemetry # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 851346c6d2..ec8b52b2d5 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -53,7 +53,7 @@ from transformers.utils import check_min_version, send_example_telemetry # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index cc319d331e..c3c327178f 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 5265ca0c40..adba2590f7 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index dbdc52cede..50dc93a026 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -49,7 +49,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 6ea909a7da..98b8875298 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -47,7 +47,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 84bead830a..5b356869e2 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -54,7 +54,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 138dd61f99..ca5cc93db8 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -56,7 +56,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index a07e34f091..178a03e084 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -45,7 +45,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index bfedf7c4ca..4908b59994 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -50,7 +50,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index cd2a7a895b..41a8b016c7 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -49,7 +49,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 53a1f98c89..3c07586122 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -49,7 +49,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 8c1a66c7f6..66f25965d2 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -52,7 +52,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 1b64ea078d..ef58123d3f 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -47,7 +47,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index ce63c1c7f0..379513fe1c 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -51,7 +51,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 4351ecffd1..69bd5c0814 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -55,7 +55,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index 62d71b7f28..911e923732 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -46,7 +46,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 7f8a266b44..5ebdc3014d 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index eab318867b..140d401251 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -48,7 +48,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 86ecb0f63a..050658b152 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -47,7 +47,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 28c344de27..7d3d1981ec 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -48,7 +48,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index cf42f0d01c..502b63215f 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -55,7 +55,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index 1310668646..c32c6390be 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -51,7 +51,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 876bf3ebdb..e8a56c80d1 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -56,7 +56,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 1b084a603d..2c3245681c 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -50,7 +50,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index c9f9c9750b..78e1aff8b6 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -54,7 +54,7 @@ from transformers.utils.versions import require_version logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 9b6ba4228c..a7614caf4b 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -49,7 +49,7 @@ from transformers.utils import check_min_version, send_example_telemetry # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index 28418496c2..a56eb2045d 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -61,7 +61,7 @@ except (ModuleNotFoundError, ImportError): # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 2a2ef3fb76..8053de01be 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -52,7 +52,7 @@ from transformers.utils.versions import require_version # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index 2e9096b364..aa2b15e2ae 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -46,7 +46,7 @@ from transformers.utils import check_min_version, send_example_telemetry # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 5d9771d425..6aadb3ec4c 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -55,7 +55,7 @@ from transformers.utils.versions import require_version # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.52.0.dev0") +check_min_version("4.52.0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index c3888e5373..8f6e4f6eee 100644 --- a/setup.py +++ b/setup.py @@ -451,7 +451,7 @@ install_requires = [ setup( name="transformers", - version="4.52.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.52.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e2d20fb127..350df2f51c 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.52.0.dev0" +__version__ = "4.52.0" from pathlib import Path from typing import TYPE_CHECKING diff --git a/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index df2a226101..0000000000 --- a/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ALBERT checkpoint.""" - -import argparse - -import torch - -from ...utils import logging -from . import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): - # Initialise PyTorch model - config = AlbertConfig.from_json_file(albert_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = AlbertForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_albert(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--albert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained ALBERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/align/convert_align_tf_to_hf.py b/src/transformers/models/align/convert_align_tf_to_hf.py deleted file mode 100644 index 610db8482f..0000000000 --- a/src/transformers/models/align/convert_align_tf_to_hf.py +++ /dev/null @@ -1,389 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ALIGN checkpoints from the original repository.""" - -import argparse -import os - -import align -import numpy as np -import requests -import tensorflow as tf -import torch -from PIL import Image -from tokenizer import Tokenizer - -from transformers import ( - AlignConfig, - AlignModel, - AlignProcessor, - BertConfig, - BertTokenizer, - EfficientNetConfig, - EfficientNetImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def preprocess(image): - image = tf.image.resize(image, (346, 346)) - image = tf.image.crop_to_bounding_box(image, (346 - 289) // 2, (346 - 289) // 2, 289, 289) - return image - - -def get_align_config(): - vision_config = EfficientNetConfig.from_pretrained("google/efficientnet-b7") - vision_config.image_size = 289 - vision_config.hidden_dim = 640 - vision_config.id2label = {"0": "LABEL_0", "1": "LABEL_1"} - vision_config.label2id = {"LABEL_0": 0, "LABEL_1": 1} - vision_config.depthwise_padding = [] - - text_config = BertConfig() - config = AlignConfig.from_text_vision_configs( - text_config=text_config, vision_config=vision_config, projection_dim=640 - ) - return config - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_processor(): - image_processor = EfficientNetImageProcessor( - do_center_crop=True, - rescale_factor=1 / 127.5, - rescale_offset=True, - do_normalize=False, - include_top=False, - resample=Image.BILINEAR, - ) - tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") - tokenizer.model_max_length = 64 - processor = AlignProcessor(image_processor=image_processor, tokenizer=tokenizer) - return processor - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def rename_keys(original_param_names): - # EfficientNet image encoder - block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")] - block_names = list(set(block_names)) - block_names = sorted(block_names) - num_blocks = len(block_names) - block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))} - - rename_keys = [] - rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight")) - rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight")) - rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias")) - rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean")) - rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var")) - - for b in block_names: - hf_b = block_name_mapping[b] - rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight")) - rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight")) - rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias")) - rename_keys.append( - (f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var") - ) - rename_keys.append( - (f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight") - ) - rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight")) - rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias")) - rename_keys.append( - (f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean") - ) - rename_keys.append( - (f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var") - ) - - rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight")) - rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias")) - rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight")) - rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias")) - rename_keys.append( - (f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight") - ) - rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight")) - rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias")) - rename_keys.append( - (f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var") - ) - - key_mapping = {} - for item in rename_keys: - if item[0] in original_param_names: - key_mapping[item[0]] = "vision_model." + item[1] - - # BERT text encoder - rename_keys = [] - old = "tf_bert_model/bert" - new = "text_model" - for i in range(12): - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/query/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.query.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/query/bias:0", - f"{new}.encoder.layer.{i}.attention.self.query.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/key/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.key.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/key/bias:0", - f"{new}.encoder.layer.{i}.attention.self.key.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/value/kernel:0", - f"{new}.encoder.layer.{i}.attention.self.value.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/self/value/bias:0", - f"{new}.encoder.layer.{i}.attention.self.value.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/dense/kernel:0", - f"{new}.encoder.layer.{i}.attention.output.dense.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/dense/bias:0", - f"{new}.encoder.layer.{i}.attention.output.dense.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/gamma:0", - f"{new}.encoder.layer.{i}.attention.output.LayerNorm.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/beta:0", - f"{new}.encoder.layer.{i}.attention.output.LayerNorm.bias", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/intermediate/dense/kernel:0", - f"{new}.encoder.layer.{i}.intermediate.dense.weight", - ) - ) - rename_keys.append( - ( - f"{old}/encoder/layer_._{i}/intermediate/dense/bias:0", - f"{new}.encoder.layer.{i}.intermediate.dense.bias", - ) - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/dense/kernel:0", f"{new}.encoder.layer.{i}.output.dense.weight") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/dense/bias:0", f"{new}.encoder.layer.{i}.output.dense.bias") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/LayerNorm/gamma:0", f"{new}.encoder.layer.{i}.output.LayerNorm.weight") - ) - rename_keys.append( - (f"{old}/encoder/layer_._{i}/output/LayerNorm/beta:0", f"{new}.encoder.layer.{i}.output.LayerNorm.bias") - ) - - rename_keys.append((f"{old}/embeddings/word_embeddings/weight:0", f"{new}.embeddings.word_embeddings.weight")) - rename_keys.append( - (f"{old}/embeddings/position_embeddings/embeddings:0", f"{new}.embeddings.position_embeddings.weight") - ) - rename_keys.append( - (f"{old}/embeddings/token_type_embeddings/embeddings:0", f"{new}.embeddings.token_type_embeddings.weight") - ) - rename_keys.append((f"{old}/embeddings/LayerNorm/gamma:0", f"{new}.embeddings.LayerNorm.weight")) - rename_keys.append((f"{old}/embeddings/LayerNorm/beta:0", f"{new}.embeddings.LayerNorm.bias")) - - rename_keys.append((f"{old}/pooler/dense/kernel:0", f"{new}.pooler.dense.weight")) - rename_keys.append((f"{old}/pooler/dense/bias:0", f"{new}.pooler.dense.bias")) - rename_keys.append(("dense/kernel:0", "text_projection.weight")) - rename_keys.append(("dense/bias:0", "text_projection.bias")) - rename_keys.append(("dense/bias:0", "text_projection.bias")) - rename_keys.append(("temperature:0", "temperature")) - - for item in rename_keys: - if item[0] in original_param_names: - key_mapping[item[0]] = item[1] - return key_mapping - - -def replace_params(hf_params, tf_params, key_mapping): - list(hf_params.keys()) - - for key, value in tf_params.items(): - if key not in key_mapping: - continue - - hf_key = key_mapping[key] - if "_conv" in key and "kernel" in key: - new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1) - elif "embeddings" in key: - new_hf_value = torch.from_numpy(value) - elif "depthwise_kernel" in key: - new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1) - elif "kernel" in key: - new_hf_value = torch.from_numpy(np.transpose(value)) - elif "temperature" in key: - new_hf_value = value - elif "bn/gamma" or "bn/beta" in key: - new_hf_value = torch.from_numpy(np.transpose(value)).squeeze() - else: - new_hf_value = torch.from_numpy(value) - - # Replace HF parameters with original TF model parameters - hf_params[hf_key].copy_(new_hf_value) - - -@torch.no_grad() -def convert_align_checkpoint(checkpoint_path, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our ALIGN structure. - """ - # Load original model - seq_length = 64 - tok = Tokenizer(seq_length) - original_model = align.Align("efficientnet-b7", "bert-base", 640, seq_length, tok.get_vocab_size()) - original_model.compile() - original_model.load_weights(checkpoint_path) - - tf_params = original_model.trainable_variables - tf_non_train_params = original_model.non_trainable_variables - tf_params = {param.name: param.numpy() for param in tf_params} - for param in tf_non_train_params: - tf_params[param.name] = param.numpy() - tf_param_names = list(tf_params.keys()) - - # Load HuggingFace model - config = get_align_config() - hf_model = AlignModel(config).eval() - hf_params = hf_model.state_dict() - - # Create src-to-dst parameter name mapping dictionary - print("Converting parameters...") - key_mapping = rename_keys(tf_param_names) - replace_params(hf_params, tf_params, key_mapping) - - # Initialize processor - processor = get_processor() - inputs = processor( - images=prepare_img(), text="A picture of a cat", padding="max_length", max_length=64, return_tensors="pt" - ) - - # HF model inference - hf_model.eval() - with torch.no_grad(): - outputs = hf_model(**inputs) - - hf_image_features = outputs.image_embeds.detach().numpy() - hf_text_features = outputs.text_embeds.detach().numpy() - - # Original model inference - original_model.trainable = False - tf_image_processor = EfficientNetImageProcessor( - do_center_crop=True, - do_rescale=False, - do_normalize=False, - include_top=False, - resample=Image.BILINEAR, - ) - image = tf_image_processor(images=prepare_img(), return_tensors="tf", data_format="channels_last")["pixel_values"] - text = tok(tf.constant(["A picture of a cat"])) - - image_features = original_model.image_encoder(image, training=False) - text_features = original_model.text_encoder(text, training=False) - - image_features = tf.nn.l2_normalize(image_features, axis=-1) - text_features = tf.nn.l2_normalize(text_features, axis=-1) - - # Check whether original and HF model outputs match -> np.allclose - if not np.allclose(image_features, hf_image_features, atol=1e-3): - raise ValueError("The predicted image features are not the same.") - if not np.allclose(text_features, hf_text_features, atol=1e-3): - raise ValueError("The predicted text features are not the same.") - print("Model outputs match!") - - if save_model: - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - # Save converted model and image processor - hf_model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Push model and image processor to hub - print("Pushing converted ALIGN to the hub...") - processor.push_to_hub("align-base") - hf_model.push_to_hub("align-base") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_path", - default="./weights/model-weights", - type=str, - help="Path to the pretrained TF ALIGN checkpoint.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="hf_model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - - args = parser.parse_args() - convert_align_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub) diff --git a/src/transformers/models/aria/convert_aria_weights_to_hf.py b/src/transformers/models/aria/convert_aria_weights_to_hf.py deleted file mode 100644 index a95f3cda83..0000000000 --- a/src/transformers/models/aria/convert_aria_weights_to_hf.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import glob - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import ( - AddedToken, - AriaForConditionalGeneration, - AriaProcessor, - AutoConfig, - AutoTokenizer, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/aria/convert_aria_weights_to_hf.py --text_model_id rhymes-ai/Aria --vision_model_id rhymes-ai/Aria --output_hub_path m-ric/Aria_hf_2 --old_state_dict_id rhymes-ai/Aria - -Example for creating the old state dict file with Python: - - import torch - from aria.model.language_model.aria_llama import AriaTextForCausalLM - - # load model - kwargs = {"device_map": "auto", "torch_dtype": torch.float16} - model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", low_cpu_mem_usage=True, **kwargs) - - # load vision tower - model.get_vision_tower().load_model() - - # Save state dict - torch.save(model.state_dict(), "tmp/hf_models/aria/model_state_dict.bin") -""" - -KEYS_TO_MODIFY_MAPPING = { - "vision_tower.vision_model": "vision_tower", - "ln_ffn": "layer_norm", - "ffn": "feed_forward", - "ln_kv": "layer_norm_kv", -} - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - new_state_dict["vision_tower.post_layernorm.weight"] = torch.zeros((1152,)) - new_state_dict["vision_tower.post_layernorm.bias"] = torch.zeros((1152,)) - - return new_state_dict - - -def convert_aria_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): - torch.set_default_dtype(torch.float16) - - tokenizer = AutoTokenizer.from_pretrained( - text_model_id, - extra_special_tokens={ - "image_token": "<|img|>", - "pad_token": "", - }, - ) - tokenizer.add_tokens(AddedToken("<|img|>", special=True, normalized=False), special_tokens=True) - tokenizer.add_special_tokens({"pad_token": ""}) - tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<|img|>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" - - processor = AriaProcessor.from_pretrained( - text_model_id, - tokenizer=tokenizer, - ) - - config = AutoConfig.from_pretrained(text_model_id) - config.vision_config.hidden_size = 1152 - config.vision_config.attention_heads = 16 - config.pad_token_id = 2 - config.image_token_id = 9 - config.intermediate_size = config.moe_intermediate_size - config.auto_map = { - "AutoConfig": "modeling_aria.AriaConfig", - "AutoModelForCausalLM": "modeling_aria.AriaForConditionalGeneration", - } - - with torch.device("meta"): - model = AriaForConditionalGeneration(config) - - state_dict = load_original_state_dict(old_state_dict_id) - - state_dict = convert_state_dict_to_hf(state_dict) - model.load_state_dict(state_dict, strict=False, assign=True) - - # print("Saving models") - # model.save_pretrained("local_aria", safe_serialization=False) - # processor.save_pretrained("local_aria") - print("Pushing to hub") - model.push_to_hub(output_hub_path, create_pr=True) - processor.push_to_hub(output_hub_path, create_pr=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--text_model_id", - default="rhymes-ai/Aria", - help="Hub location of the text model", - ) - parser.add_argument( - "--vision_model_id", - default="rhymes-ai/Aria", - help="Hub location of the vision model", - ) - parser.add_argument( - "--output_hub_path", - default="rhymes-ai/Aria", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--old_state_dict_id", - default="rhymes-ai/Aria", - help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", - ) - args = parser.parse_args() - convert_aria_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py deleted file mode 100644 index d211ef7ab0..0000000000 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Audio Spectrogram Transformer checkpoints from the original repository. URL: https://github.com/YuanGongND/ast""" - -import argparse -import json -from pathlib import Path - -import torch -import torchaudio -from datasets import load_dataset -from huggingface_hub import hf_hub_download - -from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_audio_spectrogram_transformer_config(model_name): - config = ASTConfig() - - if "10-10" in model_name: - pass - elif "speech-commands" in model_name: - config.max_length = 128 - elif "12-12" in model_name: - config.time_stride = 12 - config.frequency_stride = 12 - elif "14-14" in model_name: - config.time_stride = 14 - config.frequency_stride = 14 - elif "16-16" in model_name: - config.time_stride = 16 - config.frequency_stride = 16 - else: - raise ValueError("Model not supported") - - repo_id = "huggingface/label-files" - if "speech-commands" in model_name: - config.num_labels = 35 - filename = "speech-commands-v2-id2label.json" - else: - config.num_labels = 527 - filename = "audioset-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -def rename_key(name): - if "module.v" in name: - name = name.replace("module.v", "audio_spectrogram_transformer") - if "cls_token" in name: - name = name.replace("cls_token", "embeddings.cls_token") - if "dist_token" in name: - name = name.replace("dist_token", "embeddings.distillation_token") - if "pos_embed" in name: - name = name.replace("pos_embed", "embeddings.position_embeddings") - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - # transformer blocks - if "blocks" in name: - name = name.replace("blocks", "encoder.layer") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "attn" in name: - name = name.replace("attn", "attention.self") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - # final layernorm - if "audio_spectrogram_transformer.norm" in name: - name = name.replace("audio_spectrogram_transformer.norm", "audio_spectrogram_transformer.layernorm") - # classifier head - if "module.mlp_head.0" in name: - name = name.replace("module.mlp_head.0", "classifier.layernorm") - if "module.mlp_head.1" in name: - name = name.replace("module.mlp_head.1", "classifier.dense") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - key_split = key.split(".") - layer_num = int(key_split[3]) - dim = config.hidden_size - if "weight" in key: - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight" - ] = val[:dim, :] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight" - ] = val[dim : dim * 2, :] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight" - ] = val[-dim:, :] - else: - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias" - ] = val[:dim] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias" - ] = val[-dim:] - else: - orig_state_dict[rename_key(key)] = val - - return orig_state_dict - - -def remove_keys(state_dict): - ignore_keys = [ - "module.v.head.weight", - "module.v.head.bias", - "module.v.head_dist.weight", - "module.v.head_dist.bias", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -@torch.no_grad() -def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our Audio Spectrogram Transformer structure. - """ - config = get_audio_spectrogram_transformer_config(model_name) - - model_name_to_url = { - "ast-finetuned-audioset-10-10-0.4593": ( - "https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.450": ( - "https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.448": ( - "https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1" - ), - "ast-finetuned-audioset-10-10-0.448-v2": ( - "https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1" - ), - "ast-finetuned-audioset-12-12-0.447": ( - "https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1" - ), - "ast-finetuned-audioset-14-14-0.443": ( - "https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1" - ), - "ast-finetuned-audioset-16-16-0.442": ( - "https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1" - ), - "ast-finetuned-speech-commands-v2": ( - "https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1" - ), - } - - # load original state_dict - checkpoint_url = model_name_to_url[model_name] - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove some keys - remove_keys(state_dict) - # rename some keys - new_state_dict = convert_state_dict(state_dict, config) - - # load đŸ€— model - model = ASTForAudioClassification(config) - model.eval() - - model.load_state_dict(new_state_dict) - - # verify outputs on dummy input - # source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62 - mean = -4.2677393 if "speech-commands" not in model_name else -6.845978 - std = 4.5689974 if "speech-commands" not in model_name else 5.5654526 - max_length = 1024 if "speech-commands" not in model_name else 128 - feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length) - - if "speech-commands" in model_name: - # TODO: Convert dataset to Parquet - dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True) - waveform = dataset[0]["audio"]["array"] - else: - filepath = hf_hub_download( - repo_id="nielsr/audio-spectogram-transformer-checkpoint", - filename="sample_audio.flac", - repo_type="dataset", - ) - - waveform, _ = torchaudio.load(filepath) - waveform = waveform.squeeze().numpy() - - inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt") - - # forward pass - outputs = model(**inputs) - logits = outputs.logits - - if model_name == "ast-finetuned-audioset-10-10-0.4593": - expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]) - elif model_name == "ast-finetuned-audioset-10-10-0.450": - expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718]) - elif model_name == "ast-finetuned-audioset-10-10-0.448": - expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344]) - elif model_name == "ast-finetuned-audioset-10-10-0.448-v2": - expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917]) - elif model_name == "ast-finetuned-audioset-12-12-0.447": - expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843]) - elif model_name == "ast-finetuned-audioset-14-14-0.443": - expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413]) - elif model_name == "ast-finetuned-audioset-16-16-0.442": - expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470]) - elif model_name == "ast-finetuned-speech-commands-v2": - expected_slice = torch.tensor([6.1589, -8.0566, -8.7984]) - else: - raise ValueError("Unknown model name") - if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4): - raise ValueError("Logits don't match") - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving feature extractor to {pytorch_dump_folder_path}") - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and feature extractor to the hub...") - model.push_to_hub(f"MIT/{model_name}") - feature_extractor.push_to_hub(f"MIT/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="ast-finetuned-audioset-10-10-0.4593", - type=str, - help="Name of the Audio Spectrogram Transformer model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_audio_spectrogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py b/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py deleted file mode 100644 index 92ddcc88d4..0000000000 --- a/src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py +++ /dev/null @@ -1,273 +0,0 @@ -# coding=utf-8 -# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This script can be used to convert checkpoints provided in the `mamba_ssm` library into the format provided in HuggingFace `transformers`. It depends on the `mamba2_ssm` package to be installed.""" - -import argparse -import json -import os -import re -from os import path -from typing import Dict, Optional, Union - -import torch -from huggingface_hub import split_torch_state_dict_into_shards -from safetensors.torch import save_file - -from transformers import AutoTokenizer -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME - -from .configuration_bamba import BambaConfig - - -def convert_state_dict_from_mamba_ssm(original_sd: Dict) -> Dict[str, torch.Tensor]: - state_dict = {} - - for orig_k, param in original_sd.items(): - k = orig_k.replace("backbone", "model") - - # for embeddings - k = k.replace("embedding", "embed_tokens") - - # for mixer - k = k.replace("mixer", "mamba") - - # for final layernorm - k = k.replace("norm_f", "final_layernorm") - - # for block layernorm - k = re.sub(r"(\d+)\.norm\.", r"\1.input_layernorm.", k) - k = re.sub(r"(\d+)\.norm2\.", r"\1.pre_ff_layernorm.", k) - - # for mlp - k = k.replace("mlp.fc2", "feed_forward.down_proj") - - if "mlp.fc1" in k: - param, param2 = torch.chunk(param, 2, dim=0) - k2 = k.replace("mlp.fc1", "feed_forward.gate_proj") - state_dict[k2] = param2 - k = k.replace("mlp.fc1", "feed_forward.up_proj") - - if ("in_proj" in k and orig_k.replace("in_proj", "conv1d") in original_sd) or ( - "out_proj" in k and orig_k.replace("out_proj", "conv1d") in original_sd - ): - # then this must be a mamba - pass - else: - # for attn - # - because mixer was replaced to mamba above - k = k.replace("mamba.out_proj", "self_attn.o_proj") - if "mamba.in_proj" in k: - m, n = param.shape - d = (m - n) // 2 - param, param2, param3 = torch.split(param, [n, d, d], dim=0) - k2 = k.replace("mamba.in_proj", "self_attn.k_proj") - state_dict[k2] = param2 - k2 = k.replace("mamba.in_proj", "self_attn.v_proj") - state_dict[k2] = param3 - k = k.replace("mamba.in_proj", "self_attn.q_proj") - - state_dict[k] = param - - return state_dict - - -# Adapted from transformers.models.mamba.convert_mamba_ssm_checkpoint_to_pytorch.py -def convert_ssm_config_to_hf_config( - config_ssm: Dict, - **kwargs, -) -> BambaConfig: - """Convert a config from mamba_ssm to a BambaConfig from here.""" - hf_config: BambaConfig = BambaConfig(**kwargs) - - hf_config.architectures = ["BambaForCausalLM"] - - # Set important values from config and recalculate other resulting entries - hf_config.hidden_size = config_ssm["d_model"] - hf_config.intermediate_size = config_ssm["d_intermediate"] - hf_config.mamba_n_heads = (hf_config.hidden_size * hf_config.mamba_expand) // hf_config.mamba_d_head - hf_config.num_hidden_layers = config_ssm["n_layer"] - hf_config.tie_word_embeddings = config_ssm["tie_embeddings"] - - # currently this script assumes config_ssm belongs to v2 - if config_ssm["ssm_cfg"].get("layer") != "Mamba2": - raise ValueError("Conversion script only supports Mamba2") - - # Set attention values - attn_cfg = config_ssm.get("attn_cfg") - if attn_cfg: - assert attn_cfg["causal"], "Only support non-causal attention." - assert not attn_cfg["qkv_proj_bias"], "Only support no qkv bias." - assert not attn_cfg["out_proj_bias"], "Only support no out bias." - hf_config.attn_rotary_emb = attn_cfg["rotary_emb_dim"] - hf_config.num_attention_heads = attn_cfg["num_heads"] - hf_config.num_key_value_heads = attn_cfg["num_heads_kv"] - - attention_layer_indices = config_ssm.get("attn_layer_idx") - if attention_layer_indices: - hf_config.attn_layer_indices = attention_layer_indices - - # Padded vocab size, mostly of 16 but 32 is also very common in different models - vocab_size = config_ssm["vocab_size"] - pad_vocab_size_multiple = config_ssm["pad_vocab_size_multiple"] - if (vocab_size % pad_vocab_size_multiple) != 0: - vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple) - hf_config.vocab_size = vocab_size - - return hf_config - - -def save_single_safetensor( - state_dict: Dict, - save_directory: str, - metadata: Dict, -): - save_file( - state_dict, - os.path.join(save_directory, SAFE_WEIGHTS_NAME), - metadata, - ) - - -def save_sharded_safetensors( - state_dict: Dict, - save_directory: str, - metadata: Dict, - max_shard_size: Union[int, str] = "5GB", -): - filename_pattern = SAFE_WEIGHTS_NAME.replace(".bin", "{suffix}.bin").replace( - ".safetensors", "{suffix}.safetensors" - ) - state_dict_split = split_torch_state_dict_into_shards( - state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size - ) - index = { - "metadata": state_dict_split.metadata, - "weight_map": state_dict_split.tensor_to_filename, - } - # Save the index - with open(os.path.join(save_directory, SAFE_WEIGHTS_INDEX_NAME), "w", encoding="utf-8") as f: - content = json.dumps(index, indent=2, sort_keys=True) + "\n" - f.write(content) - - filename_to_tensors = state_dict_split.filename_to_tensors.items() - for shard_file, tensors in filename_to_tensors: - shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors} - save_file(shard, os.path.join(save_directory, shard_file), metadata=metadata) - - -# Adapted from transformers.models.mamba.convert_mamba_ssm_checkpoint_to_pytorch.py -def convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( - mamba_ssm_checkpoint_path: str, - precision: str, - output_dir: str, - tokenizer_path: Optional[str] = None, - save_model: Union[bool, str] = True, -) -> None: - # load tokenizer if provided, this will be used to set the - # token_ids in the config file - token_ids = {} - if tokenizer_path: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - for key in [ - "bos_token_id", - "eos_token_id", - "pad_token_id", - ]: - id = getattr(tokenizer, key, None) - if id: - token_ids[key] = id - - # there are some configs unsettable by mamba_ssn config, so - # if there are changes from the defaults, have to pass them into - # the function - unsettables = { - "mamba_d_head": 64, - "mamba_d_state": 128, - "mamba_n_groups": 1, - "rms_norm_eps": 1e-5, - } - - # Load and save config based on name - config_path = path.join(mamba_ssm_checkpoint_path, "config.json") - with open(config_path, "r", encoding="utf-8") as json_file: - config = json.load(json_file) - - # convert the config - hf_config = convert_ssm_config_to_hf_config( - config_ssm=config, - **token_ids, - **unsettables, - ) - hf_config.save_pretrained(output_dir) - - # Load state dict of the original model and transfer to hf model - state_dict = torch.load( - path.join(mamba_ssm_checkpoint_path, "pytorch_model.bin"), - map_location="cpu", - weights_only=True, - ) - # FIXME: allow other parameters to pass in - state_dict = convert_state_dict_from_mamba_ssm(state_dict) - - # Save new model to pytorch_dump_path - dtype = torch.float32 if precision == "fp32" else (torch.bfloat16 if precision == "bf16" else torch.float16) - - save_file_fn = None - if isinstance(save_model, bool) and save_model: - save_file_fn = save_single_safetensor - elif isinstance(save_model, str) and save_model == "sharded": - save_file_fn = save_sharded_safetensors - - if save_file_fn: - save_file_fn({k: v.to(dtype) for k, v in state_dict.items()}, output_dir, metadata={"format": "pt"}) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", - "--mamba_ssm_checkpoint_directory", - type=str, - required=True, - help="Path to a directory containing the `pytorch_model.bin` mamba_ssm checkpoint file to be converted.", - ) - parser.add_argument( - "-p", - "--precision", - type=str, - default="fp16", - const="fp16", - required=True, - choices=("fp32", "fp16", "bf16"), - help="The precision the model will be saved in. Select from fp32, fp16 or bf16.", - ) - parser.add_argument( - "-o", "--output_dir", type=str, required=True, help="Path to directory to save the converted output model to." - ) - parser.add_argument( - "-t", - "--tokenizer_model_path", - type=str, - default=None, - required=False, - help="Path to a the tokenizer file.", - ) - args = parser.parse_args() - - convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( - args.mamba2_checkpoint_directory, - args.precision, - args.output_dir, - ) diff --git a/src/transformers/models/bark/convert_suno_to_hf.py b/src/transformers/models/bark/convert_suno_to_hf.py deleted file mode 100644 index 803656b623..0000000000 --- a/src/transformers/models/bark/convert_suno_to_hf.py +++ /dev/null @@ -1,263 +0,0 @@ -"""Convert Bark checkpoint.""" - -import argparse -import os -from pathlib import Path - -import torch -from bark.generation import _load_model as _bark_load_model -from huggingface_hub import hf_hub_download - -from transformers import EncodecConfig, EncodecModel, set_seed -from transformers.models.bark.configuration_bark import ( - BarkCoarseConfig, - BarkConfig, - BarkFineConfig, - BarkSemanticConfig, -) -from transformers.models.bark.generation_configuration_bark import ( - BarkCoarseGenerationConfig, - BarkFineGenerationConfig, - BarkGenerationConfig, - BarkSemanticGenerationConfig, -) -from transformers.models.bark.modeling_bark import BarkCoarseModel, BarkFineModel, BarkModel, BarkSemanticModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -set_seed(770) - - -new_layer_name_dict = { - "c_attn": "att_proj", - "c_proj": "out_proj", - "c_fc": "in_proj", - "transformer.": "", - "h.": "layers.", - "ln_1": "layernorm_1", - "ln_2": "layernorm_2", - "ln_f": "layernorm_final", - "wpe": "position_embeds_layer", - "wte": "input_embeds_layer", -} - - -REMOTE_MODEL_PATHS = { - "text_small": { - "repo_id": "suno/bark", - "file_name": "text.pt", - }, - "coarse_small": { - "repo_id": "suno/bark", - "file_name": "coarse.pt", - }, - "fine_small": { - "repo_id": "suno/bark", - "file_name": "fine.pt", - }, - "text": { - "repo_id": "suno/bark", - "file_name": "text_2.pt", - }, - "coarse": { - "repo_id": "suno/bark", - "file_name": "coarse_2.pt", - }, - "fine": { - "repo_id": "suno/bark", - "file_name": "fine_2.pt", - }, -} - -CUR_PATH = os.path.dirname(os.path.abspath(__file__)) -default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache") -CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0") - - -def _get_ckpt_path(model_type, use_small=False): - key = model_type - if use_small: - key += "_small" - return os.path.join(CACHE_DIR, REMOTE_MODEL_PATHS[key]["file_name"]) - - -def _download(from_hf_path, file_name): - os.makedirs(CACHE_DIR, exist_ok=True) - hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR) - - -def _load_model(ckpt_path, device, use_small=False, model_type="text"): - if model_type == "text": - ModelClass = BarkSemanticModel - ConfigClass = BarkSemanticConfig - GenerationConfigClass = BarkSemanticGenerationConfig - elif model_type == "coarse": - ModelClass = BarkCoarseModel - ConfigClass = BarkCoarseConfig - GenerationConfigClass = BarkCoarseGenerationConfig - elif model_type == "fine": - ModelClass = BarkFineModel - ConfigClass = BarkFineConfig - GenerationConfigClass = BarkFineGenerationConfig - else: - raise NotImplementedError() - model_key = f"{model_type}_small" if use_small else model_type - model_info = REMOTE_MODEL_PATHS[model_key] - if not os.path.exists(ckpt_path): - logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.") - _download(model_info["repo_id"], model_info["file_name"]) - checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True) - # this is a hack - model_args = checkpoint["model_args"] - if "input_vocab_size" not in model_args: - model_args["input_vocab_size"] = model_args["vocab_size"] - model_args["output_vocab_size"] = model_args["vocab_size"] - del model_args["vocab_size"] - - # convert Bark model arguments to HF Bark model arguments - model_args["num_heads"] = model_args.pop("n_head") - model_args["hidden_size"] = model_args.pop("n_embd") - model_args["num_layers"] = model_args.pop("n_layer") - - model_config = ConfigClass(**checkpoint["model_args"]) - model = ModelClass(config=model_config) - model_generation_config = GenerationConfigClass() - - model.generation_config = model_generation_config - state_dict = checkpoint["model"] - # fixup checkpoint - unwanted_prefix = "_orig_mod." - for k, v in list(state_dict.items()): - if k.startswith(unwanted_prefix): - # replace part of the key with corresponding layer name in HF implementation - new_k = k[len(unwanted_prefix) :] - for old_layer_name in new_layer_name_dict: - new_k = new_k.replace(old_layer_name, new_layer_name_dict[old_layer_name]) - - state_dict[new_k] = state_dict.pop(k) - - extra_keys = set(state_dict.keys()) - set(model.state_dict().keys()) - extra_keys = {k for k in extra_keys if not k.endswith(".attn.bias")} - missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) - missing_keys = {k for k in missing_keys if not k.endswith(".attn.bias")} - if len(extra_keys) != 0: - raise ValueError(f"extra keys found: {extra_keys}") - if len(missing_keys) != 0: - raise ValueError(f"missing keys: {missing_keys}") - model.load_state_dict(state_dict, strict=False) - n_params = model.num_parameters(exclude_embeddings=True) - val_loss = checkpoint["best_val_loss"].item() - logger.info(f"model loaded: {round(n_params / 1e6, 1)}M params, {round(val_loss, 3)} loss") - model.eval() - model.to(device) - del checkpoint, state_dict - - return model - - -def load_model(pytorch_dump_folder_path, use_small=False, model_type="text"): - if model_type not in ("text", "coarse", "fine"): - raise NotImplementedError() - - device = "cpu" # do conversion on cpu - - ckpt_path = _get_ckpt_path(model_type, use_small=use_small) - model = _load_model(ckpt_path, device, model_type=model_type, use_small=use_small) - - # load bark initial model - bark_model = _bark_load_model(ckpt_path, "cpu", model_type=model_type, use_small=use_small) - - if model_type == "text": - bark_model = bark_model["model"] - - if model.num_parameters(exclude_embeddings=True) != bark_model.get_num_params(): - raise ValueError("initial and new models don't have the same number of parameters") - - # check if same output as the bark model - batch_size = 5 - sequence_length = 10 - - if model_type in ["text", "coarse"]: - vec = torch.randint(256, (batch_size, sequence_length), dtype=torch.int) - output_old_model = bark_model(vec)[0] - - output_new_model_total = model(vec) - - # take last logits - output_new_model = output_new_model_total.logits[:, [-1], :] - - else: - prediction_codebook_channel = 3 - n_codes_total = 8 - vec = torch.randint(256, (batch_size, sequence_length, n_codes_total), dtype=torch.int) - - output_new_model_total = model(prediction_codebook_channel, vec) - output_old_model = bark_model(prediction_codebook_channel, vec) - - output_new_model = output_new_model_total.logits - - # output difference should come from the difference of self-attention implementation design - if output_new_model.shape != output_old_model.shape: - raise ValueError("initial and new outputs don't have the same shape") - if (output_new_model - output_old_model).abs().max().item() > 1e-3: - raise ValueError("initial and new outputs are not equal") - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - - -def load_whole_bark_model( - semantic_path, - coarse_path, - fine_path, - append_text, - hub_path, - folder_path, -): - pytorch_dump_folder_path = os.path.join(folder_path, append_text) - - semanticConfig = BarkSemanticConfig.from_pretrained(os.path.join(semantic_path, "config.json")) - coarseAcousticConfig = BarkCoarseConfig.from_pretrained(os.path.join(coarse_path, "config.json")) - fineAcousticConfig = BarkFineConfig.from_pretrained(os.path.join(fine_path, "config.json")) - codecConfig = EncodecConfig.from_pretrained("facebook/encodec_24khz") - - semantic = BarkSemanticModel.from_pretrained(semantic_path) - coarseAcoustic = BarkCoarseModel.from_pretrained(coarse_path) - fineAcoustic = BarkFineModel.from_pretrained(fine_path) - codec = EncodecModel.from_pretrained("facebook/encodec_24khz") - - bark_config = BarkConfig.from_sub_model_configs( - semanticConfig, coarseAcousticConfig, fineAcousticConfig, codecConfig - ) - - bark_generation_config = BarkGenerationConfig.from_sub_model_configs( - semantic.generation_config, coarseAcoustic.generation_config, fineAcoustic.generation_config - ) - - bark = BarkModel(bark_config) - - bark.semantic = semantic - bark.coarse_acoustics = coarseAcoustic - bark.fine_acoustics = fineAcoustic - bark.codec_model = codec - - bark.generation_config = bark_generation_config - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - bark.save_pretrained(pytorch_dump_folder_path, repo_id=hub_path, push_to_hub=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - - parser.add_argument("model_type", type=str, help="text, coarse or fine.") - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--is_small", action="store_true", help="convert the small version instead of the large.") - - args = parser.parse_args() - - load_model(args.pytorch_dump_folder_path, model_type=args.model_type, use_small=args.is_small) diff --git a/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 84dc415443..0000000000 --- a/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BART checkpoint.""" - -import argparse -import os -from pathlib import Path - -import fairseq -import torch -from packaging import version -from torch import nn - -from transformers import ( - BartConfig, - BartForConditionalGeneration, - BartForSequenceClassification, - BartModel, - BartTokenizer, -) -from transformers.utils import logging - - -FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"] -extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification} -if version.parse(fairseq.__version__) < version.parse("0.9.0"): - raise Exception("requires fairseq >= 0.9.0") - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = " Hello world! cĂ©cĂ© herlolip" - -mnli_rename_keys = [ - ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), - ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), - ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), - ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), -] - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "encoder.version", - "decoder.version", - "model.encoder.version", - "model.decoder.version", - "_float_tensor", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def load_xsum_checkpoint(checkpoint_path): - """Checkpoint path should end in model.pt""" - sd = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval() - hub_interface.model.load_state_dict(sd["model"]) - return hub_interface - - -def make_linear_from_emb(emb): - vocab_size, emb_size = emb.weight.shape - lin_layer = nn.Linear(vocab_size, emb_size, bias=False) - lin_layer.weight.data = emb.weight.data - return lin_layer - - -@torch.no_grad() -def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): - """ - Copy/paste/tweak model's weights to our BERT structure. - """ - if not os.path.exists(checkpoint_path): - bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() - else: - bart = load_xsum_checkpoint(checkpoint_path) - - bart.model.upgrade_state_dict(bart.model.state_dict()) - if hf_checkpoint_name is None: - hf_checkpoint_name = checkpoint_path.replace(".", "-") - config = BartConfig.from_pretrained(hf_checkpoint_name) - tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) - tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) - if not torch.eq(tokens, tokens2).all(): - raise ValueError( - f"converted tokenizer and pretrained tokenizer returned different output: {tokens} != {tokens2}" - ) - - if checkpoint_path == "bart.large.mnli": - state_dict = bart.state_dict() - remove_ignore_keys_(state_dict) - state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] - for src, dest in mnli_rename_keys: - rename_key(state_dict, src, dest) - model = BartForSequenceClassification(config).eval() - model.load_state_dict(state_dict) - fairseq_output = bart.predict("mnli", tokens, return_logits=True) - new_model_outputs = model(tokens)[0] # logits - else: # no classification heads to worry about - state_dict = bart.model.state_dict() - remove_ignore_keys_(state_dict) - state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] - fairseq_output = bart.extract_features(tokens) - if hf_checkpoint_name == "facebook/bart-large": - model = BartModel(config).eval() - model.load_state_dict(state_dict) - new_model_outputs = model(tokens).model[0] - else: - model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt - model.model.load_state_dict(state_dict) - if hasattr(model, "lm_head"): - model.lm_head = make_linear_from_emb(model.model.shared) - new_model_outputs = model.model(tokens)[0] - - # Check results - if fairseq_output.shape != new_model_outputs.shape: - raise ValueError( - f"`fairseq_output` shape and `new_model_output` shape are different: {fairseq_output.shape=}, {new_model_outputs.shape}" - ) - if (fairseq_output != new_model_outputs).any().item(): - raise ValueError("Some values in `fairseq_output` are different from `new_model_outputs`") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." - ) - parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum" - ) - args = parser.parse_args() - convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config) diff --git a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py deleted file mode 100644 index 46c72a97f4..0000000000 --- a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py +++ /dev/null @@ -1,373 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BEiT checkpoints from the unilm repository.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from datasets import load_dataset -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ( - BeitConfig, - BeitForImageClassification, - BeitForMaskedImageModeling, - BeitForSemanticSegmentation, - BeitImageProcessor, -) -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, has_lm_head=False, is_semantic=False): - prefix = "backbone." if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", "beit.embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"), - ] - ) - - if has_lm_head: - # mask token + shared relative position bias + layernorm - rename_keys.extend( - [ - ("mask_token", "beit.embeddings.mask_token"), - ( - "rel_pos_bias.relative_position_bias_table", - "beit.encoder.relative_position_bias.relative_position_bias_table", - ), - ( - "rel_pos_bias.relative_position_index", - "beit.encoder.relative_position_bias.relative_position_index", - ), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - elif is_semantic: - # semantic segmentation classification heads - rename_keys.extend( - [ - ("decode_head.conv_seg.weight", "decode_head.classifier.weight"), - ("decode_head.conv_seg.bias", "decode_head.classifier.bias"), - ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"), - ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", "beit.pooler.layernorm.weight"), - ("fc_norm.bias", "beit.pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False): - for i in range(config.num_hidden_layers): - prefix = "backbone." if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2 - - # relative_position bias table + index - if not has_lm_head: - # each layer has its own relative position bias - table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table") - index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index") - - state_dict[ - f"beit.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table" - ] = table - state_dict[ - f"beit.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index" - ] = index - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our BEiT structure. - """ - - # define default BEiT configuration - config = BeitConfig() - has_lm_head = False - is_semantic = False - repo_id = "huggingface/label-files" - # set config parameters based on URL - if checkpoint_url[-9:-4] == "pt22k": - # masked image modeling - config.use_shared_relative_position_bias = True - config.use_mask_token = True - has_lm_head = True - elif checkpoint_url[-9:-4] == "ft22k": - # intermediate fine-tuning on ImageNet-22k - config.use_relative_position_bias = True - config.num_labels = 21841 - filename = "imagenet-22k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - # this dataset contains 21843 labels but the model only has 21841 - # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18 - del id2label[9205] - del id2label[15027] - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - elif checkpoint_url[-8:-4] == "to1k": - # fine-tuning on ImageNet-1k - config.use_relative_position_bias = True - config.num_labels = 1000 - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - if "384" in checkpoint_url: - config.image_size = 384 - if "512" in checkpoint_url: - config.image_size = 512 - elif "ade20k" in checkpoint_url: - # fine-tuning - config.use_relative_position_bias = True - config.num_labels = 150 - filename = "ade20k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.image_size = 640 - is_semantic = True - else: - raise ValueError("Checkpoint not supported, URL should either end with 'pt22k', 'ft22k', 'to1k' or 'ade20k'") - - # size of the architecture - if "base" in checkpoint_url: - pass - elif "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - if "ade20k" in checkpoint_url: - config.image_size = 640 - config.out_indices = [7, 11, 15, 23] - else: - raise ValueError("Should either find 'base' or 'large' in checkpoint URL") - - # load state_dict of original model, remove and rename some keys - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True) - state_dict = state_dict["model"] if "ade20k" not in checkpoint_url else state_dict["state_dict"] - - rename_keys = create_rename_keys(config, has_lm_head=has_lm_head, is_semantic=is_semantic) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head, is_semantic=is_semantic) - if is_semantic: - # add prefix to decoder keys - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("backbone.fpn"): - key = key.replace("backbone.fpn", "fpn") - state_dict[key] = val - - # load HuggingFace model - if checkpoint_url[-9:-4] == "pt22k": - model = BeitForMaskedImageModeling(config) - elif "ade20k" in checkpoint_url: - model = BeitForSemanticSegmentation(config) - else: - model = BeitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # Check outputs on an image - if is_semantic: - image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False) - ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) - image = Image.open(ds[0]["file"]) - else: - image_processor = BeitImageProcessor( - size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False - ) - image = prepare_img() - - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - outputs = model(pixel_values) - logits = outputs.logits - - # verify logits - expected_shape = torch.Size([1, 1000]) - if checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k"): - expected_shape = torch.Size([1, 196, 8192]) - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k"): - expected_shape = torch.Size([1, 196, 8192]) - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22k"): - expected_shape = torch.Size([1, 21841]) - expected_logits = torch.tensor([2.2288, 2.4671, 0.7395]) - expected_class_idx = 2397 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22k"): - expected_shape = torch.Size([1, 21841]) - expected_logits = torch.tensor([1.6881, -0.2787, 0.5901]) - expected_class_idx = 2396 - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft1k"): - expected_logits = torch.tensor([0.1241, 0.0798, -0.6569]) - expected_class_idx = 285 - elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-1.2385, -1.0987, -1.0108]) - expected_class_idx = 281 - elif checkpoint_url[:-4].endswith("beit_base_patch16_384_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-1.5303, -0.9484, -0.3147]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft1k"): - expected_logits = torch.tensor([0.4610, -0.0928, 0.2086]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-0.4804, 0.6257, -0.1837]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_384_pt22k_ft22kto1k"): - expected_logits = torch.tensor([[-0.5122, 0.5117, -0.2113]]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_large_patch16_512_pt22k_ft22kto1k"): - expected_logits = torch.tensor([-0.3062, 0.7261, 0.4852]) - expected_class_idx = 761 - elif checkpoint_url[:-4].endswith("beit_base_patch16_640_pt22k_ft22ktoade20k"): - expected_shape = (1, 150, 160, 160) - expected_logits = torch.tensor( - [ - [[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561], [-2.9549, -1.3228, -2.1347]], - [[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277], [-3.8356, -2.4643, -3.3535]], - [[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035], [3.2413, 4.7813, 4.9969]], - ] - ) - elif checkpoint_url[:-4].endswith("beit_large_patch16_640_pt22k_ft22ktoade20k"): - expected_shape = (1, 150, 160, 160) - expected_logits = torch.tensor( - [ - [[-4.3305, -2.3049, -3.0161], [-2.9591, -1.5305, -2.2251], [-3.4198, -1.8004, -2.9062]], - [[-5.8922, -3.7435, -4.3978], [-4.2063, -2.7872, -3.4755], [-4.2791, -3.1874, -4.1681]], - [[0.9895, 4.3467, 4.7663], [4.2476, 5.6830, 6.1518], [4.5550, 6.2495, 6.5154]], - ] - ) - else: - raise ValueError("Can't verify logits as model is not supported") - - if logits.shape != expected_shape: - raise ValueError(f"Shape of logits not as expected. {logits.shape=}, {expected_shape=}") - if not has_lm_head: - if is_semantic: - if not torch.allclose(logits[0, :3, :3, :3], expected_logits, atol=1e-3): - raise ValueError("First elements of logits not as expected") - else: - print("Predicted class idx:", logits.argmax(-1).item()) - - if not torch.allclose(logits[0, :3], expected_logits, atol=1e-3): - raise ValueError("First elements of logits not as expected") - if logits.argmax(-1).item() != expected_class_idx: - raise ValueError("Predicted class index not as expected") - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_url", - default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_beit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py deleted file mode 100644 index 9dfd8da474..0000000000 --- a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script can be used to convert a head-less TF2.x Bert model to PyTorch, as published on the official (now -deprecated) GitHub: https://github.com/tensorflow/models/tree/v2.3.0/official/nlp/bert - -TF2.x uses different variable names from the original BERT (TF 1.4) implementation. The script re-maps the TF2.x Bert -weight names to the original names, so the model can be imported with Huggingface/transformer. - -You may adapt this script to include classification/MLM/NSP/etc. heads. - -Note: This script is only working with an older version of the TensorFlow models repository (<= v2.3.0). - Models trained with never versions are not compatible with this script. -""" - -import argparse -import os -import re - -import tensorflow as tf -import torch - -from transformers import BertConfig, BertModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def load_tf2_weights_in_bert(model, tf_checkpoint_path, config): - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info(f"Converting TensorFlow checkpoint from {tf_path}") - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - layer_depth = [] - for full_name, shape in init_vars: - # logger.info(f"Loading TF weight {name} with shape {shape}") - name = full_name.split("/") - if full_name == "_CHECKPOINTABLE_OBJECT_GRAPH" or name[0] in ["global_step", "save_counter"]: - logger.info(f"Skipping non-model layer {full_name}") - continue - if "optimizer" in full_name: - logger.info(f"Skipping optimization layer {full_name}") - continue - if name[0] == "model": - # ignore initial 'model' - name = name[1:] - # figure out how many levels deep the name is - depth = 0 - for _name in name: - if _name.startswith("layer_with_weights"): - depth += 1 - else: - break - layer_depth.append(depth) - # read data - array = tf.train.load_variable(tf_path, full_name) - names.append("/".join(name)) - arrays.append(array) - logger.info(f"Read a total of {len(arrays):,} layers") - - # Sanity check - if len(set(layer_depth)) != 1: - raise ValueError(f"Found layer names with different depths (layer depth {list(set(layer_depth))})") - layer_depth = list(set(layer_depth))[0] - if layer_depth != 1: - raise ValueError( - "The model contains more than just the embedding/encoder layers. This script does not handle MLM/NSP" - " heads." - ) - - # convert layers - logger.info("Converting weights...") - for full_name, array in zip(names, arrays): - name = full_name.split("/") - pointer = model - trace = [] - for i, m_name in enumerate(name): - if m_name == ".ATTRIBUTES": - # variable names end with .ATTRIBUTES/VARIABLE_VALUE - break - if m_name.startswith("layer_with_weights"): - layer_num = int(m_name.split("-")[-1]) - if layer_num <= 2: - # embedding layers - # layer_num 0: word_embeddings - # layer_num 1: position_embeddings - # layer_num 2: token_type_embeddings - continue - elif layer_num == 3: - # embedding LayerNorm - trace.extend(["embeddings", "LayerNorm"]) - pointer = getattr(pointer, "embeddings") - pointer = getattr(pointer, "LayerNorm") - elif layer_num > 3 and layer_num < config.num_hidden_layers + 4: - # encoder layers - trace.extend(["encoder", "layer", str(layer_num - 4)]) - pointer = getattr(pointer, "encoder") - pointer = getattr(pointer, "layer") - pointer = pointer[layer_num - 4] - elif layer_num == config.num_hidden_layers + 4: - # pooler layer - trace.extend(["pooler", "dense"]) - pointer = getattr(pointer, "pooler") - pointer = getattr(pointer, "dense") - elif m_name == "embeddings": - trace.append("embeddings") - pointer = getattr(pointer, "embeddings") - if layer_num == 0: - trace.append("word_embeddings") - pointer = getattr(pointer, "word_embeddings") - elif layer_num == 1: - trace.append("position_embeddings") - pointer = getattr(pointer, "position_embeddings") - elif layer_num == 2: - trace.append("token_type_embeddings") - pointer = getattr(pointer, "token_type_embeddings") - else: - raise ValueError(f"Unknown embedding layer with name {full_name}") - trace.append("weight") - pointer = getattr(pointer, "weight") - elif m_name == "_attention_layer": - # self-attention layer - trace.extend(["attention", "self"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "self") - elif m_name == "_attention_layer_norm": - # output attention norm - trace.extend(["attention", "output", "LayerNorm"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "LayerNorm") - elif m_name == "_attention_output_dense": - # output attention dense - trace.extend(["attention", "output", "dense"]) - pointer = getattr(pointer, "attention") - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "dense") - elif m_name == "_output_dense": - # output dense - trace.extend(["output", "dense"]) - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "dense") - elif m_name == "_output_layer_norm": - # output dense - trace.extend(["output", "LayerNorm"]) - pointer = getattr(pointer, "output") - pointer = getattr(pointer, "LayerNorm") - elif m_name == "_key_dense": - # attention key - trace.append("key") - pointer = getattr(pointer, "key") - elif m_name == "_query_dense": - # attention query - trace.append("query") - pointer = getattr(pointer, "query") - elif m_name == "_value_dense": - # attention value - trace.append("value") - pointer = getattr(pointer, "value") - elif m_name == "_intermediate_dense": - # attention intermediate dense - trace.extend(["intermediate", "dense"]) - pointer = getattr(pointer, "intermediate") - pointer = getattr(pointer, "dense") - elif m_name == "_output_layer_norm": - # output layer norm - trace.append("output") - pointer = getattr(pointer, "output") - # weights & biases - elif m_name in ["bias", "beta"]: - trace.append("bias") - pointer = getattr(pointer, "bias") - elif m_name in ["kernel", "gamma"]: - trace.append("weight") - pointer = getattr(pointer, "weight") - else: - logger.warning(f"Ignored {m_name}") - # for certain layers reshape is necessary - trace = ".".join(trace) - if re.match(r"(\S+)\.attention\.self\.(key|value|query)\.(bias|weight)", trace) or re.match( - r"(\S+)\.attention\.output\.dense\.weight", trace - ): - array = array.reshape(pointer.data.shape) - if "kernel" in full_name: - array = array.transpose() - if pointer.shape == array.shape: - pointer.data = torch.from_numpy(array) - else: - raise ValueError( - f"Shape mismatch in layer {full_name}: Model expects shape {pointer.shape} but layer contains shape:" - f" {array.shape}" - ) - logger.info(f"Successfully set variable {full_name} to PyTorch layer {trace}") - return model - - -def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, config_path, pytorch_dump_path): - # Instantiate model - logger.info(f"Loading model based on config from {config_path}...") - config = BertConfig.from_json_file(config_path) - model = BertModel(config) - - # Load weights from checkpoint - logger.info(f"Loading weights from checkpoint {tf_checkpoint_path}...") - load_tf2_weights_in_bert(model, tf_checkpoint_path, config) - - # Save pytorch-model - logger.info(f"Saving PyTorch model to {pytorch_dump_path}...") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow 2.x checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - type=str, - required=True, - help="The config json file corresponding to the BERT model. This specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", - type=str, - required=True, - help="Path to the output PyTorch model (must include filename).", - ) - args = parser.parse_args() - convert_tf2_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index be904ddd7e..0000000000 --- a/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BERT checkpoint.""" - -import argparse - -import torch - -from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): - # Initialise PyTorch model - config = BertConfig.from_json_file(bert_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = BertForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_bert(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py deleted file mode 100644 index 8e1e85d5c0..0000000000 --- a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py +++ /dev/null @@ -1,112 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" - -import argparse -import os - -import numpy as np -import tensorflow as tf -import torch - -from transformers import BertModel - - -def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): - """ - Args: - model: BertModel Pytorch model instance to be converted - ckpt_dir: Tensorflow model directory - model_name: model name - - Currently supported HF models: - - - Y BertModel - - N BertForMaskedLM - - N BertForPreTraining - - N BertForMultipleChoice - - N BertForNextSentencePrediction - - N BertForSequenceClassification - - N BertForQuestionAnswering - """ - - tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") - - var_map = ( - ("layer.", "layer_"), - ("word_embeddings.weight", "word_embeddings"), - ("position_embeddings.weight", "position_embeddings"), - ("token_type_embeddings.weight", "token_type_embeddings"), - (".", "/"), - ("LayerNorm/weight", "LayerNorm/gamma"), - ("LayerNorm/bias", "LayerNorm/beta"), - ("weight", "kernel"), - ) - - if not os.path.isdir(ckpt_dir): - os.makedirs(ckpt_dir) - - state_dict = model.state_dict() - - def to_tf_var_name(name: str): - for patt, repl in iter(var_map): - name = name.replace(patt, repl) - return f"bert/{name}" - - def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): - tf_dtype = tf.dtypes.as_dtype(tensor.dtype) - tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) - session.run(tf.variables_initializer([tf_var])) - session.run(tf_var) - return tf_var - - tf.reset_default_graph() - with tf.Session() as session: - for var_name in state_dict: - tf_name = to_tf_var_name(var_name) - torch_tensor = state_dict[var_name].numpy() - if any(x in var_name for x in tensors_to_transpose): - torch_tensor = torch_tensor.T - tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) - tf_var.assign(tf.cast(torch_tensor, tf_var.dtype)) - tf_weight = session.run(tf_var) - print(f"Successfully created {tf_name}: {np.allclose(tf_weight, torch_tensor)}") - - saver = tf.train.Saver(tf.trainable_variables()) - saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) - - -def main(raw_args=None): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, required=True, help="model name e.g. google-bert/bert-base-uncased") - parser.add_argument( - "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" - ) - parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") - parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") - args = parser.parse_args(raw_args) - - model = BertModel.from_pretrained( - pretrained_model_name_or_path=args.model_name, - state_dict=torch.load(args.pytorch_model_path, weights_only=True), - cache_dir=args.cache_dir, - ) - - convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py deleted file mode 100644 index a7832a53d5..0000000000 --- a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -This script converts a lm-head checkpoint from the "Token Dropping" implementation into a PyTorch-compatible BERT -model. The official implementation of "Token Dropping" can be found in the TensorFlow Models repository: - -https://github.com/tensorflow/models/tree/master/official/projects/token_dropping -""" - -import argparse - -import tensorflow as tf -import torch - -from transformers import BertConfig, BertForMaskedLM -from transformers.models.bert.modeling_bert import ( - BertIntermediate, - BertLayer, - BertOutput, - BertPooler, - BertSelfAttention, - BertSelfOutput, -) -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pytorch_dump_path: str): - def get_masked_lm_array(name: str): - full_name = f"masked_lm/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_array(name: str): - full_name = f"encoder/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_layer_array(layer_index: int, name: str): - full_name = f"encoder/_transformer_layers/{layer_index}/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - def get_encoder_attention_layer_array(layer_index: int, name: str, original_shape): - full_name = f"encoder/_transformer_layers/{layer_index}/_attention_layer/{name}/.ATTRIBUTES/VARIABLE_VALUE" - array = tf.train.load_variable(tf_checkpoint_path, full_name) - array = array.reshape(original_shape) - - if "kernel" in name: - array = array.transpose() - - return torch.from_numpy(array) - - print(f"Loading model based on config from {config_path}...") - config = BertConfig.from_json_file(config_path) - model = BertForMaskedLM(config) - - # Layers - for layer_index in range(0, config.num_hidden_layers): - layer: BertLayer = model.bert.encoder.layer[layer_index] - - # Self-attention - self_attn: BertSelfAttention = layer.attention.self - - self_attn.query.weight.data = get_encoder_attention_layer_array( - layer_index, "_query_dense/kernel", self_attn.query.weight.data.shape - ) - self_attn.query.bias.data = get_encoder_attention_layer_array( - layer_index, "_query_dense/bias", self_attn.query.bias.data.shape - ) - self_attn.key.weight.data = get_encoder_attention_layer_array( - layer_index, "_key_dense/kernel", self_attn.key.weight.data.shape - ) - self_attn.key.bias.data = get_encoder_attention_layer_array( - layer_index, "_key_dense/bias", self_attn.key.bias.data.shape - ) - self_attn.value.weight.data = get_encoder_attention_layer_array( - layer_index, "_value_dense/kernel", self_attn.value.weight.data.shape - ) - self_attn.value.bias.data = get_encoder_attention_layer_array( - layer_index, "_value_dense/bias", self_attn.value.bias.data.shape - ) - - # Self-attention Output - self_output: BertSelfOutput = layer.attention.output - - self_output.dense.weight.data = get_encoder_attention_layer_array( - layer_index, "_output_dense/kernel", self_output.dense.weight.data.shape - ) - self_output.dense.bias.data = get_encoder_attention_layer_array( - layer_index, "_output_dense/bias", self_output.dense.bias.data.shape - ) - - self_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_attention_layer_norm/gamma") - self_output.LayerNorm.bias.data = get_encoder_layer_array(layer_index, "_attention_layer_norm/beta") - - # Intermediate - intermediate: BertIntermediate = layer.intermediate - - intermediate.dense.weight.data = get_encoder_layer_array(layer_index, "_intermediate_dense/kernel") - intermediate.dense.bias.data = get_encoder_layer_array(layer_index, "_intermediate_dense/bias") - - # Output - bert_output: BertOutput = layer.output - - bert_output.dense.weight.data = get_encoder_layer_array(layer_index, "_output_dense/kernel") - bert_output.dense.bias.data = get_encoder_layer_array(layer_index, "_output_dense/bias") - - bert_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_output_layer_norm/gamma") - bert_output.LayerNorm.bias.data = get_encoder_layer_array(layer_index, "_output_layer_norm/beta") - - # Embeddings - model.bert.embeddings.position_embeddings.weight.data = get_encoder_array("_position_embedding_layer/embeddings") - model.bert.embeddings.token_type_embeddings.weight.data = get_encoder_array("_type_embedding_layer/embeddings") - model.bert.embeddings.LayerNorm.weight.data = get_encoder_array("_embedding_norm_layer/gamma") - model.bert.embeddings.LayerNorm.bias.data = get_encoder_array("_embedding_norm_layer/beta") - - # LM Head - lm_head = model.cls.predictions.transform - - lm_head.dense.weight.data = get_masked_lm_array("dense/kernel") - lm_head.dense.bias.data = get_masked_lm_array("dense/bias") - - lm_head.LayerNorm.weight.data = get_masked_lm_array("layer_norm/gamma") - lm_head.LayerNorm.bias.data = get_masked_lm_array("layer_norm/beta") - - model.bert.embeddings.word_embeddings.weight.data = get_masked_lm_array("embedding_table") - - # Pooling - model.bert.pooler = BertPooler(config=config) - model.bert.pooler.dense.weight.data: BertPooler = get_encoder_array("_pooler_layer/kernel") - model.bert.pooler.dense.bias.data: BertPooler = get_encoder_array("_pooler_layer/bias") - - # Export final model - model.save_pretrained(pytorch_dump_path) - - # Integration test - should load without any errors ;) - new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path) - print(new_model.eval()) - - print("Model conversion was done successfully!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow Token Dropping checkpoint path." - ) - parser.add_argument( - "--bert_config_file", - type=str, - required=True, - help="The config json file corresponding to the BERT model. This specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", - type=str, - required=True, - help="Path to the output PyTorch model.", - ) - args = parser.parse_args() - convert_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 0b8e6590f9..0000000000 --- a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,69 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BigBird checkpoint.""" - -import argparse - -from transformers import BigBirdConfig, BigBirdForPreTraining, BigBirdForQuestionAnswering, load_tf_weights_in_big_bird -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, big_bird_config_file, pytorch_dump_path, is_trivia_qa): - # Initialise PyTorch model - config = BigBirdConfig.from_json_file(big_bird_config_file) - print(f"Building PyTorch model from configuration: {config}") - - if is_trivia_qa: - model = BigBirdForQuestionAnswering(config) - else: - model = BigBirdForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=is_trivia_qa) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--big_bird_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--is_trivia_qa", action="store_true", help="Whether to convert a model with a trivia_qa head." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.big_bird_config_file, args.pytorch_dump_path, args.is_trivia_qa - ) diff --git a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py b/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py deleted file mode 100644 index e17369e480..0000000000 --- a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py +++ /dev/null @@ -1,170 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -from typing import Dict - -import tensorflow as tf -import torch -from tqdm import tqdm - -from transformers import BigBirdPegasusConfig, BigBirdPegasusForConditionalGeneration - - -INIT_COMMON = [ - # tf -> hf - ("/", "."), - ("layer_", "layers."), - ("kernel", "weight"), - ("beta", "bias"), - ("gamma", "weight"), - ("pegasus", "model"), -] -END_COMMON = [ - (".output.dense", ".fc2"), - ("intermediate.LayerNorm", "final_layer_norm"), - ("intermediate.dense", "fc1"), -] - -DECODER_PATTERNS = ( - INIT_COMMON - + [ - ("attention.self.LayerNorm", "self_attn_layer_norm"), - ("attention.output.dense", "self_attn.out_proj"), - ("attention.self", "self_attn"), - ("attention.encdec.LayerNorm", "encoder_attn_layer_norm"), - ("attention.encdec_output.dense", "encoder_attn.out_proj"), - ("attention.encdec", "encoder_attn"), - ("key", "k_proj"), - ("value", "v_proj"), - ("query", "q_proj"), - ("decoder.LayerNorm", "decoder.layernorm_embedding"), - ] - + END_COMMON -) - -REMAINING_PATTERNS = ( - INIT_COMMON - + [ - ("embeddings.word_embeddings", "shared.weight"), - ("embeddings.position_embeddings", "embed_positions.weight"), - ("attention.self.LayerNorm", "self_attn_layer_norm"), - ("attention.output.dense", "self_attn.output"), - ("attention.self", "self_attn.self"), - ("encoder.LayerNorm", "encoder.layernorm_embedding"), - ] - + END_COMMON -) - -KEYS_TO_IGNORE = [ - "encdec/key/bias", - "encdec/query/bias", - "encdec/value/bias", - "self/key/bias", - "self/query/bias", - "self/value/bias", - "encdec_output/dense/bias", - "attention/output/dense/bias", -] - - -def rename_state_dict_key(k, patterns): - for tf_name, hf_name in patterns: - k = k.replace(tf_name, hf_name) - return k - - -def convert_bigbird_pegasus(tf_weights: dict, config_update: dict) -> BigBirdPegasusForConditionalGeneration: - cfg = BigBirdPegasusConfig(**config_update) - torch_model = BigBirdPegasusForConditionalGeneration(cfg) - state_dict = torch_model.state_dict() - mapping = {} - - # separating decoder weights - decoder_weights = {k: tf_weights[k] for k in tf_weights if k.startswith("pegasus/decoder")} - remaining_weights = {k: tf_weights[k] for k in tf_weights if not k.startswith("pegasus/decoder")} - - for k, v in tqdm(decoder_weights.items(), "tf -> hf conversion"): - conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE] - if any(conditions): - continue - patterns = DECODER_PATTERNS - new_k = rename_state_dict_key(k, patterns) - if new_k not in state_dict: - raise ValueError(f"could not find new key {new_k} in state dict. (converted from {k})") - if any(True if i in k else False for i in ["dense", "query", "key", "value"]): - v = v.T - mapping[new_k] = torch.from_numpy(v) - assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}" - - for k, v in tqdm(remaining_weights.items(), "tf -> hf conversion"): - conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE] - if any(conditions): - continue - patterns = REMAINING_PATTERNS - new_k = rename_state_dict_key(k, patterns) - if new_k not in state_dict and k != "pegasus/embeddings/position_embeddings": - raise ValueError(f"could not find new key {new_k} in state dict. (converted from {k})") - if any(True if i in k else False for i in ["dense", "query", "key", "value"]): - v = v.T - mapping[new_k] = torch.from_numpy(v) - if k != "pegasus/embeddings/position_embeddings": - assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}" - - mapping["model.encoder.embed_positions.weight"] = mapping["model.embed_positions.weight"] - mapping["model.decoder.embed_positions.weight"] = mapping.pop("model.embed_positions.weight") - missing, extra = torch_model.load_state_dict(mapping, strict=False) - unexpected_missing = [ - k - for k in missing - if k - not in [ - "final_logits_bias", - "model.encoder.embed_tokens.weight", - "model.decoder.embed_tokens.weight", - "lm_head.weight", - ] - ] - assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}" - assert extra == [], f"no matches found for the following tf keys {extra}" - return torch_model - - -def get_tf_weights_as_numpy(path) -> Dict: - init_vars = tf.train.list_variables(path) - tf_weights = {} - ignore_name = ["global_step"] - for name, shape in tqdm(init_vars, desc="converting tf checkpoint to dict"): - skip_key = any(pat in name for pat in ignore_name) - if skip_key: - continue - array = tf.train.load_variable(path, name) - tf_weights[name] = array - return tf_weights - - -def convert_bigbird_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str, config_update: dict): - tf_weights = get_tf_weights_as_numpy(ckpt_path) - torch_model = convert_bigbird_pegasus(tf_weights, config_update) - torch_model.save_pretrained(save_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--tf_ckpt_path", type=str, help="passed to tf.train.list_variables") - parser.add_argument("--save_dir", default=None, type=str, help="Path to the output PyTorch model.") - args = parser.parse_args() - config_update = {} - convert_bigbird_pegasus_ckpt_to_pytorch(args.tf_ckpt_path, args.save_dir, config_update=config_update) diff --git a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index c390d2e39f..0000000000 --- a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,292 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import argparse -import json -import os -import re -import shutil - -import torch - -from transformers import BioGptConfig, BioGptForCausalLM -from transformers.models.biogpt.tokenization_biogpt import VOCAB_FILES_NAMES -from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE -from transformers.utils import WEIGHTS_NAME, logging - - -logging.set_verbosity_warning() - -json_indent = 2 - - -# modified from https://github.com/facebookresearch/fairseq/blob/dd74992d0d143155998e9ed4076826bcea80fb06/fairseq/data/dictionary.py#L18 -class Dictionary: - """A mapping from symbols to consecutive integers""" - - def __init__( - self, - *, # begin keyword-only arguments - bos="", - pad="", - eos="", - unk="", - extra_special_symbols=None, - ): - self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos - self.symbols = [] - self.count = [] - self.indices = {} - self.bos_index = self.add_symbol(bos) - self.pad_index = self.add_symbol(pad) - self.eos_index = self.add_symbol(eos) - self.unk_index = self.add_symbol(unk) - if extra_special_symbols: - for s in extra_special_symbols: - self.add_symbol(s) - self.nspecial = len(self.symbols) - - def __eq__(self, other): - return self.indices == other.indices - - def __getitem__(self, idx): - if idx < len(self.symbols): - return self.symbols[idx] - return self.unk_word - - def __len__(self): - """Returns the number of symbols in the dictionary""" - return len(self.symbols) - - def __contains__(self, sym): - return sym in self.indices - - @classmethod - def load(cls, f): - """Loads the dictionary from a text file with the format: - - ``` - - - ... - ``` - """ - d = cls() - d.add_from_file(f) - return d - - def add_symbol(self, word, n=1, overwrite=False): - """Adds a word to the dictionary""" - if word in self.indices and not overwrite: - idx = self.indices[word] - self.count[idx] = self.count[idx] + n - return idx - else: - idx = len(self.symbols) - self.indices[word] = idx - self.symbols.append(word) - self.count.append(n) - return idx - - def _load_meta(self, lines): - return 0 - - def add_from_file(self, f): - """ - Loads a pre-existing dictionary from a text file and adds its symbols to this instance. - """ - if isinstance(f, str): - try: - with open(f, "r", encoding="utf-8") as fd: - self.add_from_file(fd) - except FileNotFoundError as fnfe: - raise fnfe - except UnicodeError: - raise Exception("Incorrect encoding detected in {}, please rebuild the dataset".format(f)) - return - - lines = f.readlines() - indices_start_line = self._load_meta(lines) - - for line in lines[indices_start_line:]: - try: - line, field = line.rstrip().rsplit(" ", 1) - if field == "#fairseq:overwrite": - overwrite = True - line, field = line.rsplit(" ", 1) - else: - overwrite = False - count = int(field) - word = line - if word in self and not overwrite: - raise RuntimeError( - "Duplicate word found when loading Dictionary: '{}'. " - "Duplicate words can overwrite earlier ones by adding the " - "#fairseq:overwrite flag at the end of the corresponding row " - "in the dictionary file. If using the Camembert model, please " - "download an updated copy of the model file.".format(word) - ) - self.add_symbol(word, n=count, overwrite=overwrite) - except ValueError: - raise ValueError("Incorrect dictionary format, expected ' [flags]'") - - -def rewrite_dict_keys(d): - # (1) remove word breaking symbol, (2) add word ending symbol where the word is not broken up, - # e.g.: d = {'le@@': 5, 'tt@@': 6, 'er': 7} => {'le': 5, 'tt': 6, 'er': 7} - d2 = dict((re.sub(r"@@$", "", k), v) if k.endswith("@@") else (re.sub(r"$", "", k), v) for k, v in d.items()) - keep_keys = " ".split() - # restore the special tokens - for k in keep_keys: - del d2[f"{k}"] - d2[k] = d[k] # restore - return d2 - - -def convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path, pytorch_dump_folder_path): - # prep - if not os.path.exists(biogpt_checkpoint_path): - raise ValueError(f"path {biogpt_checkpoint_path} does not exist!") - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - print(f"Writing results to {pytorch_dump_folder_path}") - - # handle various types of models - - checkpoint_file = os.path.join(biogpt_checkpoint_path, "checkpoint.pt") - if not os.path.isfile(checkpoint_file): - raise ValueError(f"path to the file {checkpoint_file} does not exist!") - chkpt = torch.load(checkpoint_file, map_location="cpu", weights_only=True) - - args = chkpt["cfg"]["model"] - - # dicts - dict_file = os.path.join(biogpt_checkpoint_path, "dict.txt") - if not os.path.isfile(dict_file): - raise ValueError(f"path to the file {dict_file} does not exist!") - src_dict = Dictionary.load(dict_file) - src_vocab = rewrite_dict_keys(src_dict.indices) - src_vocab_size = len(src_vocab) - src_vocab_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["vocab_file"]) - print(f"Generating {src_vocab_file} of {src_vocab_size} records") - with open(src_vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent)) - - # merges_file (bpecodes) - bpecodes_file = os.path.join(biogpt_checkpoint_path, "bpecodes") - if not os.path.isfile(bpecodes_file): - raise ValueError(f"path to the file {bpecodes_file} does not exist!") - - merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"]) - shutil.copyfile(bpecodes_file, merges_file) - - # model config - biogpt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json") - - model_conf = { - "activation_dropout": args["activation_dropout"], - "architectures": ["BioGptForCausalLM"], - "attention_probs_dropout_prob": args["attention_dropout"], - "bos_token_id": 0, - "eos_token_id": 2, - "hidden_act": args["activation_fn"], - "hidden_dropout_prob": args["dropout"], - "hidden_size": args["decoder_embed_dim"], - "initializer_range": 0.02, - "intermediate_size": args["decoder_ffn_embed_dim"], - "layer_norm_eps": 1e-12, - "layerdrop": args["decoder_layerdrop"], - "max_position_embeddings": args["max_target_positions"], - "model_type": "biogpt", - "num_attention_heads": args["decoder_attention_heads"], - "num_hidden_layers": args["decoder_layers"], - "pad_token_id": 1, - "scale_embedding": not args["no_scale_embedding"], - "tie_word_embeddings": args["share_decoder_input_output_embed"], - "vocab_size": src_vocab_size, - } - - # good hparam defaults to start with - - print(f"Generating {biogpt_model_config_file}") - with open(biogpt_model_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent)) - - # tokenizer config - biogpt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE) - - tokenizer_conf = { - "bos_token": "", - "eos_token": "", - "model_max_length": 1024, - "pad_token": "", - "special_tokens_map_file": None, - "tokenizer_class": "BioGptTokenizer", - "unk_token": "", - } - - print(f"Generating {biogpt_tokenizer_config_file}") - with open(biogpt_tokenizer_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent)) - - # model - model_state_dict = chkpt["model"] - - # remove unneeded keys - ignore_keys = [ - "decoder.version", - ] - for k in ignore_keys: - model_state_dict.pop(k, None) - - layer_names = list(model_state_dict.keys()) - for layer_name in layer_names: - if layer_name.endswith("output_projection.weight"): - model_state_dict[layer_name.replace("decoder.", "")] = model_state_dict.pop(layer_name) - else: - model_state_dict[layer_name.replace("decoder", "biogpt")] = model_state_dict.pop(layer_name) - - config = BioGptConfig.from_pretrained(pytorch_dump_folder_path) - model_new = BioGptForCausalLM(config) - - # check that it loads ok - model_new.load_state_dict(model_state_dict) - - # save - pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) - print(f"Generating {pytorch_weights_dump_path}") - torch.save(model_state_dict, pytorch_weights_dump_path) - - print("Conversion is done!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--biogpt_checkpoint_path", - default=None, - type=str, - required=True, - help=( - "Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts," - " bpecodes, etc." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_biogpt_checkpoint_to_pytorch(args.biogpt_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py deleted file mode 100644 index abc24290ab..0000000000 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ /dev/null @@ -1,177 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BiT checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm import create_model -from timm.data import resolve_data_config -from timm.data.transforms_factory import create_transform - -from transformers import BitConfig, BitForImageClassification, BitImageProcessor -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_config(model_name): - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - label2id = {v: k for k, v in id2label.items()} - - conv_layer = "std_conv" if "bit" in model_name else False - - # note that when using BiT as backbone for ViT-hybrid checkpoints, - # one needs to additionally set config.layer_type = "bottleneck", config.stem_type = "same", - # config.conv_layer = "std_conv_same" - config = BitConfig( - conv_layer=conv_layer, - num_labels=1000, - id2label=id2label, - label2id=label2id, - ) - - return config - - -def rename_key(name): - if "stem.conv" in name: - name = name.replace("stem.conv", "bit.embedder.convolution") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "head.fc" in name: - name = name.replace("head.fc", "classifier.1") - if name.startswith("norm"): - name = "bit." + name - if "bit" not in name and "classifier" not in name: - name = "bit.encoder." + name - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our BiT structure. - """ - - # define default BiT configuration - config = get_config(model_name) - - # load original model from timm - timm_model = create_model(model_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model - state_dict = timm_model.state_dict() - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val.squeeze() if "head" in key else val - - # load HuggingFace model - model = BitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # create image processor - transform = create_transform(**resolve_data_config({}, model=timm_model)) - timm_transforms = transform.transforms - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - processor = BitImageProcessor( - do_resize=True, - size={"shortest_edge": timm_transforms[0].size}, - resample=pillow_resamplings[timm_transforms[0].interpolation.value], - do_center_crop=True, - crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, - do_normalize=True, - image_mean=timm_transforms[-1].mean.tolist(), - image_std=timm_transforms[-1].std.tolist(), - ) - - image = prepare_img() - timm_pixel_values = transform(image).unsqueeze(0) - pixel_values = processor(image, return_tensors="pt").pixel_values - - # verify pixel values - assert torch.allclose(timm_pixel_values, pixel_values) - - # verify logits - with torch.no_grad(): - outputs = model(pixel_values) - logits = outputs.logits - - print("Logits:", logits[0, :3]) - print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model {model_name} and processor to the hub") - model.push_to_hub(f"ybelkada/{model_name}") - processor.push_to_hub(f"ybelkada/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="resnetv2_50x1_bitm", - type=str, - help="Name of the BiT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub.", - ) - - args = parser.parse_args() - convert_bit_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index d8ce9b056c..0000000000 --- a/src/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,114 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Blenderbot checkpoint.""" - -import argparse - -import torch - -from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -PATTERNS = [ - ["attention", "attn"], - ["encoder_attention", "encoder_attn"], - ["q_lin", "q_proj"], - ["k_lin", "k_proj"], - ["v_lin", "v_proj"], - ["out_lin", "out_proj"], - ["norm_embeddings", "layernorm_embedding"], - ["position_embeddings", "embed_positions"], - ["embeddings", "embed_tokens"], - ["ffn.lin", "fc"], -] - - -def rename_state_dict_key(k): - if k == "embeddings.weight": - return "shared.weight" - - for parlai_name, hf_name in PATTERNS: - k = k.replace(parlai_name, hf_name) - - if k.startswith("encoder"): - k = k.replace(".attn", ".self_attn") - k = k.replace("norm1", "self_attn_layer_norm") - k = k.replace("norm2", "final_layer_norm") - elif k.startswith("decoder"): - k = k.replace("norm1", "self_attn_layer_norm") - k = k.replace("norm2", "encoder_attn_layer_norm") - k = k.replace("norm3", "final_layer_norm") - return k - - -def rename_layernorm_keys(sd): - keys = [ - "model.encoder.layernorm_embedding.weight", - "model.encoder.layernorm_embedding.bias", - "model.decoder.layernorm_embedding.weight", - "model.decoder.layernorm_embedding.bias", - ] - for k in keys: - v = sd.pop(k) - new_k = k.replace("layernorm_embedding", "layer_norm") - assert new_k not in sd - sd[new_k] = v - - -IGNORE_KEYS = ["START"] - - -@torch.no_grad() -def convert_parlai_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_json_path): - """ - Copy/paste/tweak model's weights to our BERT structure. - """ - model = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - sd = model["model"] - cfg = BlenderbotConfig.from_json_file(config_json_path) - m = BlenderbotForConditionalGeneration(cfg) - valid_keys = m.model.state_dict().keys() - failures = [] - mapping = {} - for k, v in sd.items(): - if k in IGNORE_KEYS: - continue - - new_k = rename_state_dict_key(k) - if new_k not in valid_keys: - failures.append([k, new_k]) - else: - mapping[new_k] = v - if cfg.normalize_before: # Blenderbot-3B checkpoints. Rename layernorm_embedding -> layer_norm - rename_layernorm_keys(sd) - m.model.load_state_dict(mapping, strict=True) - m.half() - m.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument("--src_path", type=str, help="like blenderbot-model.bin") - parser.add_argument("--save_dir", default="hf_blenderbot", type=str, help="Where to save converted model.") - parser.add_argument( - "--hf_config_json", default="blenderbot-3b-config.json", type=str, help="Path to config to use" - ) - args = parser.parse_args() - convert_parlai_checkpoint(args.src_path, args.save_dir, args.hf_config_json) diff --git a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py deleted file mode 100644 index 3de18c294a..0000000000 --- a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py +++ /dev/null @@ -1,191 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import re - -import requests -import torch - -# git clone https://github.com/salesforce/BLIP.git -from models.blip import blip_decoder -from models.blip_itm import blip_itm -from models.blip_vqa import blip_vqa -from PIL import Image -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode - -from transformers import ( - BertTokenizer, - BlipConfig, - BlipForConditionalGeneration, - BlipForImageTextRetrieval, - BlipForQuestionAnswering, -) - - -def load_demo_image(image_size, device): - img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" - raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") - - transform = transforms.Compose( - [ - transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC), - transforms.ToTensor(), - transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) - image = transform(raw_image).unsqueeze(0).to(device) - return image - - -def rename_key(key): - if "visual_encoder" in key: - key = re.sub("visual_encoder*", "vision_model.encoder", key) - if "blocks" in key: - key = re.sub(r"blocks", "layers", key) - if "attn" in key: - key = re.sub(r"attn", "self_attn", key) - if "norm1" in key: - key = re.sub(r"norm1", "layer_norm1", key) - if "norm2" in key: - key = re.sub(r"norm2", "layer_norm2", key) - if "encoder.norm" in key: - key = re.sub(r"encoder.norm", "post_layernorm", key) - if "encoder.patch_embed.proj" in key: - key = re.sub(r"encoder.patch_embed.proj", "embeddings.patch_embedding", key) - - if "encoder.pos_embed" in key: - key = re.sub(r"encoder.pos_embed", "embeddings.position_embedding", key) - if "encoder.cls_token" in key: - key = re.sub(r"encoder.cls_token", "embeddings.class_embedding", key) - - if "self_attn" in key: - key = re.sub(r"self_attn.proj", "self_attn.projection", key) - - return key - - -@torch.no_grad() -def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = BlipConfig.from_pretrained(config_path) - else: - config = BlipConfig(projection_dim=512, text_config={}, vision_config={}) - - hf_model = BlipForConditionalGeneration(config).eval() - - model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" - - pt_model = blip_decoder(pretrained=model_url, image_size=384, vit="base") - pt_model = pt_model.eval() - - modified_state_dict = pt_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_model.load_state_dict(modified_state_dict) - - image_size = 384 - image = load_demo_image(image_size=image_size, device="cpu") - tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") - input_ids = tokenizer(["a picture of"]).input_ids - - out = hf_model.generate(image, input_ids) - - assert out[0].tolist() == [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102] - - out = hf_model.generate(image) - - assert out[0].tolist() == [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102] - - if pytorch_dump_folder_path is not None: - hf_model.save_pretrained(pytorch_dump_folder_path) - - # model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_vqa.pth' - model_url = ( - "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" - ) - - vqa_model = blip_vqa(pretrained=model_url, image_size=image_size, vit="base") - vqa_model.eval() - - modified_state_dict = vqa_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_vqa_model = BlipForQuestionAnswering(config) - - hf_vqa_model.load_state_dict(modified_state_dict) - - question = ["How many dogs are in this image?"] - question_input_ids = tokenizer(question, return_tensors="pt").input_ids - - answer = hf_vqa_model.generate(question_input_ids, image) - print(tokenizer.decode(answer[0])) - - assert tokenizer.decode(answer[0]) == "[UNK] 1 [SEP]" - if pytorch_dump_folder_path is not None: - hf_vqa_model.save_pretrained(pytorch_dump_folder_path + "_vqa") - - model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" - - itm_model = blip_itm(pretrained=model_url, image_size=image_size, vit="base") - itm_model.eval() - - modified_state_dict = itm_model.state_dict() - for key in modified_state_dict.copy(): - value = modified_state_dict.pop(key) - renamed_key = rename_key(key) - modified_state_dict[renamed_key] = value - - hf_itm_model = BlipForImageTextRetrieval(config) - - question = ["A picture of a woman with a dog sitting in a beach"] - question_input_ids = tokenizer( - question, - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=35, - ).input_ids - - hf_itm_model.load_state_dict(modified_state_dict) - hf_itm_model.eval() - - out_itm = hf_itm_model(question_input_ids, image, use_itm_head=True) - out = hf_itm_model(question_input_ids, image, use_itm_head=False) - - assert out[0].item() == 0.2110687494277954 - assert torch.nn.functional.softmax(out_itm[0], dim=1)[:, 1].item() == 0.45698845386505127 - - if pytorch_dump_folder_path is not None: - hf_itm_model.save_pretrained(pytorch_dump_folder_path + "_itm") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_blip_checkpoint(args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py deleted file mode 100644 index d6640045b8..0000000000 --- a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py +++ /dev/null @@ -1,390 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert BLIP-2 checkpoints from the original repository. - -URL: https://github.com/salesforce/LAVIS/tree/main/projects/blip2 -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install -U git+https://github.com/nielsrogge/LAVIS.git@blip2_float32 -# to make sure we can compare both original and HF implementation in float32 -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BertTokenizer, - Blip2Config, - Blip2ForConditionalGeneration, - Blip2ForImageTextRetrieval, - Blip2Processor, - Blip2QFormerConfig, - Blip2VisionConfig, - BlipImageProcessor, - OPTConfig, - T5Config, - set_seed, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, model_name): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.layernorm.bias")) - if "itm" in model_name: - rename_keys.append(("Qformer.bert.embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight")) - rename_keys.append(("Qformer.bert.embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight")) - rename_keys.append(("vision_proj.weight", "vision_projection.weight")) - rename_keys.append(("vision_proj.bias", "vision_projection.bias")) - rename_keys.append(("text_proj.weight", "text_projection.weight")) - rename_keys.append(("text_proj.bias", "text_projection.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name, eos_token_id): - image_size = 364 if "coco" in model_name else 224 - vision_config = Blip2VisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? - if "opt-2.7b" in model_name: - text_config = OPTConfig.from_pretrained("facebook/opt-2.7b", eos_token_id=eos_token_id).to_dict() - elif "opt-6.7b" in model_name: - text_config = OPTConfig.from_pretrained("facebook/opt-6.7b", eos_token_id=eos_token_id).to_dict() - elif "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "itm" in model_name: - text_config = {} - else: - raise ValueError("Model name not supported") - - if "itm" in model_name: - config = Blip2Config( - vision_config=vision_config, - qformer_config=Blip2QFormerConfig(vocab_size=30523, use_qformer_text_input=True).to_dict(), - ) - else: - config = Blip2Config(vision_config=vision_config, text_config=text_config) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint( - model_name, pytorch_dump_folder_path=None, push_to_hub=False, lavis_device="cpu", hf_model_device="cpu" -): - """ - Copy/paste/tweak model's weights to Transformers design. - """ - if "opt" in model_name: - tokenizer = AutoTokenizer.from_pretrained("facebook/opt-2.7b") - elif "itm" in model_name: - tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="right") - tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - else: - tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl") - - if "itm" in model_name: - eos_token_id = None - else: - eos_token_id = tokenizer("\n", add_special_tokens=False).input_ids[0] - config, image_size = get_blip2_config(model_name, eos_token_id=eos_token_id) - - if "itm" in model_name: - hf_model = Blip2ForImageTextRetrieval(config).eval() - else: - hf_model = Blip2ForConditionalGeneration(config).eval() - - model_name_to_original = { - "blip2-opt-2.7b": ("blip2_opt", "pretrain_opt2.7b"), - "blip2-opt-6.7b": ("blip2_opt", "pretrain_opt6.7b"), - "blip2-opt-2.7b-coco": ("blip2_opt", "caption_coco_opt2.7b"), - "blip2-opt-6.7b-coco": ("blip2_opt", "caption_coco_opt6.7b"), - "blip2-flan-t5-xl": ("blip2_t5", "pretrain_flant5xl"), - "blip2-flan-t5-xl-coco": ("blip2_t5", "caption_coco_flant5xl"), - "blip2-flan-t5-xxl": ("blip2_t5", "pretrain_flant5xxl"), - "blip2-itm-vit-g": ("blip2_image_text_matching", "pretrain"), - "blip2-itm-vit-g-coco": ("blip2_image_text_matching", "coco"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config, model_name) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "opt_proj" in key: - key = key.replace("opt_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("opt"): - key = key.replace("opt", "language") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - read_in_q_v_bias(state_dict, config) - - missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) - assert len(missing_keys) == 0 - - if "itm" in model_name: - unexpected_keys = list(filter(lambda x: not x.startswith("Qformer.cls"), unexpected_keys)) - assert unexpected_keys == ["temp", "qformer.embeddings.position_ids"] - else: - assert unexpected_keys == ["qformer.embeddings.position_ids"] - - image = load_demo_image() - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = Blip2Processor(image_processor=image_processor, tokenizer=tokenizer) - pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(hf_model_device) - - # make sure processor creates exact same pixel values - assert torch.allclose(pixel_values, original_pixel_values.to(pixel_values.device)) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - - if "itm" in model_name: - caption = "a large fountain spewing water into the air" - input_ids = tokenizer([caption], return_tensors="pt").input_ids.to(hf_model_device) - attention_mask = processor(text=caption, return_tensors="pt").attention_mask.to(hf_model_device) - - with torch.no_grad(): - original_logits = original_model( - {"image": original_pixel_values, "text_input": [caption]}, match_head="itm" - ) - logits = hf_model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - use_image_text_matching_head=True, - ) - - assert original_logits.shape == logits.logits_per_image.shape - print("First values of original logits:", original_logits[0, :3]) - print("First values of HF logits:", logits.logits_per_image[0, :3]) - - # assert values - # cast to same type - target_dtype = logits.logits_per_image.dtype - assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) - - original_itm_scores = torch.nn.functional.softmax(original_logits, dim=1) - itm_scores = torch.nn.functional.softmax(logits.logits_per_image, dim=1) - assert torch.allclose(original_itm_scores.to(target_dtype), itm_scores, atol=1e-4) - print("Looks ok!") - - with torch.no_grad(): - original_logits = original_model( - {"image": original_pixel_values, "text_input": [caption]}, match_head="itc" - ) - logits = hf_model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - use_image_text_matching_head=False, - ) - - assert original_logits.shape == logits.logits_per_image.shape - print("First values of original logits:", original_logits[0, :3]) - print("First values of HF logits:", logits.logits_per_image[0, :3]) - - # assert values - # cast to same type - target_dtype = logits.logits_per_image.dtype - assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) - print("Looks ok!") - - else: - input_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to(hf_model_device) - - with torch.no_grad(): - if "opt" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [""]}).logits - logits = hf_model(pixel_values, input_ids).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": ["\n"], "text_output": ["\n"]} - ).logits - labels = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(pixel_values, input_ids, labels=labels).logits - - assert original_logits.shape == logits.shape - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert torch.allclose(original_logits.to(logits.device), logits, atol=1e-4) - print("Looks ok!") - - print("Generating a caption...") - prompt = "Question: what object is in this image? Answer:" - input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(hf_model_device) - - set_seed(42) - - original_outputs = original_model.generate( - {"image": original_pixel_values, "prompt": prompt}, use_nucleus_sampling=True, max_length=50 - ) - outputs = hf_model.generate( - pixel_values, - input_ids, - do_sample=True, - num_beams=5, - max_length=30, - min_length=1, - top_p=0.9, - repetition_penalty=1.0, - length_penalty=1.0, - temperature=1, - ) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("Original generation:", original_outputs) - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"nielsr/{model_name}") - hf_model.push_to_hub(f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "blip2-opt-2.7b", - "blip2-opt-6.7b", - "blip2-opt-2.7b-coco", - "blip2-opt-6.7b-coco", - "blip2-flan-t5-xl", - "blip2-flan-t5-xl-coco", - "blip2-flan-t5-xxl", - "blip2-itm-vit-g", - "blip2-itm-vit-g-coco", - ] - parser.add_argument( - "--model_name", - default="blip2-opt-2.7b", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - # note: this script is tested on 2 GPUs, as models are compared in float32, - # which requires quite some memory. Hence loading both on a - # separate device is the easiest to compare - parser.add_argument( - "--lavis_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." - ) - parser.add_argument( - "--hf_model_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." - ) - - args = parser.parse_args() - - convert_blip2_checkpoint( - args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.lavis_device, args.hf_model_device - ) diff --git a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py deleted file mode 100644 index c4aa6f27c9..0000000000 --- a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,254 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert BigScience BLOOM checkpoint.""" - -import argparse -import json -import os -import re - -import torch - -from transformers import BloomConfig, BloomModel -from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME -from transformers.utils import logging - - -logging.set_verbosity_info() - -WEIGHTS_TO_AVERAGE_ENDSWITH = [ - "word_embeddings_layernorm.weight", - "word_embeddings_layernorm.bias", - "input_layernorm.weight", - "input_layernorm.bias", - "post_attention_layernorm.weight", - "post_attention_layernorm.bias", - "self_attention.dense.bias", - "mlp.dense_4h_to_h.bias", - "ln_f.weight", - "ln_f.bias", -] - -WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [ - "mlp.dense_4h_to_h.weight", - "self_attention.dense.weight", -] - - -def layer_name_mapping(key, file): - """Convert Megatron-DeepSpeed TP/PP weights mapping in transformers PP only""" - # Handle first and last layers - layer_rename_map = { - "word_embeddings.weight": "word_embeddings.weight", - "word_embeddings.norm.weight": "word_embeddings_layernorm.weight", - "word_embeddings.norm.bias": "word_embeddings_layernorm.bias", - "weight": "ln_f.weight", - "bias": "ln_f.bias", - } - - if key in layer_rename_map: - return layer_rename_map[key] - - # Handle transformer blocks - layer_number = int(re.match(r".*layer_(\d*).*", file)[1]) - layer_number -= 3 - return f"h.{layer_number}." + key - - -def get_dtype_size(dtype): - if dtype == torch.bool: - return 1 / 8 - bit_search = re.search(r"[^\d](\d+)$", str(dtype)) - if bit_search is None: - raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") - bit_size = int(bit_search.groups()[0]) - return bit_size // 8 - - -def convert_bloom_checkpoint_to_pytorch( - bloom_checkpoint_path, bloom_config_file, pytorch_dump_folder_path, shard_model, pretraining_tp -): - # Construct model - if bloom_config_file == "": - config = BloomConfig() - else: - config = BloomConfig.from_json_file(bloom_config_file) - - if shard_model: - file_names = os.listdir(bloom_checkpoint_path) - file_names = sorted(filter(lambda s: s.startswith("layer") and "model_00" in s, file_names)) - - index_dict = {"weight_map": {}, "metadata": {}} - total_size = 0 - - missing_keys = None - - config = BloomConfig() - - for j, file in enumerate(file_names): - print("Processing file: {}".format(file)) - tensors = None - - for i in range(pretraining_tp): - # load all TP files - f_name = file.replace("model_00", f"model_0{i}") - temp = torch.load(os.path.join(bloom_checkpoint_path, f_name), map_location="cpu", weights_only=True) - - # Rename keys in the transformers names - keys = list(temp.keys()) - for key in keys: - temp[layer_name_mapping(key, file)] = temp.pop(key) - - if tensors is None: - tensors = temp - else: - for key in tensors.keys(): - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - # We average (sum and then divide) some weights across TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) - tensors[key] += temp[key] - else: - # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel - cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - # We concatenate these weights across TP ranks - tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) - - # Divide by the number of TP the weights we want to average - for key in tensors.keys(): - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] = tensors[key] / pretraining_tp - torch.save( - tensors, - os.path.join( - pytorch_dump_folder_path, - "pytorch_model_{}-of-{}.bin".format(str(j + 1).zfill(5), str(len(file_names)).zfill(5)), - ), - ) - - for key in tensors.keys(): - value = tensors[key] - total_size += value.numel() * get_dtype_size(value.dtype) - if key not in index_dict["weight_map"]: - index_dict["weight_map"][key] = "pytorch_model_{}-of-{}.bin".format( - str(j + 1).zfill(5), str(len(file_names)).zfill(5) - ) - - config = BloomConfig() - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - index_dict["metadata"]["total_size"] = total_size - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - with open(os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME + ".index.json"), "w", encoding="utf-8") as f: - json_config = json.dumps(index_dict, indent=2, sort_keys=True) + "\n" - f.write(json_config) - else: - model = BloomModel(config) - - file_names = os.listdir(bloom_checkpoint_path) - file_names = sorted(filter(lambda s: s.startswith("layer") and "model_00" in s, file_names)) - - missing_keys = None - for i, file in enumerate(file_names): - tensors = None - for i in range(pretraining_tp): - # load all TP files - f_name = file.replace("model_00", f"model_0{i}") - temp = torch.load(os.path.join(bloom_checkpoint_path, f_name), map_location="cpu", weights_only=True) - - # Rename keys in the transformers names - keys = list(temp.keys()) - for key in keys: - temp[layer_name_mapping(key, file)] = temp.pop(key) - - if tensors is None: - tensors = temp - else: - for key in tensors.keys(): - # We average (sum and then divide) some weights across TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] += temp[key] - else: - # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel - cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - # We concatenate these weights across TP ranks - tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) - - # Divide by the number of TP the weights we want to average - for key in tensors.keys(): - if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): - tensors[key] = tensors[key] / pretraining_tp - - other_keys = model.load_state_dict(tensors, strict=False) - assert not other_keys.unexpected_keys, f"The keys {other_keys.unexpected_keys} are unexpected" - if missing_keys is None: - missing_keys = set(other_keys.missing_keys) - else: - missing_keys = missing_keys.intersection(set(other_keys.missing_keys)) - - assert not missing_keys, f"The keys {missing_keys} are missing" - - # Save pytorch-model - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path} with dtype {config.torch_dtype}") - if config.torch_dtype is not None: - model = model.to(config.torch_dtype) - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--bloom_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the Megatron-LM checkpoint path.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--bloom_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--shard_model", - action="store_true", - help="An optional setting to shard the output model \nThis enables sharding the converted checkpoint", - ) - parser.add_argument( - "--pretraining_tp", - default=4, - type=int, - help="Pretraining TP rank that has been used when training the model in Megatron-LM \n", - ) - args = parser.parse_args() - convert_bloom_checkpoint_to_pytorch( - args.bloom_checkpoint_path, - args.bloom_config_file, - args.pytorch_dump_folder_path, - args.shard_model, - args.pretraining_tp, - ) diff --git a/src/transformers/models/bros/convert_bros_to_pytorch.py b/src/transformers/models/bros/convert_bros_to_pytorch.py deleted file mode 100644 index c0984f2c74..0000000000 --- a/src/transformers/models/bros/convert_bros_to_pytorch.py +++ /dev/null @@ -1,145 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Bros checkpoints.""" - -import argparse - -import bros # original repo -import torch - -from transformers import BrosConfig, BrosModel, BrosProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_configs(model_name): - bros_config = BrosConfig.from_pretrained(model_name) - return bros_config - - -def remove_ignore_keys_(state_dict): - ignore_keys = [ - "embeddings.bbox_sinusoid_emb.inv_freq", - ] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if name == "embeddings.bbox_projection.weight": - name = "bbox_embeddings.bbox_projection.weight" - - if name == "embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq": - name = "bbox_embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq" - - if name == "embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq": - name = "bbox_embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq" - - return name - - -def convert_state_dict(orig_state_dict, model): - # rename keys - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - orig_state_dict[rename_key(key)] = val - - # remove ignore keys - remove_ignore_keys_(orig_state_dict) - - return orig_state_dict - - -def convert_bros_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - # load original model - original_model = bros.BrosModel.from_pretrained(model_name).eval() - - # load HuggingFace Model - bros_config = get_configs(model_name) - model = BrosModel.from_pretrained(model_name, config=bros_config) - model.eval() - - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict(state_dict, model) - model.load_state_dict(new_state_dict) - - # verify results - - # original BROS model require 4 points (8 float values) for each bbox, prepare bbox with [batch_size, seq_len, 8] shape - bbox = torch.tensor( - [ - [ - [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], - [0.4396, 0.6720, 0.4659, 0.6720, 0.4659, 0.6850, 0.4396, 0.6850], - [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850], - [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850], - [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000], - [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000], - [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000], - ] - ] - ) - - processor = BrosProcessor.from_pretrained(model_name) - - encoding = processor("His name is Rocco.", return_tensors="pt") - encoding["bbox"] = bbox - - original_hidden_states = original_model(**encoding).last_hidden_state - # pixel_values = processor(image, return_tensors="pt").pixel_values - - last_hidden_states = model(**encoding).last_hidden_state - - assert torch.allclose(original_hidden_states, last_hidden_states, atol=1e-4) - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model") - processor.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Required parameters - parser.add_argument( - "--model_name", - default="jinho8345/bros-base-uncased", - required=False, - type=str, - help="Name of the original model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - required=False, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the converted model and processor to the đŸ€— hub.", - ) - - args = parser.parse_args() - convert_bros_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 9b1b15857c..0000000000 --- a/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The T5 authors and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert T5 checkpoint.""" - -import argparse - -from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5 -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): - # Initialise PyTorch model - config = T5Config.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - model = T5ForConditionalGeneration(config) - - # Load weights from tf checkpoint - load_tf_weights_in_t5(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained T5 model. \nThis specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 45dcdb2903..0000000000 --- a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,65 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert CANINE checkpoint.""" - -import argparse - -from transformers import CanineConfig, CanineModel, CanineTokenizer, load_tf_weights_in_canine -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, pytorch_dump_path): - # Initialize PyTorch model - config = CanineConfig() - model = CanineModel(config) - model.eval() - - print(f"Building PyTorch model from configuration: {config}") - - # Load weights from tf checkpoint - load_tf_weights_in_canine(model, config, tf_checkpoint_path) - - # Save pytorch-model (weights and configuration) - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - # Save tokenizer files - tokenizer = CanineTokenizer() - print(f"Save tokenizer files to {pytorch_dump_path}") - tokenizer.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the TensorFlow checkpoint. Should end with model.ckpt", - ) - parser.add_argument( - "--pytorch_dump_path", - default=None, - type=str, - required=True, - help="Path to a folder where the PyTorch model will be placed.", - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.pytorch_dump_path) diff --git a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py b/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py deleted file mode 100644 index 59b253b5ec..0000000000 --- a/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py +++ /dev/null @@ -1,478 +0,0 @@ -# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import gc -import json -import os - -import requests -import torch -import yaml -from accelerate import init_empty_weights -from PIL import Image - -from transformers import ( - ChameleonConfig, - ChameleonForConditionalGeneration, - ChameleonImageProcessor, - ChameleonProcessor, -) - - -try: - from transformers import LlamaTokenizerFast -except ImportError: - raise ValueError( - "Chameleon conversion supports only FastTokenizer and LlamaTokenizerFast can't be imported! " - "Update your `tokenizers` library and re-run the tokenizer conversion." - ) - -""" -Sample usage: - -``` -python src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py \ - --input_dir /path/to/downloaded/chameleon/weights --model_size 7B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import ChameleonForConditionalGeneration, LlamaTokenizerFast - -model = ChameleonForConditionalGeneration.from_pretrained("/output/path") -tokenizer = LlamaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - -NUM_SHARDS = { - "7B": 1, - "30B": 4, -} - -VOCAB_SIZE = 65536 - - -def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): - return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) - - -def read_json(path): - with open(path, "r") as f: - return json.load(f) - - -def write_json(text, path): - with open(path, "w") as f: - json.dump(text, f) - - -def write_model(model_path, input_base_path, model_size, chameleon_version=1): - os.makedirs(model_path, exist_ok=True) - input_model_path = os.path.join(input_base_path, "models", model_size.lower()) - params_path = os.path.join(input_model_path, "params.json") - consolidate_params_path = os.path.join(input_model_path, "consolidate_params.json") - - params = read_json(params_path) - if os.path.isfile(consolidate_params_path): - params = {**params, **read_json(consolidate_params_path)} - num_shards = NUM_SHARDS[model_size] - model_parallel_size = params["model_parallel_size"] - params = params.get("model", params) - n_layers = params["n_layers"] - n_heads = params["n_heads"] - n_heads_per_shard = n_heads // num_shards - dim = params["dim"] - dims_per_head = dim // n_heads - base = params.get("rope_theta", 10000.0) - swin_norm = params["swin_norm"] - if base > 10000.0: - max_position_embeddings = 16384 - else: - # Depending on the Chameleon version, the default max_position_embeddings has different values. - if chameleon_version == 1: - max_position_embeddings = 4096 - else: - raise NotImplementedError( - f"Version {chameleon_version} of chameleon is not supported yet. " - "Current supported versions of chameleon are [1]." - ) - - if params.get("n_kv_heads", None) is not None: - num_key_value_heads = params["n_kv_heads"] # for GQA / MQA - num_local_key_value_heads = n_heads_per_shard // num_key_value_heads - key_value_dim = dim // num_key_value_heads - else: # compatibility with other checkpoints - num_key_value_heads = n_heads - num_local_key_value_heads = n_heads_per_shard - key_value_dim = dim - - print(f"Fetching all parameters from the checkpoint at {input_model_path}.") - # Load weights - if num_shards == 1: - # Not sharded - # (The sharded implementation would also work, but this is simpler.) - loaded = None - for possible_name in ["consolidated.pth", "consolidated.00.pth"]: - possible_path = os.path.join(input_model_path, possible_name) - if os.path.exists(possible_path): - loaded = torch.load(possible_path, map_location="cpu", weights_only=True) - break - assert loaded is not None - else: - # Sharded - loaded = [ - torch.load( - os.path.join(input_model_path, f"consolidated.{i:02d}.pth"), map_location="cpu", weights_only=True - ) - for i in range(num_shards) - ] - - # permute for sliced rotary - def permute(w, n_heads, dim1=dim, dim2=dim): - return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) - - # Load weights to the state dict - state_dict = {} - for layer_i in range(n_layers): - if num_shards == 1: - # Unsharded - state_dict.update( - { - f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads - ), - f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( - loaded[f"layers.{layer_i}.attention.wk.weight"], - n_heads=num_key_value_heads, - dim1=key_value_dim, - ), - f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], - f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], - f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], - f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], - f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], - f"model.layers.{layer_i}.input_layernorm.weight": loaded[ - f"layers.{layer_i}.attention_norm.weight" - ], - f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ - f"layers.{layer_i}.ffn_norm.weight" - ], - } - ) - # qk_layernorm (see https://github.com/huggingface/transformers/pull/31534#issuecomment-2207354677) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.weight"] = ( - loaded[f"layers.{layer_i}.attention.q_normalization.weight"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(n_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.bias"] = ( - loaded[f"layers.{layer_i}.attention.q_normalization.bias"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(n_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.weight"] = ( - loaded[f"layers.{layer_i}.attention.k_normalization.weight"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(num_key_value_heads, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.bias"] = ( - loaded[f"layers.{layer_i}.attention.k_normalization.bias"] - .view(dims_per_head // 2, 2) - .t() - .reshape(1, -1) - .repeat_interleave(num_key_value_heads, 0) - ) - - else: - # Sharded - state_dict.update( - { - f"model.layers.{layer_i}.input_layernorm.weight": torch.stack( - [l[f"layers.{layer_i}.attention_norm.weight"] for l in loaded] - ).mean(dim=0), - f"model.layers.{layer_i}.post_attention_layernorm.weight": torch.stack( - [l[f"layers.{layer_i}.ffn_norm.weight"] for l in loaded] - ).mean(dim=0), - } - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) - for i in range(num_shards) - ], - dim=0, - ).reshape(dim, dim), - n_heads=n_heads, - ) - - state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( - torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( - num_local_key_value_heads, dims_per_head, dim - ) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim), - n_heads=num_key_value_heads, - dim1=key_value_dim, - ) - - # qk_layernorm (see https://github.com/huggingface/transformers/pull/31534#issuecomment-2207354677) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.weight"] = ( - torch.cat([l[f"layers.{layer_i}.attention.q_normalization.weight"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(n_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.q_norm.bias"] = ( - torch.cat([l[f"layers.{layer_i}.attention.q_normalization.bias"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(n_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.weight"] = ( - torch.cat([l[f"layers.{layer_i}.attention.k_normalization.weight"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(num_key_value_heads // num_shards, 0) - ) - state_dict[f"model.layers.{layer_i}.self_attn.k_norm.bias"] = ( - torch.cat([l[f"layers.{layer_i}.attention.k_normalization.bias"].unsqueeze(0) for l in loaded]) - .view(num_shards, dims_per_head // 2, 2) - .transpose(1, 2) - .reshape(num_shards, -1) - .repeat_interleave(num_key_value_heads // num_shards, 0) - ) - - state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( - [ - loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( - num_local_key_value_heads, dims_per_head, dim - ) - for i in range(num_shards) - ], - dim=0, - ).reshape(key_value_dim, dim) - - state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 - ) - state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 - ) - state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( - [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 - ) - - if num_shards == 1: - # Unsharded - state_dict.update( - { - "model.embed_tokens.weight": loaded["tok_embeddings.weight"], - "model.norm.weight": loaded["norm.weight"], - "lm_head.weight": loaded["output.weight"], - } - ) - else: - state_dict.update( - { - "model.embed_tokens.weight": torch.cat( - [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 - ), - "model.norm.weight": torch.stack([loaded[i]["norm.weight"] for i in range(num_shards)]).mean(dim=0), - "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), - } - ) - - # Load VQGAN weights - vqgan_path = os.path.join(input_base_path, "tokenizer/vqgan.ckpt") - vqgan_state_dict = torch.load(vqgan_path, map_location="cpu", weights_only=True)["state_dict"] - for k, v in vqgan_state_dict.items(): - if "decoder" in k: - continue # we dont do image generation yet - state_dict[f"model.vqmodel.{k}"] = v - - # Write configs - ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 - multiple_of = params["multiple_of"] if "multiple_of" in params else 256 - - with open(os.path.join(input_base_path, "tokenizer/text_tokenizer.json")) as tokenizer_file: - tokenizer_config = json.load(tokenizer_file) - vocabulary_map = tokenizer_config["model"]["vocab"] - vocabulary_map[""] = vocabulary_map[ - "" - ] # use a reserved token instead of adding a new one - del vocabulary_map[""] - - for token in tokenizer_config["added_tokens"]: - if token["content"] == "": - token["content"] = "" - - with open(os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), "w") as f: - json.dump(tokenizer_config, f) # save the new file to init tokenizer later - - vq_keys_to_replace = [ - ("ch", "base_channels"), - ("out_ch", "out_channels"), - ("n_embed", "num_embeddings"), - ("ch_mult", "channel_multiplier"), - ("double_z", "double_latent"), - ("z_channels", "latent_channels"), - ] - with open(os.path.join(input_base_path, "tokenizer/vqgan.yaml")) as vqgan_cfg_file: - vq_config = yaml.safe_load(vqgan_cfg_file)["model"]["params"] - vq_config.update(**vq_config["ddconfig"]) - for old, new in vq_keys_to_replace: - vq_config[new] = vq_config[old] - del vq_config["ddconfig"] - del vq_config["ckpt_path"] - del vq_config["lossconfig"] - - config = ChameleonConfig( - hidden_size=dim, - intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), - num_attention_heads=params["n_heads"], - num_hidden_layers=params["n_layers"], - rms_norm_eps=params["norm_eps"], - num_key_value_heads=num_key_value_heads, - vocab_size=VOCAB_SIZE, - rope_theta=base, - max_position_embeddings=max_position_embeddings, - model_parallel_size=model_parallel_size, - swin_norm=swin_norm, - vq_config=vq_config, - vocabulary_map=vocabulary_map, - ) - with init_empty_weights(): - model = ChameleonForConditionalGeneration(config) - - model.load_state_dict(state_dict, assign=True, strict=False) - model.save_pretrained(model_path, safe_serialization=True) - - # Load and save the processor - tokenizer = LlamaTokenizerFast( - tokenizer_file=os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), legacy=False - ) - tokenizer.sep_token_id = 8710 # assign to sep so that we can append it after input text - tokenizer.pad_token_id = 1 # assign to special pad_token - image_processor = ChameleonImageProcessor() - processor = ChameleonProcessor(image_processor=image_processor, tokenizer=tokenizer) - processor.save_pretrained(model_path) - - # Make space so we can load the model properly now. - del state_dict - del loaded - del vqgan_state_dict - gc.collect() - - # Short inference on a few examples to check if generation makes sense - # taken from https://github.com/facebookresearch/chameleon/blob/7a72f40aa5f462965c8374f25257f55b65b25ff4/data/prompts_for_human_evaluations.jsonl - print("Loading the checkpoint in a Chameleon model...") - print("*" * 100) - model = ChameleonForConditionalGeneration.from_pretrained( - model_path, attn_implementation="eager", torch_dtype=torch.bfloat16, device_map="auto" - ) - processor = ChameleonProcessor.from_pretrained(model_path) - - prompt = "I'm very intrigued by this work of art:Please tell me about the artist." - image = Image.open( - requests.get( - "https://uploads4.wikiart.org/images/paul-klee/death-for-the-idea-1915.jpg!Large.jpg", stream=True - ).raw - ) - inputs = processor(prompt, images=image, return_tensors="pt").to(model.device, torch.bfloat16) - length = inputs.input_ids.shape[1] - - out = model.generate(**inputs, max_new_tokens=40, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for single-image: {generated_text}") - print("*" * 100) - - # Multi-image example - prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation." - image = Image.open( - requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw - ) - image_2 = Image.open( - requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw - ) - - inputs = processor(prompt, images=[image, image_2], return_tensors="pt").to(model.device, dtype=torch.bfloat16) - length = inputs.input_ids.shape[1] - out = model.generate(**inputs, max_new_tokens=50, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for multi-image: {generated_text}") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Chameleon weights", - ) - parser.add_argument( - "--model_size", - choices=["7B", "30B"], - help="" - " models correspond to the finetuned versions, and are specific to the Chameleon official release. For more details on Chameleon, checkout the original repo: https://github.com/facebookresearch/chameleon", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model", - ) - parser.add_argument( - "--test_inference", - action="store_true", - help="Whether to load the model for generation to test it's converted correctly.", - ) - # Different Chameleon versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. - parser.add_argument( - "--chameleon_version", - choices=[1], - default=1, - type=int, - help="Version of the Chameleon model to convert", - ) - args = parser.parse_args() - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - model_size=args.model_size, - chameleon_version=args.chameleon_version, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py b/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py deleted file mode 100644 index adc9300ef5..0000000000 --- a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -import torch - -from transformers import ChineseCLIPConfig, ChineseCLIPModel - - -def copy_attn_layer(hf_attn_layer, pt_weights, prefix): - q_proj, k_proj, v_proj = pt_weights[f"{prefix}.in_proj_weight"].chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_weights[f"{prefix}.in_proj_bias"].chunk(3, dim=0) - - out_proj_weights = pt_weights[f"{prefix}.out_proj.weight"] - out_proj_bias = pt_weights[f"{prefix}.out_proj.bias"] - - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias - - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - - hf_attn_layer.out_proj.weight.data = out_proj_weights - hf_attn_layer.out_proj.bias.data = out_proj_bias - - -def copy_mlp(hf_mlp, pt_weights, prefix): - copy_linear(hf_mlp.fc1, pt_weights, f"{prefix}.c_fc") - copy_linear(hf_mlp.fc2, pt_weights, f"{prefix}.c_proj") - - -def copy_linear(hf_linear, pt_weights, prefix): - hf_linear.weight.data = pt_weights[f"{prefix}.weight"].data - hf_linear.bias.data = pt_weights[f"{prefix}.bias"].data - - -def copy_layer(hf_layer, pt_weights, prefix): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_weights, f"{prefix}.ln_1") - copy_linear(hf_layer.layer_norm2, pt_weights, f"{prefix}.ln_2") - - # copy MLP - copy_mlp(hf_layer.mlp, pt_weights, f"{prefix}.mlp") - - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_weights, f"{prefix}.attn") - - -def copy_layers(hf_layers, pt_weights, prefix): - for layer_id, hf_layer in enumerate(hf_layers): - copy_layer(hf_layer, pt_weights, f"{prefix}.{layer_id}") - - -def copy_text_model_and_projection(hf_model, pt_weights): - # copy projection - hf_model.text_projection.weight.data = pt_weights["text_projection"].data.T - - # copy text encoder - for name, param in hf_model.text_model.named_parameters(): - param.data = pt_weights[f"bert.{name}"].data - - -def copy_vision_model_and_projection(hf_model, pt_weights): - # copy projection - hf_model.visual_projection.weight.data = pt_weights["visual.proj"].data.T - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_weights, "visual.ln_pre") - copy_linear(hf_model.vision_model.post_layernorm, pt_weights, "visual.ln_post") - - # copy embeddings - hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_weights["visual.conv1.weight"].data - hf_model.vision_model.embeddings.class_embedding.data = pt_weights["visual.class_embedding"].data - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_weights["visual.positional_embedding"].data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_weights, "visual.transformer.resblocks") - - -@torch.no_grad() -def convert_chinese_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - - assert config_path is not None, "Please specify the ChineseCLIP model config of the corresponding model size." - config = ChineseCLIPConfig.from_pretrained(config_path) - - hf_model = ChineseCLIPModel(config).eval() - - pt_weights = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["state_dict"] - pt_weights = {(name[7:] if name.startswith("module.") else name): value for name, value in pt_weights.items()} - - copy_text_model_and_projection(hf_model, pt_weights) - copy_vision_model_and_projection(hf_model, pt_weights) - hf_model.logit_scale.data = pt_weights["logit_scale"].data - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output folder storing converted hf PyTorch model.", - ) - parser.add_argument( - "--checkpoint_path", default=None, type=str, help="Path to original github format ChineseCLIP checkpoint." - ) - parser.add_argument( - "--config_path", default=None, required=True, type=str, help="Path to hf config.json of model to convert." - ) - args = parser.parse_args() - - convert_chinese_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) - print("The conversion is finished!") diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py deleted file mode 100644 index 66488e401a..0000000000 --- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py +++ /dev/null @@ -1,133 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import re - -from laion_clap import CLAP_Module - -from transformers import AutoFeatureExtractor, ClapConfig, ClapModel - - -KEYS_TO_MODIFY_MAPPING = { - "text_branch": "text_model", - "audio_branch": "audio_model.audio_encoder", - "attn": "attention.self", - "self.proj": "output.dense", - "attention.self_mask": "attn_mask", - "mlp.fc1": "intermediate.dense", - "mlp.fc2": "output.dense", - "norm1": "layernorm_before", - "norm2": "layernorm_after", - "bn0": "batch_norm", -} - -processor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused", truncation="rand_trunc") - - -def init_clap(checkpoint_path, model_type, enable_fusion=False): - model = CLAP_Module( - amodel=model_type, - enable_fusion=enable_fusion, - ) - model.load_ckpt(checkpoint_path) - return model - - -def get_config_from_original(clap_model): - audio_config = { - "patch_embeds_hidden_size": clap_model.model.audio_branch.embed_dim, - "depths": clap_model.model.audio_branch.depths, - "hidden_size": clap_model.model.audio_projection[0].in_features, - } - - text_config = {"hidden_size": clap_model.model.text_branch.pooler.dense.in_features} - - return ClapConfig(audio_config=audio_config, text_config=text_config) - - -def rename_state_dict(state_dict): - model_state_dict = {} - - sequential_layers_pattern = r".*sequential.(\d+).*" - text_projection_pattern = r".*_projection.(\d+).*" - - for key, value in state_dict.items(): - # check if any key needs to be modified - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - if re.match(sequential_layers_pattern, key): - # replace sequential layers with list - sequential_layer = re.match(sequential_layers_pattern, key).group(1) - - key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer) // 3}.linear.") - elif re.match(text_projection_pattern, key): - projecton_layer = int(re.match(text_projection_pattern, key).group(1)) - - # Because in CLAP they use `nn.Sequential`... - transformers_projection_layer = 1 if projecton_layer == 0 else 2 - - key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.") - - if "audio" and "qkv" in key: - # split qkv into query key and value - mixed_qkv = value - qkv_dim = mixed_qkv.size(0) // 3 - - query_layer = mixed_qkv[:qkv_dim] - key_layer = mixed_qkv[qkv_dim : qkv_dim * 2] - value_layer = mixed_qkv[qkv_dim * 2 :] - - model_state_dict[key.replace("qkv", "query")] = query_layer - model_state_dict[key.replace("qkv", "key")] = key_layer - model_state_dict[key.replace("qkv", "value")] = value_layer - else: - model_state_dict[key] = value - - return model_state_dict - - -def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, model_type, enable_fusion=False): - clap_model = init_clap(checkpoint_path, model_type, enable_fusion=enable_fusion) - - clap_model.eval() - state_dict = clap_model.model.state_dict() - state_dict = rename_state_dict(state_dict) - - transformers_config = get_config_from_original(clap_model) - transformers_config.audio_config.enable_fusion = enable_fusion - model = ClapModel(transformers_config) - - # ignore the spectrogram embedding layer - model.load_state_dict(state_dict, strict=False) - - model.save_pretrained(pytorch_dump_folder_path) - transformers_config.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument("--enable_fusion", action="store_true", help="Whether to enable fusion or not") - parser.add_argument("--model_type", default="HTSAT-tiny", type=str, help="Whether to enable fusion or not") - args = parser.parse_args() - - convert_clap_checkpoint( - args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.model_type, args.enable_fusion - ) diff --git a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py deleted file mode 100644 index 3d88fc1929..0000000000 --- a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse - -import torch -from clip import load - -from transformers import CLIPConfig, CLIPModel - - -def copy_attn_layer(hf_attn_layer, pt_attn_layer): - q_proj, k_proj, v_proj = pt_attn_layer.in_proj_weight.chunk(3, dim=0) - q_proj_bias, k_proj_bias, v_proj_bias = pt_attn_layer.in_proj_bias.chunk(3, dim=0) - - out_proj_weights = pt_attn_layer.out_proj.weight - out_proj_bias = pt_attn_layer.out_proj.bias - - hf_attn_layer.q_proj.weight.data = q_proj - hf_attn_layer.q_proj.bias.data = q_proj_bias - - hf_attn_layer.k_proj.weight.data = k_proj - hf_attn_layer.k_proj.bias.data = k_proj_bias - - hf_attn_layer.v_proj.weight.data = v_proj - hf_attn_layer.v_proj.bias.data = v_proj_bias - - hf_attn_layer.out_proj.weight = out_proj_weights - hf_attn_layer.out_proj.bias = out_proj_bias - - -def copy_mlp(hf_mlp, pt_mlp): - copy_linear(hf_mlp.fc1, pt_mlp.c_fc) - copy_linear(hf_mlp.fc2, pt_mlp.c_proj) - - -def copy_linear(hf_linear, pt_linear): - hf_linear.weight = pt_linear.weight - hf_linear.bias = pt_linear.bias - - -def copy_layer(hf_layer, pt_layer): - # copy layer norms - copy_linear(hf_layer.layer_norm1, pt_layer.ln_1) - copy_linear(hf_layer.layer_norm2, pt_layer.ln_2) - - # copy MLP - copy_mlp(hf_layer.mlp, pt_layer.mlp) - - # copy attn - copy_attn_layer(hf_layer.self_attn, pt_layer.attn) - - -def copy_layers(hf_layers, pt_layers): - for hf_layer, pt_layer in zip(hf_layers, pt_layers): - copy_layer(hf_layer, pt_layer) - - -def copy_encoder(hf_encoder, pt_model): - # copy embeds - hf_encoder.embeddings.token_embedding.weight = pt_model.token_embedding.weight - hf_encoder.embeddings.position_embedding.weight.data = pt_model.positional_embedding - - # copy layer norm - copy_linear(hf_encoder.final_layer_norm, pt_model.ln_final) - - # copy hidden layers - copy_layers(hf_encoder.encoder.layers, pt_model.transformer.resblocks) - - -def copy_text_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.text_projection.weight.data = pt_model.text_projection.data.T.contiguous() - - # copy text encoder - copy_encoder(hf_model.text_model, pt_model) - - -def copy_vison_model_and_projection(hf_model, pt_model): - # copy projection - hf_model.visual_projection.weight.data = pt_model.visual.proj.data.T.contiguous() - - # copy layer norms - copy_linear(hf_model.vision_model.pre_layrnorm, pt_model.visual.ln_pre) - copy_linear(hf_model.vision_model.post_layernorm, pt_model.visual.ln_post) - - # copy embeds - hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_model.visual.conv1.weight.data - hf_model.vision_model.embeddings.class_embedding = pt_model.visual.class_embedding - hf_model.vision_model.embeddings.position_embedding.weight.data = pt_model.visual.positional_embedding.data - - # copy encoder - copy_layers(hf_model.vision_model.encoder.layers, pt_model.visual.transformer.resblocks) - - -@torch.no_grad() -def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = CLIPConfig.from_pretrained(config_path) - else: - config = CLIPConfig(projection_dim=512, text_config={}, vision_config={}) - - hf_model = CLIPModel(config).eval() - - pt_model, _ = load(checkpoint_path, device="cpu", jit=False) - pt_model = pt_model.eval() - - copy_text_model_and_projection(hf_model, pt_model) - copy_vison_model_and_projection(hf_model, pt_model) - hf_model.logit_scale = pt_model.logit_scale - - # Use `eos_token` so the example is more meaningful - input_ids = torch.tensor( - [ - [config.text_config.bos_token_id] - + list(range(3, 77)) - + [config.text_config.eos_token_id] - + [config.text_config.pad_token_id] - ] - ) - pixel_values = torch.randn(1, 3, 224, 224) - - hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True) - hf_logits_per_image = hf_outputs.logits_per_image - hf_logits_per_text = hf_outputs.logits_per_text - pt_logits_per_image, pt_logits_per_text = pt_model(pixel_values, input_ids) - - assert torch.allclose(hf_logits_per_image, pt_logits_per_image, atol=1e-3) - assert torch.allclose(hf_logits_per_text, pt_logits_per_text, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to OpenAI checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py deleted file mode 100644 index be2cfdee87..0000000000 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ /dev/null @@ -1,264 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert CLIPSeg checkpoints from the original repository. URL: https://github.com/timojl/clipseg.""" - -import argparse - -import requests -import torch -from PIL import Image - -from transformers import ( - CLIPSegConfig, - CLIPSegForImageSegmentation, - CLIPSegProcessor, - CLIPSegTextConfig, - CLIPSegVisionConfig, - CLIPTokenizer, - ViTImageProcessor, -) - - -def get_clipseg_config(model_name): - text_config = CLIPSegTextConfig() - vision_config = CLIPSegVisionConfig(patch_size=16) - - use_complex_transposed_convolution = True if "refined" in model_name else False - reduce_dim = 16 if "rd16" in model_name else 64 - - config = CLIPSegConfig.from_text_vision_configs( - text_config, - vision_config, - use_complex_transposed_convolution=use_complex_transposed_convolution, - reduce_dim=reduce_dim, - ) - return config - - -def rename_key(name): - # update prefixes - if "clip_model" in name: - name = name.replace("clip_model", "clip") - if "transformer" in name: - if "visual" in name: - name = name.replace("visual.transformer", "vision_model") - else: - name = name.replace("transformer", "text_model") - if "resblocks" in name: - name = name.replace("resblocks", "encoder.layers") - if "ln_1" in name: - name = name.replace("ln_1", "layer_norm1") - if "ln_2" in name: - name = name.replace("ln_2", "layer_norm2") - if "c_fc" in name: - name = name.replace("c_fc", "fc1") - if "c_proj" in name: - name = name.replace("c_proj", "fc2") - if "attn" in name and "self" not in name: - name = name.replace("attn", "self_attn") - # text encoder - if "token_embedding" in name: - name = name.replace("token_embedding", "text_model.embeddings.token_embedding") - if "positional_embedding" in name and "visual" not in name: - name = name.replace("positional_embedding", "text_model.embeddings.position_embedding.weight") - if "ln_final" in name: - name = name.replace("ln_final", "text_model.final_layer_norm") - # vision encoder - if "visual.class_embedding" in name: - name = name.replace("visual.class_embedding", "vision_model.embeddings.class_embedding") - if "visual.conv1" in name: - name = name.replace("visual.conv1", "vision_model.embeddings.patch_embedding") - if "visual.positional_embedding" in name: - name = name.replace("visual.positional_embedding", "vision_model.embeddings.position_embedding.weight") - if "visual.ln_pre" in name: - name = name.replace("visual.ln_pre", "vision_model.pre_layrnorm") - if "visual.ln_post" in name: - name = name.replace("visual.ln_post", "vision_model.post_layernorm") - # projection layers - if "visual.proj" in name: - name = name.replace("visual.proj", "visual_projection.weight") - if "text_projection" in name: - name = name.replace("text_projection", "text_projection.weight") - # decoder - if "trans_conv" in name: - name = name.replace("trans_conv", "transposed_convolution") - if "film_mul" in name or "film_add" in name or "reduce" in name or "transposed_convolution" in name: - name = "decoder." + name - if "blocks" in name: - name = name.replace("blocks", "decoder.layers") - if "linear1" in name: - name = name.replace("linear1", "mlp.fc1") - if "linear2" in name: - name = name.replace("linear2", "mlp.fc2") - if "norm1" in name and "layer_" not in name: - name = name.replace("norm1", "layer_norm1") - if "norm2" in name and "layer_" not in name: - name = name.replace("norm2", "layer_norm2") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if key.startswith("clip_model") and "attn.in_proj" in key: - key_split = key.split(".") - if "visual" in key: - layer_num = int(key_split[4]) - dim = config.vision_config.hidden_size - prefix = "vision_model" - else: - layer_num = int(key_split[3]) - dim = config.text_config.hidden_size - prefix = "text_model" - - if "weight" in key: - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ - dim : dim * 2, : - ] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"clip.{prefix}.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - elif "self_attn" in key and "out_proj" not in key: - key_split = key.split(".") - layer_num = int(key_split[1]) - dim = config.reduce_dim - if "weight" in key: - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[dim : dim * 2, :] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"decoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - else: - new_name = rename_key(key) - if "visual_projection" in new_name or "text_projection" in new_name: - val = val.T - orig_state_dict[new_name] = val - - return orig_state_dict - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - return image - - -def convert_clipseg_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub): - config = get_clipseg_config(model_name) - model = CLIPSegForImageSegmentation(config) - model.eval() - - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - - # remove some keys - for key in state_dict.copy().keys(): - if key.startswith("model"): - state_dict.pop(key, None) - - # rename some keys - state_dict = convert_state_dict(state_dict, config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - - if missing_keys != ["clip.text_model.embeddings.position_ids", "clip.vision_model.embeddings.position_ids"]: - raise ValueError("Missing keys that are not expected: {}".format(missing_keys)) - if unexpected_keys != ["decoder.reduce.weight", "decoder.reduce.bias"]: - raise ValueError(f"Unexpected keys: {unexpected_keys}") - - image_processor = ViTImageProcessor(size=352) - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") - processor = CLIPSegProcessor(image_processor=image_processor, tokenizer=tokenizer) - - image = prepare_img() - text = ["a glass", "something to fill", "wood", "a jar"] - - inputs = processor(text=text, images=[image] * len(text), padding="max_length", return_tensors="pt") - - with torch.no_grad(): - outputs = model(**inputs) - - # verify values - expected_conditional = torch.tensor([0.1110, -0.1882, 0.1645]) - expected_pooled_output = torch.tensor([0.2692, -0.7197, -0.1328]) - if model_name == "clipseg-rd64-refined": - expected_masks_slice = torch.tensor( - [[-10.0407, -9.9431, -10.2646], [-9.9751, -9.7064, -9.9586], [-9.6891, -9.5645, -9.9618]] - ) - elif model_name == "clipseg-rd64": - expected_masks_slice = torch.tensor( - [[-7.2877, -7.2711, -7.2463], [-7.2652, -7.2780, -7.2520], [-7.2239, -7.2204, -7.2001]] - ) - elif model_name == "clipseg-rd16": - expected_masks_slice = torch.tensor( - [[-6.3955, -6.4055, -6.4151], [-6.3911, -6.4033, -6.4100], [-6.3474, -6.3702, -6.3762]] - ) - else: - raise ValueError(f"Model name {model_name} not supported.") - - assert torch.allclose(outputs.logits[0, :3, :3], expected_masks_slice, atol=1e-3) - assert torch.allclose(outputs.conditional_embeddings[0, :3], expected_conditional, atol=1e-3) - assert torch.allclose(outputs.pooled_output[0, :3], expected_pooled_output, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor for {model_name} to the hub") - model.push_to_hub(f"CIDAS/{model_name}") - processor.push_to_hub(f"CIDAS/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="clipseg-rd64", - type=str, - choices=["clipseg-rd16", "clipseg-rd64", "clipseg-rd64-refined"], - help=( - "Name of the model. Supported models are: clipseg-rd64, clipseg-rd16 and clipseg-rd64-refined (rd meaning" - " reduce dimension)" - ), - ) - parser.add_argument( - "--checkpoint_path", - default="/Users/nielsrogge/Documents/CLIPSeg/clip_plus_rd64-uni.pth", - type=str, - help=( - "Path to the original checkpoint. Note that the script assumes that the checkpoint includes both CLIP and" - " the decoder weights." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_clipseg_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/clvp/convert_clvp_to_hf.py b/src/transformers/models/clvp/convert_clvp_to_hf.py deleted file mode 100644 index 89babb3c4c..0000000000 --- a/src/transformers/models/clvp/convert_clvp_to_hf.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Weights conversion script for CLVP -""" - -import argparse -import os - -import torch -from huggingface_hub import hf_hub_download - -from transformers import ClvpConfig, ClvpModelForConditionalGeneration - - -_MODELS = { - "clvp": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/clvp2.pth", - "decoder": "https://huggingface.co/jbetker/tortoise-tts-v2/blob/main/.models/autoregressive.pth", -} - -dim = 1024 -sub_dim = dim // 16 - -CLVP_ENCODERS_MAPPING = { - "text_transformer.transformer.attn_layers": "text_encoder_model", - "speech_transformer.transformer.attn_layers": "speech_encoder_model", - "text_transformer.transformer.norm": "text_encoder_model.final_layer_norm", - "speech_transformer.transformer.norm": "speech_encoder_model.final_layer_norm", - "to_text_latent": "text_encoder_model.projection", - "to_speech_latent": "speech_encoder_model.projection", - "text_emb": "text_encoder_model.token_embedding", - "speech_emb": "speech_encoder_model.token_embedding", - "1.wrap.net.0": "mlp.fc1", - "1.wrap.net.3": "mlp.fc2", - "1.wrap": "self_attn", - "to_out": "out_proj", - "to_q": "q_proj", - "to_k": "k_proj", - "to_v": "v_proj", - "temperature": "logit_scale", -} - -CLVP_DECODER_MAPPING = { - "conditioning_encoder.init": "conditioning_encoder.mel_conv", - "conditioning_encoder.attn": "conditioning_encoder.mel_attn_blocks", - "mel_attn_blocks": "group_norms", - ".norm.weight": ".weight", - ".norm.bias": ".bias", - "text_embedding": "conditioning_encoder.text_token_embedding", - "text_pos_embedding.emb": "conditioning_encoder.text_position_embedding", - "final_norm": "speech_decoder_model.final_norm", - "mel_head": "speech_decoder_model.lm_head", - "gpt.ln_f": "speech_decoder_model.model.decoder.layer_norm", - "mel_embedding": "speech_decoder_model.model.decoder.input_embeds_layer", - "mel_pos_embedding.emb": "speech_decoder_model.model.decoder.position_embeds_layer", - "gpt.h": "speech_decoder_model.model.decoder.layers", - "ln_1": "input_layernorm", - "ln_2": "post_attention_layernorm", -} - - -def update_index(present_index): - if present_index % 2 == 0: - return int(present_index / 2) - else: - return int((present_index - 1) / 2) - - -def convert_encoder_weights(original_weights): - converted_weights = {} - original_weights_keys = sorted(original_weights.keys()) - for original_key in original_weights_keys: - updated_key = original_key - # for input_rmsnorm.weight and post_attention_rmsnorm.weight - if "0.0.g" in updated_key: - present_index = updated_key.split(".")[4] - if int(present_index) % 2 == 0: - updated_key = updated_key.replace("0.0.g", "input_rmsnorm.weight") - else: - updated_key = updated_key.replace("0.0.g", "post_attention_rmsnorm.weight") - - if "transformer.attn_layers.layers" in updated_key: - present_index = updated_key.split(".")[4] - updated_index = update_index(int(present_index)) - updated_key = updated_key.replace( - f"transformer.attn_layers.layers.{present_index}", f"transformer.attn_layers.layers.{updated_index}" - ) - - for k, v in CLVP_ENCODERS_MAPPING.items(): - if k in updated_key: - updated_key = updated_key.replace(k, v) - - converted_weights[updated_key] = original_weights.pop(original_key) - - return converted_weights - - -def convert_decoder_weights(original_weights): - converted_weights = {} - original_weights_keys = sorted(original_weights.keys()) - for original_key in original_weights_keys: - updated_key = original_key - if len(updated_key.split(".")) > 3: - index, attr = updated_key.split(".")[2], updated_key.split(".")[-1] - - # for decoder attention - if "attn.c_attn" in updated_key: - if attr == "weight": - slice1, slice2, slice3 = original_weights[updated_key].squeeze(-1).T.split(split_size=dim, dim=0) - else: - slice1, slice2, slice3 = original_weights[updated_key].split(split_size=dim, dim=0) - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.q_proj.{attr}"] = slice1 - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.k_proj.{attr}"] = slice2 - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.v_proj.{attr}"] = slice3 - continue - - if "attn.c_proj" in updated_key: - converted_weights[f"speech_decoder_model.model.decoder.layers.{index}.attn.out_proj.{attr}"] = ( - original_weights[updated_key].squeeze(-1).T - ) - continue - - if "attn.bias" in updated_key or "attn.masked_bias" in updated_key or "text_head" in updated_key: - original_weights.pop(updated_key) - continue - - # conditional encoder attention - if "qkv" in updated_key: - if attr == "weight": - slice1, slice2, slice3 = original_weights[updated_key].squeeze(-1).split(split_size=dim, dim=0) - else: - slice1, slice2, slice3 = original_weights[updated_key].split(split_size=dim, dim=0) - - indices = torch.arange(dim) - index1, index2, index3 = ( - indices.unfold(0, sub_dim, sub_dim * 3).flatten(), - indices[sub_dim:].unfold(0, sub_dim, sub_dim * 3).flatten(), - indices[2 * sub_dim :].unfold(0, sub_dim, sub_dim * 3).flatten(), - ) - - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.q_proj.{attr}"] = torch.concatenate( - [slice1[index1], slice2[index3], slice3[index2]], - axis=0, - ) - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.k_proj.{attr}"] = torch.concatenate( - [slice1[index2], slice2[index1], slice3[index3]], - axis=0, - ) - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.v_proj.{attr}"] = torch.concatenate( - [slice1[index3], slice2[index2], slice3[index1]], - axis=0, - ) - continue - - if "proj_out" in updated_key: - converted_weights[f"conditioning_encoder.mel_attn_blocks.{index}.out_proj.{attr}"] = original_weights[ - updated_key - ].squeeze(-1) - continue - - for k, v in CLVP_DECODER_MAPPING.items(): - if k in updated_key: - updated_key = updated_key.replace(k, v) - - converted_weights[updated_key] = original_weights.pop(original_key) - - return converted_weights - - -def _download(url: str, root: str): - repo_id = f"{url.split('/')[3]}/{url.split('/')[4]}" - filename = f"{url.split('/')[-2]}/{url.split('/')[-1]}" - hf_hub_download( - repo_id=repo_id, - filename=filename, - force_filename=root, - local_dir_use_symlinks=False, - ) - - -def convert_clvp_weights(checkpoint_path, pytorch_dump_folder_path): - converted_checkpoint = {} - - for each_model_name, each_model_url in _MODELS.items(): - each_model_path = os.path.join(checkpoint_path, each_model_url.split("/")[-1]) - if not os.path.exists(each_model_path): - print(f"\n{each_model_name} was not found! Downloading it to {each_model_path}") - _download(url=each_model_url, root=each_model_path) - - if each_model_name == "clvp": - clvp_checkpoint = torch.load(each_model_path, map_location="cpu", weights_only=True) - else: - decoder_checkpoint = torch.load(each_model_path, map_location="cpu", weights_only=True) - - # Converting the weights - converted_checkpoint.update(**convert_encoder_weights(clvp_checkpoint)) - converted_checkpoint.update(**convert_decoder_weights(decoder_checkpoint)) - - config = ClvpConfig.from_pretrained("susnato/clvp_dev") - model = ClvpModelForConditionalGeneration(config) - - model.load_state_dict(converted_checkpoint, strict=True) - model.save_pretrained(pytorch_dump_folder_path) - print(f"Model saved at {pytorch_dump_folder_path}!") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # # Required parameters - parser.add_argument( - "--checkpoint_path", type=str, help="Path to the folder of downloaded checkpoints. (Please enter full path)" - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model. (Please enter full path)", - ) - args = parser.parse_args() - - convert_clvp_weights(args.checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/colpali/convert_colpali_weights_to_hf.py b/src/transformers/models/colpali/convert_colpali_weights_to_hf.py deleted file mode 100644 index 1b30f3f97a..0000000000 --- a/src/transformers/models/colpali/convert_colpali_weights_to_hf.py +++ /dev/null @@ -1,214 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert ColPali weights from the original repository to the HF model format. - -Original repository: https://github.com/illuin-tech/colpali. - -NOTE: This script was originally run using `torch==2.5.1` and with: - -```bash -python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.2-merged \ - --revision 89fd9736194236a1ecb7a9ec9b04f537f6f896af \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.2-hf-internal \ - --push_to_hub - -python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.3-merged \ - --revision 5b955e3415a7c5468ab33119d98d6d45c3a5b2c3 \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.3-hf \ - --push_to_hub -``` -""" - -import argparse -import glob -from pathlib import Path -from typing import Any, Dict, Optional - -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import AutoConfig -from transformers.models.colpali import ColPaliForRetrieval -from transformers.models.colpali.configuration_colpali import ColPaliConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -ORIGINAL_DTYPE = torch.bfloat16 - - -def rename_state_dict_keys(state_dict: Dict[str, Any]) -> Dict[str, Any]: - new_state_dict = {} - for key, value in state_dict.items(): - new_key = key - if key.startswith("custom_text_proj"): - new_key = key.replace("custom_text_proj", "embedding_proj_layer") - if key.startswith("model."): - new_key = key.replace("model.", "vlm.", 1) - new_state_dict[new_key] = value - return new_state_dict - - -def load_original_state_dict(model_id: str, revision: Optional[str] = None) -> Dict[str, torch.Tensor]: - directory_path = snapshot_download( - repo_id=model_id, - revision=revision, - allow_patterns=["*.safetensors"], - ) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - # Some weights are tied, so `lm.head`` is not saved. Let's clone to load state dict. - if "lm_head.weight" not in original_state_dict: - original_state_dict["vlm.language_model.lm_head.weight"] = original_state_dict[ - "model.language_model.model.embed_tokens.weight" - ].clone() - - return original_state_dict - - -@torch.no_grad() -def convert_colpali_weights_to_hf( - model_id: str, - output_dir: str, - push_to_hub: bool, - revision: Optional[str] = None, - original_vlm_name_or_path: Optional[str] = None, -): - # Load the original model data - original_config = AutoConfig.from_pretrained( - model_id, - revision=revision, - ) - if original_vlm_name_or_path is not None: - original_config._name_or_path = original_vlm_name_or_path - if hasattr(original_config, "architectures"): - delattr(original_config, "architectures") - - original_state_dict = load_original_state_dict(model_id, revision=revision) - - # Format the state_dict keys - original_state_dict = rename_state_dict_keys(original_state_dict) - - # Create the new config - config = ColPaliConfig( - vlm_config=original_config, - embedding_dim=128, # hardcoded in the original model - ) - config.model_type = "colpali" - config.is_composition = False - - # Load the untrained model - model = ColPaliForRetrieval(config=config).to("cpu").eval() - print("Created model with new config and randomly initialized weights") - - # NOTE: The model was initialized with float32 weights. We need to convert it to the desired precision. - # There are two ways to set the model's dtype: - # - Using `model.from_pretrained(..., torch_dtype=dtype_precision)` doesn't convert the hyperparameters to the desired precision. - # - Using `model.to(dtype_precision)` converts all values - including the hyperparameters - to the desired precision. - # The following snippet allows a fine-grained control over the model's dtype, making sure that all - # the new weights' dtypes match the original model. - for param in model.parameters(): - param.data = param.data.to(ORIGINAL_DTYPE) - print(f"Converted the new model weights to `{ORIGINAL_DTYPE}`") - - # Load the original weights - model.load_state_dict(original_state_dict) - print("Loaded original model weights") - - # Tie the weights (following ColPali's `__init__`` step) - if model.vlm.language_model._tied_weights_keys is not None: - model._tied_weights_keys = [f"vlm.language_model.{k}" for k in model.vlm.language_model._tied_weights_keys] - - # Sanity check: ensure all keys are the same - state_dict_keys_old = set(original_state_dict.keys()) - state_dict_keys_new = set(model.state_dict().keys()) - disjoint_keys = state_dict_keys_old.symmetric_difference(state_dict_keys_new) - if disjoint_keys: - raise ValueError(f"Incompatible keys: {disjoint_keys}") - - # Save the model - if push_to_hub: - model.push_to_hub(output_dir, private=True) - print(f"Model pushed to the hub at `{output_dir}`") - else: - Path(output_dir).mkdir(exist_ok=True, parents=True) - model.save_pretrained(output_dir) - print(f"Model saved to `{output_dir}`") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=""" - This script converts the original ColPali model to the HF model format. - - Example usage: - ```bash - python src/transformers/models/colpali/convert_colpali_weights_to_hf.py \ - --model_id vidore/colpali-v1.2-merged \ - --revision 89fd9736194236a1ecb7a9ec9b04f537f6f896af \ - --original_vlm_name_or_path google/paligemma-3b-mix-448 \ - --output_dir vidore/colpali-v1.2-hf \ - --push_to_hub - ``` - """ - ) - parser.add_argument( - "--model_id", - help="Model ID of the original model to convert", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally", - action="store_true", - default=False, - ) - parser.add_argument( - "--revision", - help="Revision of the model to download", - default=None, - ) - parser.add_argument( - "--original_vlm_name_or_path", - help="Name or path of the original VLM backbone model", - default=None, - ) - args = parser.parse_args() - - convert_colpali_weights_to_hf( - model_id=args.model_id, - output_dir=args.output_dir, - push_to_hub=args.push_to_hub, - revision=args.revision, - original_vlm_name_or_path=args.original_vlm_name_or_path, - ) diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 91f00668be..0000000000 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,324 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Conditional DETR checkpoints.""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ( - ConditionalDetrConfig, - ConditionalDetrForObjectDetection, - ConditionalDetrForSegmentation, - ConditionalDetrImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# here we list all keys to be renamed (original name on the left, our name on the right) -rename_keys = [] -for i in range(6): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.cross_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.cross_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - - # q, k, v projections in self/cross-attention in decoder for conditional DETR - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qcontent_proj.weight", f"decoder.layers.{i}.sa_qcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kcontent_proj.weight", f"decoder.layers.{i}.sa_kcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qpos_proj.weight", f"decoder.layers.{i}.sa_qpos_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kpos_proj.weight", f"decoder.layers.{i}.sa_kpos_proj.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.weight", f"decoder.layers.{i}.sa_v_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qcontent_proj.weight", f"decoder.layers.{i}.ca_qcontent_proj.weight") - ) - # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.weight", f"decoder.layers.{i}.ca_qpos_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kcontent_proj.weight", f"decoder.layers.{i}.ca_kcontent_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kpos_proj.weight", f"decoder.layers.{i}.ca_kpos_proj.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.weight", f"decoder.layers.{i}.ca_v_proj.weight")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.weight", f"decoder.layers.{i}.ca_qpos_sine_proj.weight") - ) - - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_qcontent_proj.bias", f"decoder.layers.{i}.sa_qcontent_proj.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.sa_kcontent_proj.bias", f"decoder.layers.{i}.sa_kcontent_proj.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_qpos_proj.bias", f"decoder.layers.{i}.sa_qpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_kpos_proj.bias", f"decoder.layers.{i}.sa_kpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.sa_v_proj.bias", f"decoder.layers.{i}.sa_v_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qcontent_proj.bias", f"decoder.layers.{i}.ca_qcontent_proj.bias") - ) - # rename_keys.append((f"transformer.decoder.layers.{i}.ca_qpos_proj.bias", f"decoder.layers.{i}.ca_qpos_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_kcontent_proj.bias", f"decoder.layers.{i}.ca_kcontent_proj.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_kpos_proj.bias", f"decoder.layers.{i}.ca_kpos_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.ca_v_proj.bias", f"decoder.layers.{i}.ca_v_proj.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.ca_qpos_sine_proj.bias", f"decoder.layers.{i}.ca_qpos_sine_proj.bias") - ) - -# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads -# for conditional DETR, also convert reference point head and query scale MLP -rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ("transformer.decoder.ref_point_head.layers.0.weight", "decoder.ref_point_head.layers.0.weight"), - ("transformer.decoder.ref_point_head.layers.0.bias", "decoder.ref_point_head.layers.0.bias"), - ("transformer.decoder.ref_point_head.layers.1.weight", "decoder.ref_point_head.layers.1.weight"), - ("transformer.decoder.ref_point_head.layers.1.bias", "decoder.ref_point_head.layers.1.bias"), - ("transformer.decoder.query_scale.layers.0.weight", "decoder.query_scale.layers.0.weight"), - ("transformer.decoder.query_scale.layers.0.bias", "decoder.query_scale.layers.0.bias"), - ("transformer.decoder.query_scale.layers.1.weight", "decoder.query_scale.layers.1.weight"), - ("transformer.decoder.query_scale.layers.1.bias", "decoder.query_scale.layers.1.bias"), - ("transformer.decoder.layers.0.ca_qpos_proj.weight", "decoder.layers.0.ca_qpos_proj.weight"), - ("transformer.decoder.layers.0.ca_qpos_proj.bias", "decoder.layers.0.ca_qpos_proj.bias"), - ] -) - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def rename_backbone_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if "backbone.0.body" in key: - new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") - new_state_dict[new_key] = value - else: - new_state_dict[key] = value - - return new_state_dict - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "conditional_detr." - - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_conditional_detr_checkpoint(model_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our CONDITIONAL_DETR structure. - """ - - # load default config - config = ConditionalDetrConfig() - # set backbone and dilation attributes - if "resnet101" in model_name: - config.backbone = "resnet101" - if "dc5" in model_name: - config.dilation = True - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - format = "coco_panoptic" if is_panoptic else "coco_detection" - image_processor = ConditionalDetrImageProcessor(format=format) - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info(f"Converting model {model_name}...") - - # load original model from torch hub - conditional_detr = torch.hub.load("DeppMeng/ConditionalDETR", model_name, pretrained=True).eval() - state_dict = conditional_detr.state_dict() - # rename keys - for src, dest in rename_keys: - if is_panoptic: - src = "conditional_detr." + src - rename_key(state_dict, src, dest) - state_dict = rename_backbone_keys(state_dict) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "conditional_detr.model." if is_panoptic else "model." - for key in state_dict.copy().keys(): - if is_panoptic: - if ( - key.startswith("conditional_detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["conditional_detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["conditional_detr." + key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = ConditionalDetrForSegmentation(config) if is_panoptic else ConditionalDetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - model.push_to_hub(repo_id=model_name, organization="DepuMeng", commit_message="Add model") - # verify our conversion - original_outputs = conditional_detr(pixel_values) - outputs = model(pixel_values) - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - default="conditional_detr_resnet50", - type=str, - help="Name of the CONDITIONAL_DETR model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_conditional_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py b/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py deleted file mode 100644 index 3d4ff77987..0000000000 --- a/src/transformers/models/convbert/convert_convbert_original_tf1_checkpoint_to_pytorch_and_tf2.py +++ /dev/null @@ -1,57 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ConvBERT checkpoint.""" - -import argparse - -from transformers import ConvBertConfig, ConvBertModel, TFConvBertModel, load_tf_weights_in_convbert -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_orig_tf1_checkpoint_to_pytorch(tf_checkpoint_path, convbert_config_file, pytorch_dump_path): - conf = ConvBertConfig.from_json_file(convbert_config_file) - model = ConvBertModel(conf) - - model = load_tf_weights_in_convbert(model, conf, tf_checkpoint_path) - model.save_pretrained(pytorch_dump_path) - - tf_model = TFConvBertModel.from_pretrained(pytorch_dump_path, from_pt=True) - tf_model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--convbert_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained ConvBERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_orig_tf1_checkpoint_to_pytorch(args.tf_checkpoint_path, args.convbert_config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/convnext/convert_convnext_to_pytorch.py b/src/transformers/models/convnext/convert_convnext_to_pytorch.py deleted file mode 100644 index 27315ed73f..0000000000 --- a/src/transformers/models/convnext/convert_convnext_to_pytorch.py +++ /dev/null @@ -1,242 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ConvNext checkpoints from the original repository. - -URL: https://github.com/facebookresearch/ConvNeXt""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ConvNextConfig, ConvNextForImageClassification, ConvNextImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_convnext_config(checkpoint_url): - config = ConvNextConfig() - - if "tiny" in checkpoint_url: - depths = [3, 3, 9, 3] - hidden_sizes = [96, 192, 384, 768] - if "small" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [96, 192, 384, 768] - if "base" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [128, 256, 512, 1024] - if "large" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [192, 384, 768, 1536] - if "xlarge" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [256, 512, 1024, 2048] - - if "1k" in checkpoint_url: - num_labels = 1000 - filename = "imagenet-1k-id2label.json" - expected_shape = (1, 1000) - else: - num_labels = 21841 - filename = "imagenet-22k-id2label.json" - expected_shape = (1, 21841) - - repo_id = "huggingface/label-files" - config.num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - if "1k" not in checkpoint_url: - # this dataset contains 21843 labels but the model only has 21841 - # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18 - del id2label[9205] - del id2label[15027] - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.hidden_sizes = hidden_sizes - config.depths = depths - - return config, expected_shape - - -def rename_key(name): - if "downsample_layers.0.0" in name: - name = name.replace("downsample_layers.0.0", "embeddings.patch_embeddings") - if "downsample_layers.0.1" in name: - name = name.replace("downsample_layers.0.1", "embeddings.norm") # we rename to layernorm later on - if "downsample_layers.1.0" in name: - name = name.replace("downsample_layers.1.0", "stages.1.downsampling_layer.0") - if "downsample_layers.1.1" in name: - name = name.replace("downsample_layers.1.1", "stages.1.downsampling_layer.1") - if "downsample_layers.2.0" in name: - name = name.replace("downsample_layers.2.0", "stages.2.downsampling_layer.0") - if "downsample_layers.2.1" in name: - name = name.replace("downsample_layers.2.1", "stages.2.downsampling_layer.1") - if "downsample_layers.3.0" in name: - name = name.replace("downsample_layers.3.0", "stages.3.downsampling_layer.0") - if "downsample_layers.3.1" in name: - name = name.replace("downsample_layers.3.1", "stages.3.downsampling_layer.1") - if "stages" in name and "downsampling_layer" not in name: - # stages.0.0. for instance should be renamed to stages.0.layers.0. - name = name[: len("stages.0")] + ".layers" + name[len("stages.0") :] - if "stages" in name: - name = name.replace("stages", "encoder.stages") - if "norm" in name: - name = name.replace("norm", "layernorm") - if "gamma" in name: - name = name.replace("gamma", "layer_scale_parameter") - if "head" in name: - name = name.replace("head", "classifier") - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_convnext_checkpoint(checkpoint_url, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our ConvNext structure. - """ - - # define ConvNext configuration based on URL - config, expected_shape = get_convnext_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # add prefix to all keys expect classifier head - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - if not key.startswith("classifier"): - key = "convnext." + key - state_dict[key] = val - - # load HuggingFace model - model = ConvNextForImageClassification(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image, prepared by ConvNextImageProcessor - size = 224 if "224" in checkpoint_url else 384 - image_processor = ConvNextImageProcessor(size=size) - pixel_values = image_processor(images=prepare_img(), return_tensors="pt").pixel_values - - logits = model(pixel_values).logits - - # note: the logits below were obtained without center cropping - if checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth": - expected_logits = torch.tensor([-0.1210, -0.6605, 0.1918]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth": - expected_logits = torch.tensor([-0.4473, -0.1847, -0.6365]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth": - expected_logits = torch.tensor([0.4525, 0.7539, 0.0308]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_384.pth": - expected_logits = torch.tensor([0.3561, 0.6350, -0.0384]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth": - expected_logits = torch.tensor([0.4174, -0.0989, 0.1489]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_384.pth": - expected_logits = torch.tensor([0.2513, -0.1349, -0.1613]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth": - expected_logits = torch.tensor([1.2980, 0.3631, -0.1198]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth": - expected_logits = torch.tensor([1.2963, 0.1227, 0.1723]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth": - expected_logits = torch.tensor([1.7956, 0.8390, 0.2820]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth": - expected_logits = torch.tensor([-0.2822, -0.0502, -0.0878]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth": - expected_logits = torch.tensor([-0.5672, -0.0730, -0.4348]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth": - expected_logits = torch.tensor([0.2681, 0.2365, 0.6246]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth": - expected_logits = torch.tensor([-0.2642, 0.3931, 0.5116]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth": - expected_logits = torch.tensor([-0.6677, -0.1873, -0.8379]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth": - expected_logits = torch.tensor([-0.7749, -0.2967, -0.6444]) - else: - raise ValueError(f"Unknown URL: {checkpoint_url}") - - assert torch.allclose(logits[0, :3], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - print("Pushing model to the hub...") - model_name = "convnext" - if "tiny" in checkpoint_url: - model_name += "-tiny" - elif "small" in checkpoint_url: - model_name += "-small" - elif "base" in checkpoint_url: - model_name += "-base" - elif "xlarge" in checkpoint_url: - model_name += "-xlarge" - elif "large" in checkpoint_url: - model_name += "-large" - if "224" in checkpoint_url: - model_name += "-224" - elif "384" in checkpoint_url: - model_name += "-384" - if "22k" in checkpoint_url and "1k" not in checkpoint_url: - model_name += "-22k" - if "22k" in checkpoint_url and "1k" in checkpoint_url: - model_name += "-22k-1k" - - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", - type=str, - help="URL of the original ConvNeXT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model directory.", - ) - - args = parser.parse_args() - convert_convnext_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py b/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py deleted file mode 100644 index 8094ecf0d6..0000000000 --- a/src/transformers/models/convnextv2/convert_convnextv2_to_pytorch.py +++ /dev/null @@ -1,286 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ConvNeXTV2 checkpoints from the original repository. - -URL: https://github.com/facebookresearch/ConvNeXt""" - -import argparse -import json -import os - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import ConvNextImageProcessor, ConvNextV2Config, ConvNextV2ForImageClassification -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_convnextv2_config(checkpoint_url): - config = ConvNextV2Config() - - if "atto" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [40, 80, 160, 320] - if "femto" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [48, 96, 192, 384] - if "pico" in checkpoint_url: - depths = [2, 2, 6, 2] - hidden_sizes = [64, 128, 256, 512] - if "nano" in checkpoint_url: - depths = [2, 2, 8, 2] - hidden_sizes = [80, 160, 320, 640] - if "tiny" in checkpoint_url: - depths = [3, 3, 9, 3] - hidden_sizes = [96, 192, 384, 768] - if "base" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [128, 256, 512, 1024] - if "large" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [192, 384, 768, 1536] - if "huge" in checkpoint_url: - depths = [3, 3, 27, 3] - hidden_sizes = [352, 704, 1408, 2816] - - num_labels = 1000 - filename = "imagenet-1k-id2label.json" - expected_shape = (1, 1000) - - repo_id = "huggingface/label-files" - config.num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.hidden_sizes = hidden_sizes - config.depths = depths - - return config, expected_shape - - -def rename_key(name): - if "downsample_layers.0.0" in name: - name = name.replace("downsample_layers.0.0", "embeddings.patch_embeddings") - if "downsample_layers.0.1" in name: - name = name.replace("downsample_layers.0.1", "embeddings.norm") # we rename to layernorm later on - if "downsample_layers.1.0" in name: - name = name.replace("downsample_layers.1.0", "stages.1.downsampling_layer.0") - if "downsample_layers.1.1" in name: - name = name.replace("downsample_layers.1.1", "stages.1.downsampling_layer.1") - if "downsample_layers.2.0" in name: - name = name.replace("downsample_layers.2.0", "stages.2.downsampling_layer.0") - if "downsample_layers.2.1" in name: - name = name.replace("downsample_layers.2.1", "stages.2.downsampling_layer.1") - if "downsample_layers.3.0" in name: - name = name.replace("downsample_layers.3.0", "stages.3.downsampling_layer.0") - if "downsample_layers.3.1" in name: - name = name.replace("downsample_layers.3.1", "stages.3.downsampling_layer.1") - if "stages" in name and "downsampling_layer" not in name: - # stages.0.0. for instance should be renamed to stages.0.layers.0. - name = name[: len("stages.0")] + ".layers" + name[len("stages.0") :] - if "gamma" in name: - name = name.replace("gamma", "weight") - if "beta" in name: - name = name.replace("beta", "bias") - if "stages" in name: - name = name.replace("stages", "encoder.stages") - if "norm" in name: - name = name.replace("norm", "layernorm") - if "head" in name: - name = name.replace("head", "classifier") - - return name - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def convert_preprocessor(checkpoint_url): - if "224" in checkpoint_url: - size = 224 - crop_pct = 224 / 256 - elif "384" in checkpoint_url: - size = 384 - crop_pct = None - else: - size = 512 - crop_pct = None - - return ConvNextImageProcessor( - size=size, - crop_pct=crop_pct, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - resample=PILImageResampling.BICUBIC, - ) - - -@torch.no_grad() -def convert_convnextv2_checkpoint(checkpoint_url, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our ConvNeXTV2 structure. - """ - print("Downloading original model from checkpoint...") - # define ConvNeXTV2 configuration based on URL - config, expected_shape = get_convnextv2_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)["model"] - - print("Converting model parameters...") - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # add prefix to all keys expect classifier head - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - if not key.startswith("classifier"): - key = "convnextv2." + key - state_dict[key] = val - - # load HuggingFace model - model = ConvNextV2ForImageClassification(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image, prepared by ConvNextImageProcessor - preprocessor = convert_preprocessor(checkpoint_url) - inputs = preprocessor(images=prepare_img(), return_tensors="pt") - logits = model(**inputs).logits - - # note: the logits below were obtained without center cropping - if checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt": - expected_logits = torch.tensor([-0.3930, 0.1747, -0.5246, 0.4177, 0.4295]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_femto_1k_224_ema.pt": - expected_logits = torch.tensor([-0.1727, -0.5341, -0.7818, -0.4745, -0.6566]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_pico_1k_224_ema.pt": - expected_logits = torch.tensor([-0.0333, 0.1563, -0.9137, 0.1054, 0.0381]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_nano_1k_224_ema.pt": - expected_logits = torch.tensor([-0.1744, -0.1555, -0.0713, 0.0950, -0.1431]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_tiny_1k_224_ema.pt": - expected_logits = torch.tensor([0.9996, 0.1966, -0.4386, -0.3472, 0.6661]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_base_1k_224_ema.pt": - expected_logits = torch.tensor([-0.2553, -0.6708, -0.1359, 0.2518, -0.2488]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_large_1k_224_ema.pt": - expected_logits = torch.tensor([-0.0673, -0.5627, -0.3753, -0.2722, 0.0178]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_huge_1k_224_ema.pt": - expected_logits = torch.tensor([-0.6377, -0.7458, -0.2150, 0.1184, -0.0597]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_224_ema.pt": - expected_logits = torch.tensor([1.0799, 0.2322, -0.8860, 1.0219, 0.6231]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_384_ema.pt": - expected_logits = torch.tensor([0.3766, 0.4917, -1.1426, 0.9942, 0.6024]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_224_ema.pt": - expected_logits = torch.tensor([0.4220, -0.6919, -0.4317, -0.2881, -0.6609]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_384_ema.pt": - expected_logits = torch.tensor([0.1082, -0.8286, -0.5095, 0.4681, -0.8085]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_224_ema.pt": - expected_logits = torch.tensor([-0.2419, -0.6221, 0.2176, -0.0980, -0.7527]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_384_ema.pt": - expected_logits = torch.tensor([0.0391, -0.4371, 0.3786, 0.1251, -0.2784]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_224_ema.pt": - expected_logits = torch.tensor([-0.0504, 0.5636, -0.1729, -0.6507, -0.3949]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_384_ema.pt": - expected_logits = torch.tensor([0.3560, 0.9486, 0.3149, -0.2667, -0.5138]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_384_ema.pt": - expected_logits = torch.tensor([-0.2469, -0.4550, -0.5853, -0.0810, 0.0309]) - elif checkpoint_url == "https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_512_ema.pt": - expected_logits = torch.tensor([-0.3090, 0.0802, -0.0682, -0.1979, -0.2826]) - else: - raise ValueError(f"Unknown URL: {checkpoint_url}") - - assert torch.allclose(logits[0, :5], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - print("Model outputs match the original results!") - - if save_model: - print("Saving model to local...") - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - - model.save_pretrained(pytorch_dump_folder_path) - preprocessor.save_pretrained(pytorch_dump_folder_path) - - model_name = "convnextv2" - if "atto" in checkpoint_url: - model_name += "-atto" - if "femto" in checkpoint_url: - model_name += "-femto" - if "pico" in checkpoint_url: - model_name += "-pico" - if "nano" in checkpoint_url: - model_name += "-nano" - elif "tiny" in checkpoint_url: - model_name += "-tiny" - elif "base" in checkpoint_url: - model_name += "-base" - elif "large" in checkpoint_url: - model_name += "-large" - elif "huge" in checkpoint_url: - model_name += "-huge" - if "22k" in checkpoint_url and "1k" not in checkpoint_url: - model_name += "-22k" - elif "22k" in checkpoint_url and "1k" in checkpoint_url: - model_name += "-22k-1k" - elif "1k" in checkpoint_url: - model_name += "-1k" - if "224" in checkpoint_url: - model_name += "-224" - elif "384" in checkpoint_url: - model_name += "-384" - elif "512" in checkpoint_url: - model_name += "-512" - - if push_to_hub: - print(f"Pushing {model_name} to the hub...") - model.push_to_hub(model_name) - preprocessor.push_to_hub(model_name) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt", - type=str, - help="URL of the original ConvNeXTV2 checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image preprocessor to the hub") - - args = parser.parse_args() - convert_convnextv2_checkpoint( - args.checkpoint_url, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub - ) diff --git a/src/transformers/models/csm/convert_csm.py b/src/transformers/models/csm/convert_csm.py deleted file mode 100644 index dc84e2cf3d..0000000000 --- a/src/transformers/models/csm/convert_csm.py +++ /dev/null @@ -1,339 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import gc -import os -import re - -import torch -from tokenizers.processors import TemplateProcessing - -from transformers import ( - AutoFeatureExtractor, - AutoTokenizer, - CsmConfig, - CsmDepthDecoderConfig, - CsmForConditionalGeneration, - CsmProcessor, - MimiModel, -) -from transformers.utils.hub import cached_file - - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"backbone\.layers\.(\d+)": r"backbone_model.layers.\1", - r"decoder\.layers\.(\d+)": r"depth_decoder.model.layers.\1", - - r"attn": r"self_attn", - r"output_proj": r"o_proj", - r"w1": r"gate_proj", - r"w2": r"down_proj", - r"w3": r"up_proj", - - r"text_embeddings": r"embed_text_tokens", - r"audio_embeddings": r"backbone_model.embed_tokens.embed_audio_tokens", - - r"codebook0_head": r"lm_head", - r"audio_head": r"depth_decoder.codebooks_head.weight", - r"projection": r"depth_decoder.model.inputs_embeds_projector", - - r"sa_norm.scale": r"input_layernorm.weight", - r"mlp_norm.scale": r"post_attention_layernorm.weight", - r"decoder.norm.scale": r"depth_decoder.model.norm.weight", - r"backbone.norm.scale": r"backbone_model.norm.weight", -} -# fmt: on - - -def permute_for_rope(input_tensor, n_heads, dim1, dim2): - """ - When you go from the complex ROPE formulation to sin and cos one, you need - to permute the query and key weights (to avoid doing it on the fly) - """ - input_tensor = input_tensor.reshape(dim1, dim2) - input_tensor = input_tensor.view(n_heads, dim1 // n_heads // 2, 2, dim2) - input_tensor = input_tensor.transpose(1, 2).reshape(dim1, dim2) - return input_tensor - - -def convert_key(key, mapping): - for pattern, replacement in mapping.items(): - key = re.sub(pattern, replacement, key) - return key - - -def write_model( - input_path_or_repo, - model_name, - codec_model_path_or_repo, - output_dir, - safe_serialization=True, -): - print("Converting the model.") - os.makedirs(output_dir, exist_ok=True) - - codec_model = MimiModel.from_pretrained(codec_model_path_or_repo) - codec_model.config._attn_implementation_autoset = False - - # prepare rope scaling args: the model uses originally - # 1 - for the depth decoder - # rope_theta=500000, - # rope_scaling={ - # "factor": 32.0, - # "high_freq_factor": 4.0, - # "low_freq_factor": 1.0, - # "original_max_position_embeddings": 8192, - # "rope_type": "llama3", - # }, - # 2 - for the backbone - # rope_theta=500000, - # rope_scaling={ - # "factor": 32.0, - # "high_freq_factor": 4.0, - # "low_freq_factor": 1.0, - # "original_max_position_embeddings": 8192, - # "rope_type": "llama3", - # }, - # - # Yet we want to use max_position_embeddings=32, resp. 2048 - # This will throw warning as we would have original_max_position_embeddings >= max_position_embeddings - # Therefore, we convert values to equivalent ones - - depth_decoder_config = CsmDepthDecoderConfig( - rope_scaling={ - "factor": 32.0, - "high_freq_factor": 0.0078125, - "low_freq_factor": 0.001953125, - "original_max_position_embeddings": 16, - "rope_type": "llama3", - }, - ) - - config = CsmConfig( - codec_config=codec_model.config, - depth_decoder_config=depth_decoder_config, - rope_scaling={ - "factor": 32.0, - "high_freq_factor": 0.5, - "low_freq_factor": 0.125, - "original_max_position_embeddings": 1024, - "rope_type": "llama3", - }, - ) - - params = { - "backbone": { - "num_attention_heads": config.num_attention_heads, - "num_key_value_heads": config.num_key_value_heads, - "dim_per_head": config.head_dim, - "key_value_dim": config.head_dim * config.num_key_value_heads, - "dim": config.hidden_size, - }, - "depth_decoder": { - "num_attention_heads": config.depth_decoder_config.num_attention_heads, - "num_key_value_heads": config.depth_decoder_config.num_key_value_heads, - "dim_per_head": config.depth_decoder_config.head_dim, - "key_value_dim": config.depth_decoder_config.head_dim * config.depth_decoder_config.num_key_value_heads, - "dim": config.depth_decoder_config.hidden_size, - }, - } - - model_path = cached_file( - input_path_or_repo, - model_name, - ) - print(f"Fetching all parameters from the checkpoint at {model_path}...") - loaded = torch.load(model_path, map_location="cpu") - - print("Converting model...") - state_dict = {} - - # ----------------------- - # convert parameter names - # ----------------------- - - # Add codec_model. prefix to every key in the codec model state dict - codec_state_dict = {f"codec_model.{k}": v for k, v in codec_model.state_dict().items()} - state_dict.update(codec_state_dict) - - for key, value in loaded.items(): - new_key = convert_key(key, ORIGINAL_TO_CONVERTED_KEY_MAPPING) - current_parameter = value - - # Post-process the current_parameter. - if re.search("(k|q)_proj.weight", new_key): - params_keys = "backbone" if "backbone" in new_key else "depth_decoder" - if "q_proj" in new_key: - num_heads = params[params_keys]["num_attention_heads"] - dim_per_head = params[params_keys]["dim_per_head"] - param_dim = params[params_keys]["dim"] - dim = params[params_keys]["dim"] - else: - num_heads = params[params_keys]["num_key_value_heads"] - dim_per_head = params[params_keys]["dim_per_head"] - param_dim = params[params_keys]["key_value_dim"] - dim = params[params_keys]["dim"] - - current_parameter = permute_for_rope(value, num_heads, param_dim, dim) - state_dict[new_key] = current_parameter.reshape(num_heads * dim_per_head, dim) - - state_dict[new_key] = current_parameter - - # add the depth decoder embed audio tokens weights, latter tied to the backbone embed audio tokens weights - state_dict["depth_decoder.model.embed_tokens.weight"] = state_dict[ - "backbone_model.embed_tokens.embed_audio_tokens.weight" - ].clone() - del loaded - gc.collect() - - # ------------------------- - # load the weights and save - # ------------------------- - - print("Loading the checkpoint in a Csm model.") - with torch.device("meta"): - model = CsmForConditionalGeneration(config) - model.load_state_dict(state_dict, strict=True, assign=True) - print("Checkpoint loaded successfully.") - del model.config._name_or_path - - # default generation config - model.generation_config._from_model_config = False - model.generation_config.max_new_tokens = 125 - model.generation_config.do_sample = True - model.generation_config.top_k = 50 - model.generation_config.temperature = 0.9 - model.generation_config.depth_decoder_do_sample = True - model.generation_config.depth_decoder_top_k = 50 - model.generation_config.depth_decoder_temperature = 0.9 - - print("Saving the model.") - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - del state_dict, model - - # Safety check: reload the converted model - gc.collect() - print("Reloading the model to check if it's saved correctly.") - CsmForConditionalGeneration.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto") - print("Model reloaded successfully.") - - -def write_tokenizer(output_dir): - # from https://github.com/SesameAILabs/csm/blob/2d720827843b653c4d67bb4445b1c0a4f59e646f/generator.py#L22-L36 - def load_llama3_tokenizer(): - """ - https://github.com/huggingface/transformers/issues/22794#issuecomment-2092623992 - """ - tokenizer_name = "meta-llama/Llama-3.2-1B" - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - bos = tokenizer.bos_token - eos = tokenizer.eos_token - tokenizer._tokenizer.post_processor = TemplateProcessing( - single=f"{bos}:0 $A:0 {eos}:0", - pair=f"{bos}:0 $A:0 {eos}:0 {bos}:1 $B:1 {eos}:1", - special_tokens=[(f"{bos}", tokenizer.bos_token_id), (f"{eos}", tokenizer.eos_token_id)], - ) - - return tokenizer - - tokenizer = load_llama3_tokenizer() - tokenizer.pad_token = tokenizer.eos_token - tokenizer.save_pretrained(output_dir) - - # manually modify in tokenizer_config.json - # "128002": { - # "content": "<|AUDIO|>", - # ... - # } - # "128003": { - # "content": "<|audio_eos|>", - # ... - # } - print( - "Tokenizer saved successfully. Please manually modify in tokenizer_config.json AND tokenizer.json as follows: " - ) - print(""" - # "128002": { - # "content": "<|AUDIO|>", - # ... - # } - # "128003": { - # "content": "<|audio_eos|>", - # ... - # } - """) - - -def write_processor(output_dir, codec_model_path_or_repo): - chat_template = "\n{%- for message in messages %}\n {#-- Validate role is a stringified integer --#}\n {%- if not message['role'] is string or not message['role'].isdigit() %}\n {{- raise_exception(\"The role must be an integer or a stringified integer (e.g. '0') designating the speaker id\") }}\n {%- endif %}\n\n {#-- Validate content is a list --#}\n {%- set content = message['content'] %}\n {%- if content is not iterable or content is string %}\n {{- raise_exception(\"The content must be a list\") }}\n {%- endif %}\n\n {#-- Collect content types --#}\n {%- set content_types = content | map(attribute='type') | list %}\n {%- set is_last = loop.last %}\n\n {#-- Last message validation --#}\n {%- if is_last %}\n {%- if 'text' not in content_types %}\n {{- raise_exception(\"The last message must include one item of type 'text'\") }}\n {%- elif (content_types | select('equalto', 'text') | list | length > 1) or (content_types | select('equalto', 'audio') | list | length > 1) %}\n {{- raise_exception(\"At most two items are allowed in the last message: one 'text' and one 'audio'\") }}\n {%- endif %}\n\n {#-- All other messages validation --#}\n {%- else %}\n {%- if content_types | select('equalto', 'text') | list | length != 1\n or content_types | select('equalto', 'audio') | list | length != 1 %}\n {{- raise_exception(\"Each message (except the last) must contain exactly one 'text' and one 'audio' item\") }}\n {%- elif content_types | reject('in', ['text', 'audio']) | list | length > 0 %}\n {{- raise_exception(\"Only 'text' and 'audio' types are allowed in content\") }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n\n{%- for message in messages %}\n {{- bos_token }}\n {{- '[' + message['role'] + ']' }}\n {{- message['content'][0]['text'] }}\n {{- eos_token }}\n {%- if message['content']|length > 1 %}\n {{- '<|AUDIO|><|audio_eos|>' }}\n {%- endif %}\n{%- endfor %}\n" - tokenizer = AutoTokenizer.from_pretrained(output_dir) - feature_extractor = AutoFeatureExtractor.from_pretrained(codec_model_path_or_repo) - - processor = CsmProcessor( - tokenizer=tokenizer, - feature_extractor=feature_extractor, - chat_template=chat_template, - ) - - processor.save_pretrained(output_dir) - print("Processor saved successfully.") - - -def main(): - parser = argparse.ArgumentParser(description="Convert Csm weights to HuggingFace format") - parser.add_argument( - "--input_path_or_repo", - type=str, - required=True, - help="Path or repo containing Csm weights", - ) - parser.add_argument( - "--model_name", - type=str, - required=True, - help="Name of the model in input_path_or_repo", - ) - parser.add_argument( - "--codec_model_path_or_repo", - type=str, - required=True, - help="Path or repo containing the codec model", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--safe_serialization", action="store_true", default=True, help="Whether or not to save using `safetensors`." - ) - args = parser.parse_args() - - write_model( - args.input_path_or_repo, - args.model_name, - args.codec_model_path_or_repo, - output_dir=args.output_dir, - safe_serialization=args.safe_serialization, - ) - - write_tokenizer(args.output_dir) - - write_processor(args.output_dir, args.codec_model_path_or_repo) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 85a1d4b64a..0000000000 --- a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,362 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert CvT checkpoints from the original repository. - -URL: https://github.com/microsoft/CvT""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import torch -from huggingface_hub import hf_hub_download - -from transformers import AutoImageProcessor, CvtConfig, CvtForImageClassification - - -def embeddings(idx): - """ - The function helps in renaming embedding layer weights. - - Args: - idx: stage number in original model - """ - embed = [] - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.weight", - f"stage{idx}.patch_embed.proj.weight", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.projection.bias", - f"stage{idx}.patch_embed.proj.bias", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.weight", - f"stage{idx}.patch_embed.norm.weight", - ) - ) - embed.append( - ( - f"cvt.encoder.stages.{idx}.embedding.convolution_embeddings.normalization.bias", - f"stage{idx}.patch_embed.norm.bias", - ) - ) - return embed - - -def attention(idx, cnt): - """ - The function helps in renaming attention block layers weights. - - Args: - idx: stage number in original model - cnt: count of blocks in each stage - """ - attention_weights = [] - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.conv.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_query.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_q.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.conv.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_key.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_k.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.convolution.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.conv.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.weight", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.bias", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_mean", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_mean", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.running_var", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.running_var", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.convolution_projection_value.convolution_projection.normalization.num_batches_tracked", - f"stage{idx}.blocks.{cnt}.attn.conv_proj_v.bn.num_batches_tracked", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_q.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_query.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_q.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_k.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_key.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_k.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.weight", - f"stage{idx}.blocks.{cnt}.attn.proj_v.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.attention.projection_value.bias", - f"stage{idx}.blocks.{cnt}.attn.proj_v.bias", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.weight", - f"stage{idx}.blocks.{cnt}.attn.proj.weight", - ) - ) - attention_weights.append( - ( - f"cvt.encoder.stages.{idx}.layers.{cnt}.attention.output.dense.bias", - f"stage{idx}.blocks.{cnt}.attn.proj.bias", - ) - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.weight", f"stage{idx}.blocks.{cnt}.mlp.fc1.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.intermediate.dense.bias", f"stage{idx}.blocks.{cnt}.mlp.fc1.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.weight", f"stage{idx}.blocks.{cnt}.mlp.fc2.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.output.dense.bias", f"stage{idx}.blocks.{cnt}.mlp.fc2.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.weight", f"stage{idx}.blocks.{cnt}.norm1.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_before.bias", f"stage{idx}.blocks.{cnt}.norm1.bias") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.weight", f"stage{idx}.blocks.{cnt}.norm2.weight") - ) - attention_weights.append( - (f"cvt.encoder.stages.{idx}.layers.{cnt}.layernorm_after.bias", f"stage{idx}.blocks.{cnt}.norm2.bias") - ) - return attention_weights - - -def cls_token(idx): - """ - Function helps in renaming cls_token weights - """ - token = [] - token.append((f"cvt.encoder.stages.{idx}.cls_token", "stage2.cls_token")) - return token - - -def final(): - """ - Function helps in renaming final classification layer - """ - head = [] - head.append(("layernorm.weight", "norm.weight")) - head.append(("layernorm.bias", "norm.bias")) - head.append(("classifier.weight", "head.weight")) - head.append(("classifier.bias", "head.bias")) - return head - - -def convert_cvt_checkpoint(cvt_model, image_size, cvt_file_name, pytorch_dump_folder): - """ - Function to convert the microsoft cvt checkpoint to huggingface checkpoint - """ - img_labels_file = "imagenet-1k-id2label.json" - num_labels = 1000 - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.loads(Path(hf_hub_download(repo_id, img_labels_file, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - config = config = CvtConfig(num_labels=num_labels, id2label=id2label, label2id=label2id) - - # For depth size 13 (13 = 1+2+10) - if cvt_model.rsplit("/", 1)[-1][4:6] == "13": - config.depth = [1, 2, 10] - - # For depth size 21 (21 = 1+4+16) - elif cvt_model.rsplit("/", 1)[-1][4:6] == "21": - config.depth = [1, 4, 16] - - # For wide cvt (similar to wide-resnet) depth size 24 (w24 = 2 + 2 20) - else: - config.depth = [2, 2, 20] - config.num_heads = [3, 12, 16] - config.embed_dim = [192, 768, 1024] - - model = CvtForImageClassification(config) - image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k") - image_processor.size["shortest_edge"] = image_size - original_weights = torch.load(cvt_file_name, map_location=torch.device("cpu"), weights_only=True) - - huggingface_weights = OrderedDict() - list_of_state_dict = [] - - for idx in range(len(config.depth)): - if config.cls_token[idx]: - list_of_state_dict = list_of_state_dict + cls_token(idx) - list_of_state_dict = list_of_state_dict + embeddings(idx) - for cnt in range(config.depth[idx]): - list_of_state_dict = list_of_state_dict + attention(idx, cnt) - - list_of_state_dict = list_of_state_dict + final() - for gg in list_of_state_dict: - print(gg) - for i in range(len(list_of_state_dict)): - huggingface_weights[list_of_state_dict[i][0]] = original_weights[list_of_state_dict[i][1]] - - model.load_state_dict(huggingface_weights) - model.save_pretrained(pytorch_dump_folder) - image_processor.save_pretrained(pytorch_dump_folder) - - -# Download the weights from zoo: https://1drv.ms/u/s!AhIXJn_J-blW9RzF3rMW7SsLHa8h?e=blQ0Al - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--cvt_model", - default="cvt-w24", - type=str, - help="Name of the cvt model you'd like to convert.", - ) - parser.add_argument( - "--image_size", - default=384, - type=int, - help="Input Image Size", - ) - parser.add_argument( - "--cvt_file_name", - default=r"cvtmodels\CvT-w24-384x384-IN-22k.pth", - type=str, - help="Input Image Size", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - - args = parser.parse_args() - convert_cvt_checkpoint(args.cvt_model, args.image_size, args.cvt_file_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py b/src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py deleted file mode 100644 index 8a344c520e..0000000000 --- a/src/transformers/models/d_fine/convert_d_fine_original_pytorch_checkpoint_to_hf.py +++ /dev/null @@ -1,689 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json -import re -from pathlib import Path -from typing import Optional - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import DFineConfig, DFineForObjectDetection, RTDetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_d_fine_config(model_name: str) -> DFineConfig: - config = DFineConfig() - - config.num_labels = 80 - repo_id = "huggingface/label-files" - filename = "object365-id2label.json" if "obj365" in model_name else "coco-detection-mmdet-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - config.backbone_config.hidden_sizes = [64, 128, 256, 512] - config.backbone_config.layer_type = "basic" - config.backbone_config.embedding_size = 32 - config.hidden_expansion = 1.0 - config.decoder_layers = 6 - - if model_name in ["dfine_x_coco", "dfine_x_obj2coco", "dfine_x_obj365"]: - config.backbone_config.hidden_sizes = [256, 512, 1024, 2048] - config.backbone_config.stage_in_channels = [64, 128, 512, 1024] - config.backbone_config.stage_mid_channels = [64, 128, 256, 512] - config.backbone_config.stage_out_channels = [128, 512, 1024, 2048] - config.backbone_config.stage_num_blocks = [1, 2, 5, 2] - config.backbone_config.stage_downsample = [False, True, True, True] - config.backbone_config.stage_light_block = [False, False, True, True] - config.backbone_config.stage_kernel_size = [3, 3, 5, 5] - config.backbone_config.stage_numb_of_layers = [6, 6, 6, 6] - config.backbone_config.stem_channels = [3, 32, 64] - config.encoder_in_channels = [512, 1024, 2048] - config.encoder_hidden_dim = 384 - config.encoder_ffn_dim = 2048 - config.decoder_n_points = [3, 6, 3] - config.decoder_in_channels = [384, 384, 384] - if model_name == "dfine_x_obj365": - config.num_labels = 366 - elif model_name in ["dfine_m_coco", "dfine_m_obj2coco", "dfine_m_obj365"]: - config.backbone_config.hidden_sizes = [192, 384, 768, 1536] - config.backbone_config.stem_channels = [3, 24, 32] - config.backbone_config.stage_in_channels = [32, 96, 384, 768] - config.backbone_config.stage_mid_channels = [32, 64, 128, 256] - config.backbone_config.stage_out_channels = [96, 384, 768, 1536] - config.backbone_config.stage_num_blocks = [1, 1, 3, 1] - config.backbone_config.stage_downsample = [False, True, True, True] - config.backbone_config.stage_light_block = [False, False, True, True] - config.backbone_config.stage_kernel_size = [3, 3, 5, 5] - config.backbone_config.stage_numb_of_layers = [4, 4, 4, 4] - config.decoder_layers = 4 - config.decoder_n_points = [3, 6, 3] - config.encoder_in_channels = [384, 768, 1536] - config.backbone_config.use_learnable_affine_block = True - config.depth_mult = 0.67 - if model_name == "dfine_m_obj365": - config.num_labels = 366 - elif model_name in ["dfine_l_coco", "dfine_l_obj2coco_e25", "dfine_l_obj365"]: - config.backbone_config.hidden_sizes = [256, 512, 1024, 2048] - config.backbone_config.stem_channels = [3, 32, 48] - config.backbone_config.stage_in_channels = [48, 128, 512, 1024] - config.backbone_config.stage_mid_channels = [48, 96, 192, 384] - config.backbone_config.stage_out_channels = [128, 512, 1024, 2048] - config.backbone_config.stage_num_blocks = [1, 1, 3, 1] - config.backbone_config.stage_downsample = [False, True, True, True] - config.backbone_config.stage_light_block = [False, False, True, True] - config.backbone_config.stage_kernel_size = [3, 3, 5, 5] - config.backbone_config.stage_numb_of_layers = [6, 6, 6, 6] - config.encoder_ffn_dim = 1024 - config.encoder_in_channels = [512, 1024, 2048] - config.decoder_n_points = [3, 6, 3] - if model_name == "dfine_l_obj365": - config.num_labels = 366 - elif model_name in ["dfine_n_coco", "dfine_n_obj2coco_e25", "dfine_n_obj365"]: - config.backbone_config.hidden_sizes = [128, 256, 512, 1024] - config.backbone_config.stem_channels = [3, 16, 16] - config.backbone_config.stage_in_channels = [16, 64, 256, 512] - config.backbone_config.stage_mid_channels = [16, 32, 64, 128] - config.backbone_config.stage_out_channels = [64, 256, 512, 1024] - config.backbone_config.stage_num_blocks = [1, 1, 2, 1] - config.backbone_config.stage_downsample = [False, True, True, True] - config.backbone_config.stage_light_block = [False, False, True, True] - config.backbone_config.stage_kernel_size = [3, 3, 5, 5] - config.backbone_config.stage_numb_of_layers = [3, 3, 3, 3] - config.backbone_config.out_indices = [3, 4] - config.backbone_config.use_learnable_affine_block = True - config.num_feature_levels = 2 - config.encoder_ffn_dim = 512 - config.encode_proj_layers = [1] - config.d_model = 128 - config.encoder_hidden_dim = 128 - config.decoder_ffn_dim = 512 - config.encoder_in_channels = [512, 1024] - config.decoder_n_points = [6, 6] - config.decoder_in_channels = [128, 128] - config.feat_strides = [16, 32] - config.depth_mult = 0.5 - config.decoder_layers = 3 - config.hidden_expansion = 0.34 - if model_name == "dfine_n_obj365": - config.num_labels = 366 - else: - config.backbone_config.hidden_sizes = [128, 256, 512, 1024] - config.backbone_config.stem_channels = [3, 16, 16] - config.backbone_config.stage_in_channels = [16, 64, 256, 512] - config.backbone_config.stage_mid_channels = [16, 32, 64, 128] - config.backbone_config.stage_out_channels = [64, 256, 512, 1024] - config.backbone_config.stage_num_blocks = [1, 1, 2, 1] - config.backbone_config.stage_downsample = [False, True, True, True] - config.backbone_config.stage_light_block = [False, False, True, True] - config.backbone_config.stage_kernel_size = [3, 3, 5, 5] - config.backbone_config.stage_numb_of_layers = [3, 3, 3, 3] - config.decoder_layers = 3 - config.hidden_expansion = 0.5 - config.depth_mult = 0.34 - config.decoder_n_points = [3, 6, 3] - config.encoder_in_channels = [256, 512, 1024] - config.backbone_config.use_learnable_affine_block = True - if model_name == "dfine_s_obj365": - config.num_labels = 366 - - return config - - -def load_original_state_dict(repo_id, model_name): - directory_path = hf_hub_download(repo_id=repo_id, filename=f"{model_name}.pth") - - original_state_dict = {} - model = torch.load(directory_path, map_location="cpu")["model"] - for key in model.keys(): - original_state_dict[key] = model[key] - - return original_state_dict - - -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # Decoder base mappings - r"decoder.valid_mask": r"model.decoder.valid_mask", - r"decoder.anchors": r"model.decoder.anchors", - r"decoder.up": r"model.decoder.up", - r"decoder.reg_scale": r"model.decoder.reg_scale", - # Backbone stem mappings - including stem2a and stem2b - r"backbone.stem.stem1.conv.weight": r"model.backbone.model.embedder.stem1.convolution.weight", - r"backbone.stem.stem2a.conv.weight": r"model.backbone.model.embedder.stem2a.convolution.weight", - r"backbone.stem.stem2b.conv.weight": r"model.backbone.model.embedder.stem2b.convolution.weight", - r"backbone.stem.stem3.conv.weight": r"model.backbone.model.embedder.stem3.convolution.weight", - r"backbone.stem.stem4.conv.weight": r"model.backbone.model.embedder.stem4.convolution.weight", - # Stem normalization - r"backbone.stem.stem1.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.embedder.stem1.normalization.\1", - r"backbone.stem.stem2a.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.embedder.stem2a.normalization.\1", - r"backbone.stem.stem2b.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.embedder.stem2b.normalization.\1", - r"backbone.stem.stem3.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.embedder.stem3.normalization.\1", - r"backbone.stem.stem4.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.embedder.stem4.normalization.\1", - # Stem lab parameters - fixed with .lab in the path - r"backbone.stem.stem1.lab.(scale|bias)": r"model.backbone.model.embedder.stem1.lab.\1", - r"backbone.stem.stem2a.lab.(scale|bias)": r"model.backbone.model.embedder.stem2a.lab.\1", - r"backbone.stem.stem2b.lab.(scale|bias)": r"model.backbone.model.embedder.stem2b.lab.\1", - r"backbone.stem.stem3.lab.(scale|bias)": r"model.backbone.model.embedder.stem3.lab.\1", - r"backbone.stem.stem4.lab.(scale|bias)": r"model.backbone.model.embedder.stem4.lab.\1", - # Backbone stages mappings - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv.weight": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.convolution.weight", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.normalization.\4", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv1.conv.weight": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv1.convolution.weight", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv2.conv.weight": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv2.convolution.weight", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv1.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv1.normalization.\4", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv2.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv2.normalization.\4", - # Backbone stages aggregation - r"backbone.stages.(\d+).blocks.(\d+).aggregation.0.conv.weight": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.0.convolution.weight", - r"backbone.stages.(\d+).blocks.(\d+).aggregation.1.conv.weight": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.1.convolution.weight", - r"backbone.stages.(\d+).blocks.(\d+).aggregation.0.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.0.normalization.\3", - r"backbone.stages.(\d+).blocks.(\d+).aggregation.1.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.1.normalization.\3", - # Backbone stages lab parameters for aggregation - r"backbone.stages.(\d+).blocks.(\d+).aggregation.0.lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.0.lab.\3", - r"backbone.stages.(\d+).blocks.(\d+).aggregation.1.lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.blocks.\2.aggregation.1.lab.\3", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.lab.\4", - # Conv1/Conv2 layers with lab - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv1.lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv1.lab.\4", - r"backbone.stages.(\d+).blocks.(\d+).layers.(\d+).conv2.lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.blocks.\2.layers.\3.conv2.lab.\4", - # Downsample with lab - r"backbone.stages.(\d+).downsample.lab.(scale|bias)": r"model.backbone.model.encoder.stages.\1.downsample.lab.\2", - # Backbone downsample - r"backbone.stages.(\d+).downsample.conv.weight": r"model.backbone.model.encoder.stages.\1.downsample.convolution.weight", - r"backbone.stages.(\d+).downsample.bn.(weight|bias|running_mean|running_var)": r"model.backbone.model.encoder.stages.\1.downsample.normalization.\2", - # Encoder mappings - r"encoder.encoder.(\d+).layers.0.self_attn.out_proj.(weight|bias)": r"model.encoder.encoder.\1.layers.0.self_attn.out_proj.\2", - r"encoder.encoder.(\d+).layers.0.linear1.(weight|bias)": r"model.encoder.encoder.\1.layers.0.fc1.\2", - r"encoder.encoder.(\d+).layers.0.linear2.(weight|bias)": r"model.encoder.encoder.\1.layers.0.fc2.\2", - r"encoder.encoder.(\d+).layers.0.norm1.(weight|bias)": r"model.encoder.encoder.\1.layers.0.self_attn_layer_norm.\2", - r"encoder.encoder.(\d+).layers.0.norm2.(weight|bias)": r"model.encoder.encoder.\1.layers.0.final_layer_norm.\2", - # Encoder projections and convolutions - r"encoder.input_proj.(\d+).conv.weight": r"model.encoder_input_proj.\1.0.weight", - r"encoder.input_proj.(\d+).norm.(weight|bias|running_mean|running_var)": r"model.encoder_input_proj.\1.1.\2", - r"encoder.lateral_convs.(\d+).conv.weight": r"model.encoder.lateral_convs.\1.conv.weight", - r"encoder.lateral_convs.(\d+).norm.(weight|bias|running_mean|running_var)": r"model.encoder.lateral_convs.\1.norm.\2", - # FPN blocks - complete structure - # Basic convolutions - r"encoder.fpn_blocks.(\d+).cv1.conv.weight": r"model.encoder.fpn_blocks.\1.conv1.conv.weight", - r"encoder.fpn_blocks.(\d+).cv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.conv1.norm.\2", - # CSP Rep1 path - r"encoder.fpn_blocks.(\d+).cv2.0.conv1.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep1.conv1.conv.weight", - r"encoder.fpn_blocks.(\d+).cv2.0.conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep1.conv1.norm.\2", - r"encoder.fpn_blocks.(\d+).cv2.0.conv2.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep1.conv2.conv.weight", - r"encoder.fpn_blocks.(\d+).cv2.0.conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep1.conv2.norm.\2", - r"encoder.fpn_blocks.(\d+).cv2.1.conv.weight": r"model.encoder.fpn_blocks.\1.conv2.conv.weight", - r"encoder.fpn_blocks.(\d+).cv2.1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.conv2.norm.\2", - # CSP Rep2 path - r"encoder.fpn_blocks.(\d+).cv3.0.conv1.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep2.conv1.conv.weight", - r"encoder.fpn_blocks.(\d+).cv3.0.conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep2.conv1.norm.\2", - r"encoder.fpn_blocks.(\d+).cv3.0.conv2.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep2.conv2.conv.weight", - r"encoder.fpn_blocks.(\d+).cv3.0.conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep2.conv2.norm.\2", - r"encoder.fpn_blocks.(\d+).cv3.1.conv.weight": r"model.encoder.fpn_blocks.\1.conv3.conv.weight", - r"encoder.fpn_blocks.(\d+).cv3.1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.conv3.norm.\2", - # Final conv - r"encoder.fpn_blocks.(\d+).cv4.conv.weight": r"model.encoder.fpn_blocks.\1.conv4.conv.weight", - r"encoder.fpn_blocks.(\d+).cv4.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.conv4.norm.\2", - # Bottlenecks for CSP Rep1 - r"encoder.fpn_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv1.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep1.bottlenecks.\2.conv1.conv.weight", - r"encoder.fpn_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep1.bottlenecks.\2.conv1.norm.\3", - r"encoder.fpn_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv2.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep1.bottlenecks.\2.conv2.conv.weight", - r"encoder.fpn_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep1.bottlenecks.\2.conv2.norm.\3", - # Bottlenecks for CSP Rep2 - r"encoder.fpn_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv1.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep2.bottlenecks.\2.conv1.conv.weight", - r"encoder.fpn_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep2.bottlenecks.\2.conv1.norm.\3", - r"encoder.fpn_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv2.conv.weight": r"model.encoder.fpn_blocks.\1.csp_rep2.bottlenecks.\2.conv2.conv.weight", - r"encoder.fpn_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.fpn_blocks.\1.csp_rep2.bottlenecks.\2.conv2.norm.\3", - # PAN blocks - complete structure - # Basic convolutions - r"encoder.pan_blocks.(\d+).cv1.conv.weight": r"model.encoder.pan_blocks.\1.conv1.conv.weight", - r"encoder.pan_blocks.(\d+).cv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.conv1.norm.\2", - # CSP Rep1 path - r"encoder.pan_blocks.(\d+).cv2.0.conv1.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep1.conv1.conv.weight", - r"encoder.pan_blocks.(\d+).cv2.0.conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep1.conv1.norm.\2", - r"encoder.pan_blocks.(\d+).cv2.0.conv2.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep1.conv2.conv.weight", - r"encoder.pan_blocks.(\d+).cv2.0.conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep1.conv2.norm.\2", - r"encoder.pan_blocks.(\d+).cv2.1.conv.weight": r"model.encoder.pan_blocks.\1.conv2.conv.weight", - r"encoder.pan_blocks.(\d+).cv2.1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.conv2.norm.\2", - # CSP Rep2 path - r"encoder.pan_blocks.(\d+).cv3.0.conv1.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep2.conv1.conv.weight", - r"encoder.pan_blocks.(\d+).cv3.0.conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep2.conv1.norm.\2", - r"encoder.pan_blocks.(\d+).cv3.0.conv2.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep2.conv2.conv.weight", - r"encoder.pan_blocks.(\d+).cv3.0.conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep2.conv2.norm.\2", - r"encoder.pan_blocks.(\d+).cv3.1.conv.weight": r"model.encoder.pan_blocks.\1.conv3.conv.weight", - r"encoder.pan_blocks.(\d+).cv3.1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.conv3.norm.\2", - # Final conv - r"encoder.pan_blocks.(\d+).cv4.conv.weight": r"model.encoder.pan_blocks.\1.conv4.conv.weight", - r"encoder.pan_blocks.(\d+).cv4.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.conv4.norm.\2", - # Bottlenecks for CSP Rep1 - r"encoder.pan_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv1.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep1.bottlenecks.\2.conv1.conv.weight", - r"encoder.pan_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep1.bottlenecks.\2.conv1.norm.\3", - r"encoder.pan_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv2.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep1.bottlenecks.\2.conv2.conv.weight", - r"encoder.pan_blocks.(\d+).cv2.0.bottlenecks.(\d+).conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep1.bottlenecks.\2.conv2.norm.\3", - # Bottlenecks for CSP Rep2 - r"encoder.pan_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv1.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep2.bottlenecks.\2.conv1.conv.weight", - r"encoder.pan_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv1.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep2.bottlenecks.\2.conv1.norm.\3", - r"encoder.pan_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv2.conv.weight": r"model.encoder.pan_blocks.\1.csp_rep2.bottlenecks.\2.conv2.conv.weight", - r"encoder.pan_blocks.(\d+).cv3.0.bottlenecks.(\d+).conv2.norm.(weight|bias|running_mean|running_var)": r"model.encoder.pan_blocks.\1.csp_rep2.bottlenecks.\2.conv2.norm.\3", - # Downsample convolutions - r"encoder.downsample_convs.(\d+).0.cv(\d+).conv.weight": r"model.encoder.downsample_convs.\1.conv\2.conv.weight", - r"encoder.downsample_convs.(\d+).0.cv(\d+).norm.(weight|bias|running_mean|running_var)": r"model.encoder.downsample_convs.\1.conv\2.norm.\3", - # Decoder layers - r"decoder.decoder.layers.(\d+).self_attn.out_proj.(weight|bias)": r"model.decoder.layers.\1.self_attn.out_proj.\2", - r"decoder.decoder.layers.(\d+).cross_attn.sampling_offsets.(weight|bias)": r"model.decoder.layers.\1.encoder_attn.sampling_offsets.\2", - r"decoder.decoder.layers.(\d+).cross_attn.attention_weights.(weight|bias)": r"model.decoder.layers.\1.encoder_attn.attention_weights.\2", - r"decoder.decoder.layers.(\d+).cross_attn.value_proj.(weight|bias)": r"model.decoder.layers.\1.encoder_attn.value_proj.\2", - r"decoder.decoder.layers.(\d+).cross_attn.output_proj.(weight|bias)": r"model.decoder.layers.\1.encoder_attn.output_proj.\2", - r"decoder.decoder.layers.(\d+).cross_attn.num_points_scale": r"model.decoder.layers.\1.encoder_attn.num_points_scale", - r"decoder.decoder.layers.(\d+).gateway.gate.(weight|bias)": r"model.decoder.layers.\1.gateway.gate.\2", - r"decoder.decoder.layers.(\d+).gateway.norm.(weight|bias)": r"model.decoder.layers.\1.gateway.norm.\2", - r"decoder.decoder.layers.(\d+).norm1.(weight|bias)": r"model.decoder.layers.\1.self_attn_layer_norm.\2", - r"decoder.decoder.layers.(\d+).norm2.(weight|bias)": r"model.decoder.layers.\1.encoder_attn_layer_norm.\2", - r"decoder.decoder.layers.(\d+).norm3.(weight|bias)": r"model.decoder.layers.\1.final_layer_norm.\2", - r"decoder.decoder.layers.(\d+).linear1.(weight|bias)": r"model.decoder.layers.\1.fc1.\2", - r"decoder.decoder.layers.(\d+).linear2.(weight|bias)": r"model.decoder.layers.\1.fc2.\2", - # LQE layers - r"decoder.decoder.lqe_layers.(\d+).reg_conf.layers.(\d+).(weight|bias)": r"model.decoder.lqe_layers.\1.reg_conf.layers.\2.\3", - # Decoder heads and projections - r"decoder.dec_score_head.(\d+).(weight|bias)": r"model.decoder.class_embed.\1.\2", - r"decoder.dec_bbox_head.(\d+).layers.(\d+).(weight|bias)": r"model.decoder.bbox_embed.\1.layers.\2.\3", - r"decoder.pre_bbox_head.layers.(\d+).(weight|bias)": r"model.decoder.pre_bbox_head.layers.\1.\2", - r"decoder.input_proj.(\d+).conv.weight": r"model.decoder_input_proj.\1.0.weight", - r"decoder.input_proj.(\d+).norm.(weight|bias|running_mean|running_var)": r"model.decoder_input_proj.\1.1.\2", - # Other decoder components - r"decoder.denoising_class_embed.weight": r"model.denoising_class_embed.weight", - r"decoder.query_pos_head.layers.(\d+).(weight|bias)": r"model.decoder.query_pos_head.layers.\1.\2", - r"decoder.enc_output.proj.(weight|bias)": r"model.enc_output.0.\1", - r"decoder.enc_output.norm.(weight|bias)": r"model.enc_output.1.\1", - r"decoder.enc_score_head.(weight|bias)": r"model.enc_score_head.\1", - r"decoder.enc_bbox_head.layers.(\d+).(weight|bias)": r"model.enc_bbox_head.layers.\1.\2", -} - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - # Use the mapping to rename keys - for original_key, converted_key in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - for key in list(state_dict_keys.keys()): - new_key = re.sub(original_key, converted_key, key) - if new_key != key: - state_dict_keys[new_key] = state_dict_keys.pop(key) - - return state_dict_keys - - -def read_in_q_k_v(state_dict, config, model_name): - prefix = "" - encoder_hidden_dim = config.encoder_hidden_dim - - # first: transformer encoder - for i in range(config.encoder_layers): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}encoder.encoder.{i}.layers.0.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.weight"] = in_proj_weight[ - :encoder_hidden_dim, : - ] - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.q_proj.bias"] = in_proj_bias[:encoder_hidden_dim] - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.weight"] = in_proj_weight[ - encoder_hidden_dim : 2 * encoder_hidden_dim, : - ] - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.k_proj.bias"] = in_proj_bias[ - encoder_hidden_dim : 2 * encoder_hidden_dim - ] - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.weight"] = in_proj_weight[ - -encoder_hidden_dim:, : - ] - state_dict[f"model.encoder.encoder.{i}.layers.0.self_attn.v_proj.bias"] = in_proj_bias[-encoder_hidden_dim:] - # next: transformer decoder (which is a bit more complex because it also includes cross-attention) - for i in range(config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_weight", None) - in_proj_bias = state_dict.pop(f"{prefix}decoder.decoder.layers.{i}.self_attn.in_proj_bias", None) - # next, add query, keys and values (in that order) to the state dict - if model_name in ["dfine_n_coco", "dfine_n_obj2coco_e25", "dfine_n_obj365"]: - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:128, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:128] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:384, :] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:384] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-128:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-128:] - else: - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_d_fine_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, repo_id): - """ - Copy/paste/tweak model's weights to our D-FINE structure. - """ - - # load default config - config = get_d_fine_config(model_name) - state_dict = load_original_state_dict(repo_id, model_name) - state_dict.pop("decoder.valid_mask", None) - state_dict.pop("decoder.anchors", None) - model = DFineForObjectDetection(config) - logger.info(f"Converting model {model_name}...") - - state_dict = convert_old_keys_to_new_keys(state_dict) - state_dict.pop("decoder.model.decoder.up", None) - state_dict.pop("decoder.model.decoder.reg_scale", None) - - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, config, model_name) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - for key in state_dict.copy().keys(): - if key.endswith("num_batches_tracked"): - del state_dict[key] - # for two_stage - if "bbox_embed" in key or ("class_embed" in key and "denoising_" not in key): - state_dict[key.split("model.decoder.")[-1]] = state_dict[key] - - # finally, create HuggingFace model and load state dict - model.load_state_dict(state_dict) - model.eval() - - # load image processor - image_processor = RTDetrImageProcessor() - - # prepare image - img = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize([640, 640], interpolation=transforms.InterpolationMode.BILINEAR), - transforms.ToTensor(), - ] - ) - original_pixel_values = transformations(img).unsqueeze(0) # insert batch dimension - - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - assert torch.allclose(original_pixel_values, pixel_values) - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model.to(device) - pixel_values = pixel_values.to(device) - - outputs = model(pixel_values) - - if model_name == "dfine_x_coco": - expected_slice_logits = torch.tensor( - [ - [-4.844723, -4.7293096, -4.5971327], - [-4.554266, -4.61723, -4.627926], - [-4.3934402, -4.6064143, -4.139952], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.2565248, 0.5477609, 0.47644863], - [0.7690029, 0.41423926, 0.46148556], - [0.1688096, 0.19923759, 0.21118002], - ] - ) - elif model_name == "dfine_x_obj2coco": - expected_slice_logits = torch.tensor( - [ - [-4.230433, -6.6295037, -4.8339615], - [-4.085411, -6.3280816, -4.695468], - [-3.8968022, -6.336813, -4.67051], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.25707328, 0.54842496, 0.47624254], - [0.76967394, 0.41272867, 0.45970756], - [0.16882066, 0.19918433, 0.2112098], - ] - ) - elif model_name == "dfine_x_obj365": - expected_slice_logits = torch.tensor( - [ - [-6.3844957, -3.7549126, -4.6873264], - [-5.8433194, -3.4490552, -3.3228905], - [-6.5314736, -3.7856622, -4.895984], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.7703046, 0.41329497, 0.45932162], - [0.16898105, 0.19876392, 0.21050783], - [0.25134972, 0.5517619, 0.4864124], - ] - ) - elif model_name == "dfine_m_coco": - expected_slice_logits = torch.tensor( - [ - [-4.5187078, -4.71708, -4.117749], - [-4.513984, -4.937715, -3.829125], - [-4.830042, -6.931682, -3.1740026], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.25851426, 0.5489963, 0.4757598], - [0.769683, 0.41411665, 0.45988125], - [0.16866133, 0.19921188, 0.21207744], - ] - ) - elif model_name == "dfine_m_obj2coco": - expected_slice_logits = torch.tensor( - [ - [-4.520666, -7.6678333, -5.739887], - [-4.5053635, -7.510611, -5.452532], - [-4.70348, -5.6098466, -5.0199957], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.2567608, 0.5485795, 0.4767465], - [0.77035284, 0.41236404, 0.4580645], - [0.5498525, 0.27548885, 0.05886984], - ] - ) - elif model_name == "dfine_m_obj365": - expected_slice_logits = torch.tensor( - [ - [-5.770525, -3.1610885, -5.2807794], - [-5.7809954, -3.768266, -5.1146393], - [-6.180705, -3.7357295, -3.1651964], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.2529114, 0.5526663, 0.48270613], - [0.7712474, 0.41294736, 0.457174], - [0.5497157, 0.27588123, 0.05813372], - ] - ) - elif model_name == "dfine_l_coco": - expected_slice_logits = torch.tensor( - [ - [-4.068779, -5.169955, -4.339212], - [-3.9461594, -5.0279613, -4.0161457], - [-4.218292, -6.196324, -5.175245], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.2564867, 0.5489948, 0.4748876], - [0.7693534, 0.4138953, 0.4598034], - [0.16875696, 0.19875404, 0.21196914], - ] - ) - elif model_name == "dfine_l_obj365": - expected_slice_logits = torch.tensor( - [ - [-5.7953215, -3.4901116, -5.4394145], - [-5.7032104, -3.671125, -5.76121], - [-6.09466, -3.1512096, -4.285499], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.7693825, 0.41265628, 0.4606362], - [0.25306237, 0.55187637, 0.4832178], - [0.16892478, 0.19880727, 0.21115331], - ] - ) - elif model_name == "dfine_l_obj2coco_e25": - expected_slice_logits = torch.tensor( - [ - [-3.6098495, -6.633563, -5.1227236], - [-3.682696, -6.9178205, -5.414557], - [-4.491674, -6.0823426, -4.5718226], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.7697078, 0.41368833, 0.45879585], - [0.2573691, 0.54856044, 0.47715297], - [0.16895264, 0.19871138, 0.2115552], - ] - ) - elif model_name == "dfine_n_coco": - expected_slice_logits = torch.tensor( - [ - [-3.7827945, -5.0889463, -4.8341026], - [-5.3046904, -6.2801714, -2.9276395], - [-4.497901, -5.2670407, -6.2380104], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.73334837, 0.4270624, 0.39424777], - [0.1680235, 0.1988639, 0.21031213], - [0.25370035, 0.5534435, 0.48496848], - ] - ) - elif model_name == "dfine_s_coco": - expected_slice_logits = torch.tensor( - [ - [-3.8097816, -4.7724586, -5.994499], - [-5.2974715, -9.499067, -6.1653666], - [-5.3502765, -3.9530406, -6.3630295], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.7677696, 0.41479152, 0.46441072], - [0.16912134, 0.19869131, 0.2123824], - [0.2581653, 0.54818195, 0.47512347], - ] - ) - elif model_name == "dfine_s_obj2coco": - expected_slice_logits = torch.tensor( - [ - [-6.0208125, -7.532673, -5.0572147], - [-3.3595953, -9.057545, -6.376975], - [-4.3203554, -9.546032, -6.075504], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.16901012, 0.19883151, 0.21121952], - [0.76784194, 0.41266578, 0.46402973], - [00.2563128, 0.54797643, 0.47937632], - ] - ) - elif model_name == "dfine_s_obj365": - expected_slice_logits = torch.tensor( - [ - [-6.3807316, -4.320986, -6.4775343], - [-6.5818424, -3.5009093, -5.75824], - [-5.748005, -4.3228016, -4.003726], - ] - ) - expected_slice_boxes = torch.tensor( - [ - [0.2532072, 0.5491191, 0.48222217], - [0.76586807, 0.41175705, 0.46789962], - [0.169111, 0.19844547, 0.21069047], - ] - ) - else: - raise ValueError(f"Unknown d_fine_name: {model_name}") - - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits.to(outputs.logits.device), atol=1e-3) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes.to(outputs.pred_boxes.device), atol=1e-4) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Upload model, image processor and config to the hub - logger.info("Uploading PyTorch model and image processor to the hub...") - config.push_to_hub( - repo_id=repo_id, - commit_message="Add config from convert_d_fine_original_pytorch_checkpoint_to_hf.py", - ) - model.push_to_hub( - repo_id=repo_id, - commit_message="Add model from convert_d_fine_original_pytorch_checkpoint_to_hf.py", - ) - image_processor.push_to_hub( - repo_id=repo_id, - commit_message="Add image processor from convert_d_fine_original_pytorch_checkpoint_to_hf.py", - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name", - default="dfine_s_coco", - type=str, - help="model_name of the checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") - parser.add_argument( - "--repo_id", - type=str, - help="repo_id where the model will be pushed to.", - ) - args = parser.parse_args() - convert_d_fine_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.repo_id) diff --git a/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 3d8cf3e279..0000000000 --- a/src/transformers/models/dab_detr/convert_dab_detr_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DAB-DETR checkpoints.""" - -import argparse -import gc -import json -import re -from pathlib import Path -from typing import Optional - -import torch -from huggingface_hub import hf_hub_download - -from transformers import ConditionalDetrImageProcessor, DabDetrConfig, DabDetrForObjectDetection -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads - # for dab-DETR, also convert reference point head and query scale MLP - r"input_proj\.(bias|weight)": r"input_projection.\1", - r"refpoint_embed\.weight": r"query_refpoint_embeddings.weight", - r"class_embed\.(bias|weight)": r"class_embed.\1", - # negative lookbehind because of the overlap - r"(?= 0.9.0") - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = "Hello world! cĂ©cĂ© herlolip" - - -def convert_data2vec_checkpoint_to_pytorch( - data2vec_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool -): - """ - Copy/paste/tweak data2vec's weights to our BERT structure. - """ - data2vec_checkpoint_dir, data2vec_checkpoint_file_name = os.path.split(data2vec_checkpoint_path) - data2vec = Data2VecTextModel.from_pretrained( - data2vec_checkpoint_dir, checkpoint_file=data2vec_checkpoint_file_name - ) - data2vec.eval() # disable dropout - data2vec_model = data2vec.models[0] - data2vec_sent_encoder = data2vec_model.encoder.sentence_encoder - config = Data2VecTextConfig( - vocab_size=data2vec_sent_encoder.embed_tokens.num_embeddings, - hidden_size=data2vec_model.args.encoder_embed_dim, - num_hidden_layers=data2vec_model.args.encoder_layers, - num_attention_heads=data2vec_model.args.encoder_attention_heads, - intermediate_size=data2vec_model.args.encoder_ffn_embed_dim, - max_position_embeddings=514, - type_vocab_size=1, - layer_norm_eps=1e-5, # PyTorch default used in fairseq - ) - if classification_head: - config.num_labels = data2vec.model.classification_heads["mnli"].out_proj.weight.shape[0] - print("Our BERT config:", config) - - model = Data2VecTextForSequenceClassification(config) if classification_head else Data2VecTextForMaskedLM(config) - model.eval() - - # Now let's copy all the weights. - # Embeddings - model.data2vec_text.embeddings.word_embeddings.weight = data2vec_sent_encoder.embed_tokens.weight - model.data2vec_text.embeddings.position_embeddings.weight = data2vec_sent_encoder.embed_positions.weight - model.data2vec_text.embeddings.token_type_embeddings.weight.data = torch.zeros_like( - model.data2vec_text.embeddings.token_type_embeddings.weight - ) # just zero them out b/c data2vec doesn't use them. - model.data2vec_text.embeddings.LayerNorm.weight = data2vec_sent_encoder.layernorm_embedding.weight - model.data2vec_text.embeddings.LayerNorm.bias = data2vec_sent_encoder.layernorm_embedding.bias - - for i in range(config.num_hidden_layers): - # Encoder: start of layer - layer: BertLayer = model.data2vec_text.encoder.layer[i] - data2vec_layer: TransformerSentenceEncoderLayer = data2vec_sent_encoder.layers[i] - - # self attention - self_attn: BertSelfAttention = layer.attention.self - assert data2vec_layer.self_attn.k_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.k_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - assert data2vec_layer.self_attn.q_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.q_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - assert data2vec_layer.self_attn.v_proj.weight.data.shape == torch.Size( - (config.hidden_size, config.hidden_size) - ), ( - "Shape for data2vec_layer.self_attn.v_proj.weight.data should be" - f" {torch.Size((config.hidden_size, config.hidden_size))}" - ) - - self_attn.query.weight.data = data2vec_layer.self_attn.q_proj.weight - self_attn.query.bias.data = data2vec_layer.self_attn.q_proj.bias - self_attn.key.weight.data = data2vec_layer.self_attn.k_proj.weight - self_attn.key.bias.data = data2vec_layer.self_attn.k_proj.bias - self_attn.value.weight.data = data2vec_layer.self_attn.v_proj.weight - self_attn.value.bias.data = data2vec_layer.self_attn.v_proj.bias - - # self-attention output - self_output: BertSelfOutput = layer.attention.output - assert self_output.dense.weight.shape == data2vec_layer.self_attn.out_proj.weight.shape, ( - f"Shape for self_output.dense.weight should be {data2vec_layer.self_attn.out_proj.weight.shape}" - ) - self_output.dense.weight = data2vec_layer.self_attn.out_proj.weight - self_output.dense.bias = data2vec_layer.self_attn.out_proj.bias - self_output.LayerNorm.weight = data2vec_layer.self_attn_layer_norm.weight - self_output.LayerNorm.bias = data2vec_layer.self_attn_layer_norm.bias - - # intermediate - intermediate: BertIntermediate = layer.intermediate - assert intermediate.dense.weight.shape == data2vec_layer.fc1.weight.shape, ( - f"Shape for intermediate.dense.weight should be {data2vec_layer.fc1.weight.shape}" - ) - intermediate.dense.weight = data2vec_layer.fc1.weight - intermediate.dense.bias = data2vec_layer.fc1.bias - - # output - bert_output: BertOutput = layer.output - assert bert_output.dense.weight.shape == data2vec_layer.fc2.weight.shape, ( - f"Shape for bert_output.dense.weight should be {data2vec_layer.fc2.weight.shape}" - ) - bert_output.dense.weight = data2vec_layer.fc2.weight - bert_output.dense.bias = data2vec_layer.fc2.bias - bert_output.LayerNorm.weight = data2vec_layer.final_layer_norm.weight - bert_output.LayerNorm.bias = data2vec_layer.final_layer_norm.bias - # end of layer - - if classification_head: - model.classifier.dense.weight = data2vec.model.classification_heads["mnli"].dense.weight - model.classifier.dense.bias = data2vec.model.classification_heads["mnli"].dense.bias - model.classifier.out_proj.weight = data2vec.model.classification_heads["mnli"].out_proj.weight - model.classifier.out_proj.bias = data2vec.model.classification_heads["mnli"].out_proj.bias - else: - # LM Head - model.lm_head.dense.weight = data2vec_model.encoder.lm_head.dense.weight - model.lm_head.dense.bias = data2vec_model.encoder.lm_head.dense.bias - model.lm_head.layer_norm.weight = data2vec_model.encoder.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = data2vec_model.encoder.lm_head.layer_norm.bias - model.lm_head.decoder.weight = data2vec_model.encoder.lm_head.weight - model.lm_head.decoder.bias = data2vec_model.encoder.lm_head.bias - - # Let's check that we get the same results. - input_ids: torch.Tensor = data2vec.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 - - our_output = model(input_ids)[0] - if classification_head: - their_output = data2vec.model.classification_heads["mnli"](data2vec.extract_features(input_ids)) - else: - their_output = data2vec_model(input_ids)[0] - print(our_output.shape, their_output.shape) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 - success = torch.allclose(our_output, their_output, atol=1e-3) - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Something went wRoNg") - - pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--classification_head", action="store_true", help="Whether to convert a final classification head." - ) - args = parser.parse_args() - convert_data2vec_checkpoint_to_pytorch( - args.checkpoint_path, args.pytorch_dump_folder_path, args.classification_head - ) diff --git a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index 3f9d777351..0000000000 --- a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,374 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import json - -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm.models import create_model - -from transformers import ( - BeitImageProcessor, - Data2VecVisionConfig, - Data2VecVisionForImageClassification, - Data2VecVisionModel, -) - - -def create_rename_keys(config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec."): - prefix = "backbone." if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"{prefix}blocks.{i}.norm1.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_before.weight") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.norm2.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_after.weight") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.mlp.fc1.weight", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.mlp.fc1.bias", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"{hf_prefix}encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"{hf_prefix}encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", f"{hf_prefix}embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", f"{hf_prefix}embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", f"{hf_prefix}embeddings.patch_embeddings.projection.bias"), - ] - ) - - if has_lm_head: - # mask token + shared relative position bias + layernorm - rename_keys.extend( - [ - ("mask_token", f"{hf_prefix}embeddings.mask_token"), - ( - "rel_pos_bias.relative_position_bias_table", - f"{hf_prefix}encoder.relative_position_bias.relative_position_bias_table", - ), - ( - "rel_pos_bias.relative_position_index", - f"{hf_prefix}encoder.relative_position_bias.relative_position_index", - ), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - elif is_semantic: - # semantic segmentation classification heads - rename_keys.extend( - [ - ("decode_head.conv_seg.weight", "decode_head.classifier.weight"), - ("decode_head.conv_seg.bias", "decode_head.classifier.bias"), - ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"), - ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", f"{hf_prefix}pooler.layernorm.weight"), - ("fc_norm.bias", f"{hf_prefix}pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec_vision."): - for i in range(config.num_hidden_layers): - prefix = "backbone." if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_2"] = gamma_2 - - # relative_position bias table + index - if not has_lm_head: - # each layer has its own relative position bias - table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table") - index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index") - - state_dict[ - f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table" - ] = table - state_dict[ - f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index" - ] = index - - -def get_args(): - parser = argparse.ArgumentParser( - "Convert Data2VecVision to HF for image classification and pretraining", add_help=False - ) - parser.add_argument("--hf_checkpoint_name", type=str) - parser.add_argument("--input_size", default=224, type=int, help="images input size") - parser.add_argument("--beit_checkpoint", default="", help="beit checkpoint") - - return parser.parse_args() - - -def load_beit_model(args, is_finetuned, is_large): - def load_state_dict(model, state_dict, prefix="", ignore_missing="relative_position_index"): - missing_keys = [] - unexpected_keys = [] - error_msgs = [] - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, "_metadata", None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - def load(module, prefix=""): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs - ) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + ".") - - load(model, prefix=prefix) - - warn_missing_keys = [] - ignore_missing_keys = [] - for key in missing_keys: - keep_flag = True - for ignore_key in ignore_missing.split("|"): - if ignore_key in key: - keep_flag = False - break - if keep_flag: - warn_missing_keys.append(key) - else: - ignore_missing_keys.append(key) - - missing_keys = warn_missing_keys - - if len(missing_keys) > 0: - print( - "Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys - ) - ) - if len(unexpected_keys) > 0: - print("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)) - if len(ignore_missing_keys) > 0: - print( - "Ignored weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, ignore_missing_keys - ) - ) - if len(error_msgs) > 0: - print("\n".join(error_msgs)) - - model_kwargs = { - "pretrained": False, - "use_shared_rel_pos_bias": True, - "use_abs_pos_emb": False, - "init_values": 0.1, - } - - if is_finetuned: - model_kwargs.update( - { - "num_classes": 1000, - "use_mean_pooling": True, - "init_scale": 0.001, - "use_rel_pos_bias": True, - } - ) - - model = create_model( - "beit_large_patch16_224" if is_large else "beit_base_patch16_224", - **model_kwargs, - ) - patch_size = model.patch_embed.patch_size - args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1]) - checkpoint = torch.load(args.beit_checkpoint, map_location="cpu", weights_only=True) - - print(f"Load ckpt from {args.beit_checkpoint}") - checkpoint_model = None - for model_key in ("model", "module"): - if model_key in checkpoint: - checkpoint_model = checkpoint[model_key] - print(f"Load state_dict by model_key = {model_key}") - break - - all_keys = list(checkpoint_model.keys()) - for key in all_keys: - if "relative_position_index" in key: - checkpoint_model.pop(key) - - if "relative_position_bias_table" in key: - rel_pos_bias = checkpoint_model[key] - src_num_pos, num_attn_heads = rel_pos_bias.size() - dst_num_pos, _ = model.state_dict()[key].size() - dst_patch_shape = model.patch_embed.patch_shape - if dst_patch_shape[0] != dst_patch_shape[1]: - raise NotImplementedError() - - load_state_dict(model, checkpoint_model, prefix="") - - return model - - -def main(): - args = get_args() - - is_finetuned = "ft1k" in args.hf_checkpoint_name - is_large = "large" in args.hf_checkpoint_name - - if is_finetuned: - # To convert Beit's data2vec_vision to HF you need to copy - # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_finetune.py - # into this folder. - import modeling_finetune # noqa: F401 - else: - # To convert Beit's data2vec_vision to HF you need to copy - # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_cyclical.py - # into this folder - # IMPORTANT: Note that for now we've only converted the down-stream - # model and not the full pretrained model. This means for the integration - # test you need to add a `return x` after the following line: - # https://github.com/facebookresearch/data2vec_vision/blob/af9a36349aaed59ae66e69b5dabeef2d62fdc5da/beit/modeling_cyclical.py#L197 - # to make the integration test pass. - import modeling_cyclical # noqa: F401 - - # 1. Create model config - config = Data2VecVisionConfig() - if is_finetuned: - config.use_relative_position_bias = True - config.use_shared_relative_position_bias = False - config.use_mean_pooling = True - config.num_labels = 1000 - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - else: - config.use_relative_position_bias = False - config.use_shared_relative_position_bias = True - config.use_mean_pooling = False - - if is_large: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # 2. Load Beit model - orig_model = load_beit_model(args, is_finetuned, is_large) - orig_model.eval() - - # 3. Forward Beit model - image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False) - image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png") - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - orig_args = (pixel_values,) if is_finetuned else (pixel_values, None) - with torch.no_grad(): - orig_model_output = orig_model(*orig_args) - - # 4. Load HF Data2VecVision model - if is_finetuned: - hf_model = Data2VecVisionForImageClassification(config) - hf_model.eval() - has_lm_head = False - hf_prefix = "data2vec_vision." - else: - hf_model = Data2VecVisionModel(config) - hf_model.eval() - has_lm_head = True - hf_prefix = "" - - rename_keys = create_rename_keys(config, hf_prefix=hf_prefix, has_lm_head=has_lm_head) - state_dict = orig_model.state_dict() - for src, dest in rename_keys: - val = state_dict.pop(src) - state_dict[dest] = val - - read_in_q_k_v(state_dict, config, hf_prefix=hf_prefix, has_lm_head=has_lm_head) - missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) - print("HF missing", missing_keys) - print("HF unexpected_keys", unexpected_keys) - - # 5. Forward HF Data2VecVision model - with torch.no_grad(): - hf_model_output = hf_model(pixel_values) - - hf_output = hf_model_output.logits if is_finetuned else hf_model_output.last_hidden_state - - # 6. Compare - max_absolute_diff = torch.max(torch.abs(hf_output - orig_model_output)).item() - - print(f"max_absolute_diff = {max_absolute_diff}") - success = torch.allclose(hf_output, orig_model_output, atol=1e-3) - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Something went wRoNg") - - # 7. Save - print(f"Saving to {args.hf_checkpoint_name}") - hf_model.save_pretrained(args.hf_checkpoint_name) - image_processor.save_pretrained(args.hf_checkpoint_name) - - -if __name__ == "__main__": - main() - # Run the following to convert checkpoints - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./pretrained_base.pt \ - # --hf_checkpoint_name "./data2vec-vision-base" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./finetuned_base.pt \ - # --hf_checkpoint_name "./data2vec-vision-base-ft1k" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./pretrained_large.pt \ - # --hf_checkpoint_name "./data2vec-vision-large" - # python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \ - # --beit_checkpoint ./finetuned_large.pt \ - # --hf_checkpoint_name "./data2vec-vision-large-ft1k" diff --git a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py deleted file mode 100644 index c88582eacc..0000000000 --- a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py +++ /dev/null @@ -1,236 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Deformable DETR checkpoints.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DeformableDetrConfig, DeformableDetrForObjectDetection, DeformableDetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_key(orig_key): - if "backbone.0.body" in orig_key: - orig_key = orig_key.replace("backbone.0.body", "backbone.conv_encoder.model") - if "transformer" in orig_key: - orig_key = orig_key.replace("transformer.", "") - if "norm1" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm1", "self_attn_layer_norm") - else: - orig_key = orig_key.replace("norm1", "encoder_attn_layer_norm") - if "norm2" in orig_key: - if "encoder" in orig_key: - orig_key = orig_key.replace("norm2", "final_layer_norm") - else: - orig_key = orig_key.replace("norm2", "self_attn_layer_norm") - if "norm3" in orig_key: - orig_key = orig_key.replace("norm3", "final_layer_norm") - if "linear1" in orig_key: - orig_key = orig_key.replace("linear1", "fc1") - if "linear2" in orig_key: - orig_key = orig_key.replace("linear2", "fc2") - if "query_embed" in orig_key: - orig_key = orig_key.replace("query_embed", "query_position_embeddings") - if "cross_attn" in orig_key: - orig_key = orig_key.replace("cross_attn", "encoder_attn") - - return orig_key - - -def read_in_q_k_v(state_dict): - # transformer decoder self-attention layers - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deformable_detr_checkpoint( - checkpoint_path, - single_scale, - dilation, - with_box_refine, - two_stage, - pytorch_dump_folder_path, - push_to_hub, -): - """ - Copy/paste/tweak model's weights to our Deformable DETR structure. - """ - - # load default config - config = DeformableDetrConfig() - # set config attributes - if single_scale: - config.num_feature_levels = 1 - config.dilation = dilation - config.with_box_refine = with_box_refine - config.two_stage = two_stage - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - image_processor = DeformableDetrImageProcessor(format="coco_detection") - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "model." - for key in state_dict.copy().keys(): - if not key.startswith("class_embed") and not key.startswith("bbox_embed"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = DeformableDetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - # verify our conversion - outputs = model(pixel_values.to(device)) - - expected_logits = torch.tensor( - [[-9.6645, -4.3449, -5.8705], [-9.7035, -3.8504, -5.0724], [-10.5634, -5.3379, -7.5116]] - ) - expected_boxes = torch.tensor([[0.8693, 0.2289, 0.2492], [0.3150, 0.5489, 0.5845], [0.5563, 0.7580, 0.8518]]) - - if single_scale: - expected_logits = torch.tensor( - [[-9.9051, -4.2541, -6.4852], [-9.6947, -4.0854, -6.8033], [-10.0665, -5.8470, -7.7003]] - ) - expected_boxes = torch.tensor([[0.7292, 0.4991, 0.5532], [0.7959, 0.2426, 0.4236], [0.7582, 0.3518, 0.4451]]) - - if single_scale and dilation: - expected_logits = torch.tensor( - [[-8.9652, -4.1074, -5.6635], [-9.0596, -4.9447, -6.6075], [-10.1178, -4.5275, -6.2671]] - ) - expected_boxes = torch.tensor([[0.7665, 0.4130, 0.4769], [0.8364, 0.1841, 0.3391], [0.6261, 0.3895, 0.7978]]) - - if with_box_refine: - expected_logits = torch.tensor( - [[-8.8895, -5.4187, -6.8153], [-8.4706, -6.1668, -7.6184], [-9.0042, -5.5359, -6.9141]] - ) - expected_boxes = torch.tensor([[0.7828, 0.2208, 0.4323], [0.0892, 0.5996, 0.1319], [0.5524, 0.6389, 0.8914]]) - - if with_box_refine and two_stage: - expected_logits = torch.tensor( - [[-6.7108, -4.3213, -6.3777], [-8.9014, -6.1799, -6.7240], [-6.9315, -4.4735, -6.2298]] - ) - expected_boxes = torch.tensor([[0.2583, 0.5499, 0.4683], [0.7652, 0.9068, 0.4882], [0.5490, 0.2763, 0.0564]]) - - print("Logits:", outputs.logits[0, :3, :3]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - - print("Everything ok!") - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - model_name = "deformable-detr" - model_name += "-single-scale" if single_scale else "" - model_name += "-dc5" if dilation else "" - model_name += "-with-box-refine" if with_box_refine else "" - model_name += "-two-stage" if two_stage else "" - print("Pushing model to hub...") - model.push_to_hub(repo_path_or_name=model_name, organization="nielsr", commit_message="Add model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - type=str, - default="/home/niels/checkpoints/deformable_detr/r50_deformable_detr-checkpoint.pth", - help="Path to Pytorch checkpoint (.pth file) you'd like to convert.", - ) - parser.add_argument("--single_scale", action="store_true", help="Whether to set config.num_features_levels = 1.") - parser.add_argument("--dilation", action="store_true", help="Whether to set config.dilation=True.") - parser.add_argument("--with_box_refine", action="store_true", help="Whether to set config.with_box_refine=True.") - parser.add_argument("--two_stage", action="store_true", help="Whether to set config.two_stage=True.") - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - convert_deformable_detr_checkpoint( - args.checkpoint_path, - args.single_scale, - args.dilation, - args.with_box_refine, - args.two_stage, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py b/src/transformers/models/deit/convert_deit_timm_to_pytorch.py deleted file mode 100644 index e7bf3e7a12..0000000000 --- a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py +++ /dev/null @@ -1,218 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DeiT distilled checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import timm -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DeiTConfig, DeiTForImageClassificationWithTeacher, DeiTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, base_model=False): - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"deit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"deit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"deit.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"deit.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"deit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"deit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"deit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"deit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"deit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"deit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - ("cls_token", "deit.embeddings.cls_token"), - ("dist_token", "deit.embeddings.distillation_token"), - ("patch_embed.proj.weight", "deit.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "deit.embeddings.patch_embeddings.projection.bias"), - ("pos_embed", "deit.embeddings.position_embeddings"), - ] - ) - - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) - - # if just the base model, we should remove "deit" from all keys that start with "deit" - rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("deit") else pair for pair in rename_keys] - else: - # layernorm + classification heads - rename_keys.extend( - [ - ("norm.weight", "deit.layernorm.weight"), - ("norm.bias", "deit.layernorm.bias"), - ("head.weight", "cls_classifier.weight"), - ("head.bias", "cls_classifier.bias"), - ("head_dist.weight", "distillation_classifier.weight"), - ("head_dist.bias", "distillation_classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, base_model=False): - for i in range(config.num_hidden_layers): - if base_model: - prefix = "" - else: - prefix = "deit." - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_deit_checkpoint(deit_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our DeiT structure. - """ - - # define default DeiT configuration - config = DeiTConfig() - # all deit models have fine-tuned heads - base_model = False - # dataset (fine-tuned on ImageNet 2012), patch_size and image_size - config.num_labels = 1000 - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - config.patch_size = int(deit_name[-6:-4]) - config.image_size = int(deit_name[-3:]) - # size of the architecture - if deit_name[9:].startswith("tiny"): - config.hidden_size = 192 - config.intermediate_size = 768 - config.num_hidden_layers = 12 - config.num_attention_heads = 3 - elif deit_name[9:].startswith("small"): - config.hidden_size = 384 - config.intermediate_size = 1536 - config.num_hidden_layers = 12 - config.num_attention_heads = 6 - if deit_name[9:].startswith("base"): - pass - elif deit_name[4:].startswith("large"): - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # load original model from timm - timm_model = timm.create_model(deit_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = timm_model.state_dict() - rename_keys = create_rename_keys(config, base_model) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, base_model) - - # load HuggingFace model - model = DeiTForImageClassificationWithTeacher(config).eval() - model.load_state_dict(state_dict) - - # Check outputs on an image, prepared by DeiTImageProcessor - size = int( - (256 / 224) * config.image_size - ) # to maintain same ratio w.r.t. 224 images, see https://github.com/facebookresearch/deit/blob/ab5715372db8c6cad5740714b2216d55aeae052e/datasets.py#L103 - image_processor = DeiTImageProcessor(size=size, crop_size=config.image_size) - encoding = image_processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values) - - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {deit_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--deit_name", - default="vit_deit_base_distilled_patch16_224", - type=str, - help="Name of the DeiT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - - args = parser.parse_args() - convert_deit_checkpoint(args.deit_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py deleted file mode 100644 index 1f3d675e09..0000000000 --- a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py +++ /dev/null @@ -1,318 +0,0 @@ -# coding=utf-8 -# Copyright 2020, The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Bort checkpoint.""" - -import argparse -import os - -import gluonnlp as nlp -import mxnet as mx -import numpy as np -import torch -from gluonnlp.base import get_home_dir -from gluonnlp.model.bert import BERTEncoder -from gluonnlp.model.utils import _load_vocab -from gluonnlp.vocab import Vocab -from packaging import version -from torch import nn - -from transformers import BertConfig, BertForMaskedLM, BertModel, RobertaTokenizer -from transformers.models.bert.modeling_bert import ( - BertIntermediate, - BertLayer, - BertOutput, - BertSelfAttention, - BertSelfOutput, -) -from transformers.utils import logging - - -if version.parse(nlp.__version__) != version.parse("0.8.3"): - raise Exception("requires gluonnlp == 0.8.3") - -if version.parse(mx.__version__) != version.parse("1.5.0"): - raise Exception("requires mxnet == 1.5.0") - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_TEXT = "The Nymphenburg Palace is a beautiful palace in Munich!" - - -def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str): - """ - Convert the original Bort checkpoint (based on MXNET and Gluonnlp) to our BERT structure- - """ - - # Original Bort configuration - bort_4_8_768_1024_hparams = { - "attention_cell": "multi_head", - "num_layers": 4, - "units": 1024, - "hidden_size": 768, - "max_length": 512, - "num_heads": 8, - "scaled": True, - "dropout": 0.1, - "use_residual": True, - "embed_size": 1024, - "embed_dropout": 0.1, - "word_embed": None, - "layer_norm_eps": 1e-5, - "token_type_vocab_size": 2, - } - - predefined_args = bort_4_8_768_1024_hparams - - # Let's construct the original Bort model here - # Taken from official BERT implementation, see: - # https://github.com/alexa/bort/blob/master/bort/bort.py - encoder = BERTEncoder( - attention_cell=predefined_args["attention_cell"], - num_layers=predefined_args["num_layers"], - units=predefined_args["units"], - hidden_size=predefined_args["hidden_size"], - max_length=predefined_args["max_length"], - num_heads=predefined_args["num_heads"], - scaled=predefined_args["scaled"], - dropout=predefined_args["dropout"], - output_attention=False, - output_all_encodings=False, - use_residual=predefined_args["use_residual"], - activation=predefined_args.get("activation", "gelu"), - layer_norm_eps=predefined_args.get("layer_norm_eps", None), - ) - - # Vocab information needs to be fetched first - # It's the same as RoBERTa, so RobertaTokenizer can be used later - vocab_name = "openwebtext_ccnews_stories_books_cased" - - # Specify download folder to Gluonnlp's vocab - gluon_cache_dir = os.path.join(get_home_dir(), "models") - bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab) - - original_bort = nlp.model.BERTModel( - encoder, - len(bort_vocab), - units=predefined_args["units"], - embed_size=predefined_args["embed_size"], - embed_dropout=predefined_args["embed_dropout"], - word_embed=predefined_args["word_embed"], - use_pooler=False, - use_token_type_embed=False, - token_type_vocab_size=predefined_args["token_type_vocab_size"], - use_classifier=False, - use_decoder=False, - ) - - original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True) - params = original_bort._collect_params_with_prefix() - - # Build our config đŸ€— - hf_bort_config_json = { - "architectures": ["BertForMaskedLM"], - "attention_probs_dropout_prob": predefined_args["dropout"], - "hidden_act": "gelu", - "hidden_dropout_prob": predefined_args["dropout"], - "hidden_size": predefined_args["embed_size"], - "initializer_range": 0.02, - "intermediate_size": predefined_args["hidden_size"], - "layer_norm_eps": predefined_args["layer_norm_eps"], - "max_position_embeddings": predefined_args["max_length"], - "model_type": "bort", - "num_attention_heads": predefined_args["num_heads"], - "num_hidden_layers": predefined_args["num_layers"], - "pad_token_id": 1, # 2 = BERT, 1 = RoBERTa - "type_vocab_size": 1, # 2 = BERT, 1 = RoBERTa - "vocab_size": len(bort_vocab), - } - - hf_bort_config = BertConfig.from_dict(hf_bort_config_json) - hf_bort_model = BertForMaskedLM(hf_bort_config) - hf_bort_model.eval() - - # Parameter mapping table (Gluonnlp to Transformers) - # * denotes layer index - # - # | Gluon Parameter | Transformers Parameter - # | -------------------------------------------------------------- | ---------------------- - # | `encoder.layer_norm.beta` | `bert.embeddings.LayerNorm.bias` - # | `encoder.layer_norm.gamma` | `bert.embeddings.LayerNorm.weight` - # | `encoder.position_weight` | `bert.embeddings.position_embeddings.weight` - # | `word_embed.0.weight` | `bert.embeddings.word_embeddings.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_key.bias` | `bert.encoder.layer.*.attention.self.key.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_key.weight` | `bert.encoder.layer.*.attention.self.key.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_query.bias` | `bert.encoder.layer.*.attention.self.query.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_query.weight` | `bert.encoder.layer.*.attention.self.query.weight` - # | `encoder.transformer_cells.*.attention_cell.proj_value.bias` | `bert.encoder.layer.*.attention.self.value.bias` - # | `encoder.transformer_cells.*.attention_cell.proj_value.weight` | `bert.encoder.layer.*.attention.self.value.weight` - # | `encoder.transformer_cells.*.ffn.ffn_2.bias` | `bert.encoder.layer.*.attention.output.dense.bias` - # | `encoder.transformer_cells.*.ffn.ffn_2.weight` | `bert.encoder.layer.*.attention.output.dense.weight` - # | `encoder.transformer_cells.*.layer_norm.beta` | `bert.encoder.layer.*.attention.output.LayerNorm.bias` - # | `encoder.transformer_cells.*.layer_norm.gamma` | `bert.encoder.layer.*.attention.output.LayerNorm.weight` - # | `encoder.transformer_cells.*.ffn.ffn_1.bias` | `bert.encoder.layer.*.intermediate.dense.bias` - # | `encoder.transformer_cells.*.ffn.ffn_1.weight` | `bert.encoder.layer.*.intermediate.dense.weight` - # | `encoder.transformer_cells.*.ffn.layer_norm.beta` | `bert.encoder.layer.*.output.LayerNorm.bias` - # | `encoder.transformer_cells.*.ffn.layer_norm.gamma` | `bert.encoder.layer.*.output.LayerNorm.weight` - # | `encoder.transformer_cells.*.proj.bias` | `bert.encoder.layer.*.output.dense.bias` - # | `encoder.transformer_cells.*.proj.weight` | `bert.encoder.layer.*.output.dense.weight` - - # Helper function to convert MXNET Arrays to PyTorch - def to_torch(mx_array) -> nn.Parameter: - return nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy())) - - # Check param shapes and map new HF param back - def check_and_map_params(hf_param, gluon_param): - shape_hf = hf_param.shape - - gluon_param = to_torch(params[gluon_param]) - shape_gluon = gluon_param.shape - - assert shape_hf == shape_gluon, ( - f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers" - ) - - return gluon_param - - hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params( - hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight" - ) - hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params( - hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight" - ) - hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params( - hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta" - ) - hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params( - hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma" - ) - - # Inspired by RoBERTa conversion script, we just zero them out (Bort does not use them) - hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like( - hf_bort_model.bert.embeddings.token_type_embeddings.weight.data - ) - - for i in range(hf_bort_config.num_hidden_layers): - layer: BertLayer = hf_bort_model.bert.encoder.layer[i] - - # self attention - self_attn: BertSelfAttention = layer.attention.self - - self_attn.key.bias.data = check_and_map_params( - self_attn.key.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.bias" - ) - - self_attn.key.weight.data = check_and_map_params( - self_attn.key.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.weight" - ) - self_attn.query.bias.data = check_and_map_params( - self_attn.query.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.bias" - ) - self_attn.query.weight.data = check_and_map_params( - self_attn.query.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.weight" - ) - self_attn.value.bias.data = check_and_map_params( - self_attn.value.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.bias" - ) - self_attn.value.weight.data = check_and_map_params( - self_attn.value.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.weight" - ) - - # self attention output - self_output: BertSelfOutput = layer.attention.output - - self_output.dense.bias = check_and_map_params( - self_output.dense.bias, f"encoder.transformer_cells.{i}.proj.bias" - ) - self_output.dense.weight = check_and_map_params( - self_output.dense.weight, f"encoder.transformer_cells.{i}.proj.weight" - ) - self_output.LayerNorm.bias = check_and_map_params( - self_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.layer_norm.beta" - ) - self_output.LayerNorm.weight = check_and_map_params( - self_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.layer_norm.gamma" - ) - - # intermediate - intermediate: BertIntermediate = layer.intermediate - - intermediate.dense.bias = check_and_map_params( - intermediate.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_1.bias" - ) - intermediate.dense.weight = check_and_map_params( - intermediate.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_1.weight" - ) - - # output - bert_output: BertOutput = layer.output - - bert_output.dense.bias = check_and_map_params( - bert_output.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_2.bias" - ) - bert_output.dense.weight = check_and_map_params( - bert_output.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_2.weight" - ) - bert_output.LayerNorm.bias = check_and_map_params( - bert_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.ffn.layer_norm.beta" - ) - bert_output.LayerNorm.weight = check_and_map_params( - bert_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.ffn.layer_norm.gamma" - ) - - # Save space and energy 🎄 - hf_bort_model.half() - - # Compare output of both models - tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base") - - input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"] - - # Get gluon output - gluon_input_ids = mx.nd.array([input_ids]) - output_gluon = original_bort(inputs=gluon_input_ids, token_types=[]) - - # Get Transformer output (save and reload model again) - hf_bort_model.save_pretrained(pytorch_dump_folder_path) - hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path) - hf_bort_model.eval() - - input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt") - output_hf = hf_bort_model(**input_ids)[0] - - gluon_layer = output_gluon[0].asnumpy() - hf_layer = output_hf[0].detach().numpy() - - max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item() - success = np.allclose(gluon_layer, hf_layer, atol=1e-3) - - if success: - print("✔ Both model do output the same tensors") - else: - print("❌ Both model do **NOT** output the same tensors") - print("Absolute difference is:", max_absolute_diff) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--bort_checkpoint_path", default=None, type=str, required=True, help="Path the official Bort params file." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_bort_checkpoint_to_pytorch(args.bort_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py deleted file mode 100644 index 6436451190..0000000000 --- a/src/transformers/models/deprecated/deta/convert_deta_resnet_to_pytorch.py +++ /dev/null @@ -1,319 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETA checkpoints from the original repository. - -URL: https://github.com/jozhang97/DETA/tree/master""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_deta_config(): - config = DetaConfig( - num_queries=900, - encoder_ffn_dim=2048, - decoder_ffn_dim=2048, - num_feature_levels=5, - assign_first_stage=True, - with_box_refine=True, - two_stage=True, - ) - - # set labels - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.conv1.weight", "model.backbone.model.embedder.embedder.convolution.weight")) - rename_keys.append(("backbone.0.body.bn1.weight", "model.backbone.model.embedder.embedder.normalization.weight")) - rename_keys.append(("backbone.0.body.bn1.bias", "model.backbone.model.embedder.embedder.normalization.bias")) - rename_keys.append(("backbone.0.body.bn1.running_mean", "model.backbone.model.embedder.embedder.normalization.running_mean")) - rename_keys.append(("backbone.0.body.bn1.running_var", "model.backbone.model.embedder.embedder.normalization.running_var")) - # stages - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - # shortcut - if layer_idx == 0: - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var", - ) - ) - # 3 convs - for i in range(3): - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var", - f"model.backbone.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var", - ) - ) - # transformer encoder - for i in range(config.encoder_layers): - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias")) - - # transformer decoder - for i in range(config.decoder_layers): - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")) - - # fmt: on - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_decoder_q_k_v(state_dict, config): - # transformer decoder self-attention layers - hidden_size = config.d_model - for i in range(config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DETA structure. - """ - - # load config - config = get_deta_config() - - # load original state dict - if model_name == "deta-resnet-50": - filename = "adet_checkpoint0011.pth" - elif model_name == "deta-resnet-50-24-epochs": - filename = "adet_2x_checkpoint0023.pth" - else: - raise ValueError(f"Model name {model_name} not supported") - checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename=filename) - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_decoder_q_k_v(state_dict, config) - - # fix some prefixes - for key in state_dict.copy().keys(): - if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer.decoder", "model.decoder")] = val - if "input_proj" in key: - val = state_dict.pop(key) - state_dict["model." + key] = val - if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer", "model")] = val - - # finally, create HuggingFace model and load state dict - model = DetaForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - - # load image processor - processor = DetaImageProcessor(format="coco_detection") - - # verify our conversion on image - img = prepare_img() - encoding = processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values.to(device)) - - # verify logits - if model_name == "deta-resnet-50": - expected_logits = torch.tensor( - [[-7.3978, -2.5406, -4.1668], [-8.2684, -3.9933, -3.8096], [-7.0515, -3.7973, -5.8516]] - ) - expected_boxes = torch.tensor([[0.5043, 0.4973, 0.9998], [0.2542, 0.5489, 0.4748], [0.5490, 0.2765, 0.0570]]) - elif model_name == "deta-resnet-50-24-epochs": - expected_logits = torch.tensor( - [[-7.1688, -2.4857, -4.8669], [-7.8630, -3.8154, -4.2674], [-7.2730, -4.1865, -5.5323]] - ) - expected_boxes = torch.tensor([[0.5021, 0.4971, 0.9994], [0.2546, 0.5486, 0.4731], [0.1686, 0.1986, 0.2142]]) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - print("Everything ok!") - - if pytorch_dump_folder_path: - # Save model and processor - logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(f"jozhang97/{model_name}") - processor.push_to_hub(f"jozhang97/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - type=str, - default="deta-resnet-50", - choices=["deta-resnet-50", "deta-resnet-50-24-epochs"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py b/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py deleted file mode 100644 index c2e1ae6001..0000000000 --- a/src/transformers/models/deprecated/deta/convert_deta_swin_to_pytorch.py +++ /dev/null @@ -1,326 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETA checkpoints from the original repository. - -URL: https://github.com/jozhang97/DETA/tree/master""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor, SwinConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_deta_config(model_name): - backbone_config = SwinConfig( - embed_dim=192, - depths=(2, 2, 18, 2), - num_heads=(6, 12, 24, 48), - window_size=12, - out_features=["stage2", "stage3", "stage4"], - ) - - config = DetaConfig( - backbone_config=backbone_config, - num_queries=900, - encoder_ffn_dim=2048, - decoder_ffn_dim=2048, - num_feature_levels=5, - assign_first_stage=True, - with_box_refine=True, - two_stage=True, - ) - - # set labels - repo_id = "huggingface/label-files" - if "o365" in model_name: - num_labels = 366 - filename = "object365-id2label.json" - else: - num_labels = 91 - filename = "coco-detection-id2label.json" - - config.num_labels = num_labels - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.patch_embed.proj.weight", "model.backbone.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.body.patch_embed.proj.bias", "model.backbone.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.body.patch_embed.norm.weight", "model.backbone.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.body.patch_embed.norm.bias", "model.backbone.model.embeddings.norm.bias")) - # stages - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_before.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_bias_table", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_bias_table")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.relative_position_index", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.relative_position_index")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.attn.proj.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.norm2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.layernorm_after.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc1.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.weight", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.blocks.{j}.mlp.fc2.bias", f"model.backbone.model.encoder.layers.{i}.blocks.{j}.output.dense.bias")) - - if i < 3: - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.reduction.weight", f"model.backbone.model.encoder.layers.{i}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.weight", f"model.backbone.model.encoder.layers.{i}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.body.layers.{i}.downsample.norm.bias", f"model.backbone.model.encoder.layers.{i}.downsample.norm.bias")) - - rename_keys.append(("backbone.0.body.norm1.weight", "model.backbone.model.hidden_states_norms.stage2.weight")) - rename_keys.append(("backbone.0.body.norm1.bias", "model.backbone.model.hidden_states_norms.stage2.bias")) - rename_keys.append(("backbone.0.body.norm2.weight", "model.backbone.model.hidden_states_norms.stage3.weight")) - rename_keys.append(("backbone.0.body.norm2.bias", "model.backbone.model.hidden_states_norms.stage3.bias")) - rename_keys.append(("backbone.0.body.norm3.weight", "model.backbone.model.hidden_states_norms.stage4.weight")) - rename_keys.append(("backbone.0.body.norm3.bias", "model.backbone.model.hidden_states_norms.stage4.bias")) - - # transformer encoder - for i in range(config.encoder_layers): - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.weight", f"model.encoder.layers.{i}.self_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.sampling_offsets.bias", f"model.encoder.layers.{i}.self_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.weight", f"model.encoder.layers.{i}.self_attn.attention_weights.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.attention_weights.bias", f"model.encoder.layers.{i}.self_attn.attention_weights.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.weight", f"model.encoder.layers.{i}.self_attn.value_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.value_proj.bias", f"model.encoder.layers.{i}.self_attn.value_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.weight", f"model.encoder.layers.{i}.self_attn.output_proj.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.self_attn.output_proj.bias", f"model.encoder.layers.{i}.self_attn.output_proj.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.weight", f"model.encoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"model.encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"model.encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"model.encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"model.encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"model.encoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"model.encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"model.encoder.layers.{i}.final_layer_norm.bias")) - - # transformer decoder - for i in range(config.decoder_layers): - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.weight", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.sampling_offsets.bias", f"model.decoder.layers.{i}.encoder_attn.sampling_offsets.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.weight", f"model.decoder.layers.{i}.encoder_attn.attention_weights.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.attention_weights.bias", f"model.decoder.layers.{i}.encoder_attn.attention_weights.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.weight", f"model.decoder.layers.{i}.encoder_attn.value_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.value_proj.bias", f"model.decoder.layers.{i}.encoder_attn.value_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.weight", f"model.decoder.layers.{i}.encoder_attn.output_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.cross_attn.output_proj.bias", f"model.decoder.layers.{i}.encoder_attn.output_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.weight", f"model.decoder.layers.{i}.encoder_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"model.decoder.layers.{i}.encoder_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"model.decoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"model.decoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.weight", f"model.decoder.layers.{i}.self_attn_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm2.bias", f"model.decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"model.decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"model.decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"model.decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"model.decoder.layers.{i}.fc2.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"model.decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"model.decoder.layers.{i}.final_layer_norm.bias")) - - # fmt: on - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_swin_q_k_v(state_dict, backbone_config): - num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))] - for i in range(len(backbone_config.depths)): - dim = num_features[i] - for j in range(backbone_config.depths[i]): - # fmt: off - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"backbone.0.body.layers.{i}.blocks.{j}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[ - dim : dim * 2 - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[ - -dim :, : - ] - state_dict[f"model.backbone.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :] - # fmt: on - - -def read_in_decoder_q_k_v(state_dict, config): - # transformer decoder self-attention layers - hidden_size = config.d_model - for i in range(config.decoder_layers): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:hidden_size] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_deta_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DETA structure. - """ - - # load config - config = get_deta_config(model_name) - - # load original state dict - if model_name == "deta-swin-large": - checkpoint_path = hf_hub_download(repo_id="nielsr/deta-checkpoints", filename="adet_swin_ft.pth") - elif model_name == "deta-swin-large-o365": - checkpoint_path = hf_hub_download(repo_id="jozhang97/deta-swin-l-o365", filename="deta_swin_pt_o365.pth") - else: - raise ValueError(f"Model name {model_name} not supported") - - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - - # original state dict - for name, param in state_dict.items(): - print(name, param.shape) - - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_swin_q_k_v(state_dict, config.backbone_config) - read_in_decoder_q_k_v(state_dict, config) - - # fix some prefixes - for key in state_dict.copy().keys(): - if "transformer.decoder.class_embed" in key or "transformer.decoder.bbox_embed" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer.decoder", "model.decoder")] = val - if "input_proj" in key: - val = state_dict.pop(key) - state_dict["model." + key] = val - if "level_embed" in key or "pos_trans" in key or "pix_trans" in key or "enc_output" in key: - val = state_dict.pop(key) - state_dict[key.replace("transformer", "model")] = val - - # finally, create HuggingFace model and load state dict - model = DetaForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - device = "cuda" if torch.cuda.is_available() else "cpu" - model.to(device) - - # load image processor - processor = DetaImageProcessor(format="coco_detection") - - # verify our conversion on image - img = prepare_img() - encoding = processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - outputs = model(pixel_values.to(device)) - - # verify logits - print("Logits:", outputs.logits[0, :3, :3]) - print("Boxes:", outputs.pred_boxes[0, :3, :3]) - if model_name == "deta-swin-large": - expected_logits = torch.tensor( - [[-7.6308, -2.8485, -5.3737], [-7.2037, -4.5505, -4.8027], [-7.2943, -4.2611, -4.6617]] - ) - expected_boxes = torch.tensor([[0.4987, 0.4969, 0.9999], [0.2549, 0.5498, 0.4805], [0.5498, 0.2757, 0.0569]]) - elif model_name == "deta-swin-large-o365": - expected_logits = torch.tensor( - [[-8.0122, -3.5720, -4.9717], [-8.1547, -3.6886, -4.6389], [-7.6610, -3.6194, -5.0134]] - ) - expected_boxes = torch.tensor([[0.2523, 0.5549, 0.4881], [0.7715, 0.4149, 0.4601], [0.5503, 0.2753, 0.0575]]) - assert torch.allclose(outputs.logits[0, :3, :3], expected_logits.to(device), atol=1e-4) - assert torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes.to(device), atol=1e-4) - print("Everything ok!") - - if pytorch_dump_folder_path: - # Save model and processor - logger.info(f"Saving PyTorch model and processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - # Push to hub - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(f"jozhang97/{model_name}") - processor.push_to_hub(f"jozhang97/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - type=str, - default="deta-swin-large", - choices=["deta-swin-large", "deta-swin-large-o365"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the folder to output PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - convert_deta_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 80f16881b5..0000000000 --- a/src/transformers/models/deprecated/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert EfficientFormer checkpoints from the original repository. - -URL: https://github.com/snap-research/EfficientFormer -""" - -import argparse -import re -from pathlib import Path - -import requests -import torch -from PIL import Image -from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor - -from transformers import ( - EfficientFormerConfig, - EfficientFormerForImageClassificationWithTeacher, - EfficientFormerImageProcessor, -) -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling - - -def rename_key(old_name, num_meta4D_last_stage): - new_name = old_name - - if "patch_embed" in old_name: - _, layer, param = old_name.split(".") - - if layer == "0": - new_name = old_name.replace("0", "convolution1") - elif layer == "1": - new_name = old_name.replace("1", "batchnorm_before") - elif layer == "3": - new_name = old_name.replace("3", "convolution2") - else: - new_name = old_name.replace("4", "batchnorm_after") - - if "network" in old_name and re.search(r"\d\.\d", old_name): - two_digit_num = r"\b\d{2}\b" - if bool(re.search(two_digit_num, old_name)): - match = re.search(r"\d\.\d\d.", old_name).group() - else: - match = re.search(r"\d\.\d.", old_name).group() - if int(match[0]) < 6: - trimmed_name = old_name.replace(match, "") - trimmed_name = trimmed_name.replace("network", match[0] + ".meta4D_layers.blocks." + match[2:-1]) - new_name = "intermediate_stages." + trimmed_name - else: - trimmed_name = old_name.replace(match, "") - if int(match[2]) < num_meta4D_last_stage: - trimmed_name = trimmed_name.replace("network", "meta4D_layers.blocks." + match[2]) - else: - layer_index = str(int(match[2]) - num_meta4D_last_stage) - trimmed_name = trimmed_name.replace("network", "meta3D_layers.blocks." + layer_index) - if "norm1" in old_name: - trimmed_name = trimmed_name.replace("norm1", "layernorm1") - elif "norm2" in old_name: - trimmed_name = trimmed_name.replace("norm2", "layernorm2") - elif "fc1" in old_name: - trimmed_name = trimmed_name.replace("fc1", "linear_in") - elif "fc2" in old_name: - trimmed_name = trimmed_name.replace("fc2", "linear_out") - - new_name = "last_stage." + trimmed_name - - elif "network" in old_name and re.search(r".\d.", old_name): - new_name = old_name.replace("network", "intermediate_stages") - - if "fc" in new_name: - new_name = new_name.replace("fc", "convolution") - elif ("norm1" in new_name) and ("layernorm1" not in new_name): - new_name = new_name.replace("norm1", "batchnorm_before") - elif ("norm2" in new_name) and ("layernorm2" not in new_name): - new_name = new_name.replace("norm2", "batchnorm_after") - if "proj" in new_name: - new_name = new_name.replace("proj", "projection") - if "dist_head" in new_name: - new_name = new_name.replace("dist_head", "distillation_classifier") - elif "head" in new_name: - new_name = new_name.replace("head", "classifier") - elif "patch_embed" in new_name: - new_name = "efficientformer." + new_name - elif new_name == "norm.weight" or new_name == "norm.bias": - new_name = new_name.replace("norm", "layernorm") - new_name = "efficientformer." + new_name - else: - new_name = "efficientformer.encoder." + new_name - - return new_name - - -def convert_torch_checkpoint(checkpoint, num_meta4D_last_stage): - for key in checkpoint.copy().keys(): - val = checkpoint.pop(key) - checkpoint[rename_key(key, num_meta4D_last_stage)] = val - - return checkpoint - - -# We will verify our results on a COCO image -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -def convert_efficientformer_checkpoint( - checkpoint_path: Path, efficientformer_config_file: Path, pytorch_dump_path: Path, push_to_hub: bool -): - orig_state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - config = EfficientFormerConfig.from_json_file(efficientformer_config_file) - model = EfficientFormerForImageClassificationWithTeacher(config) - model_name = "_".join(checkpoint_path.split("/")[-1].split(".")[0].split("_")[:-1]) - - num_meta4D_last_stage = config.depths[-1] - config.num_meta3d_blocks + 1 - new_state_dict = convert_torch_checkpoint(orig_state_dict, num_meta4D_last_stage) - - model.load_state_dict(new_state_dict) - model.eval() - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - # prepare image - image = prepare_img() - image_size = 256 - crop_size = 224 - processor = EfficientFormerImageProcessor( - size={"shortest_edge": image_size}, - crop_size={"height": crop_size, "width": crop_size}, - resample=pillow_resamplings["bicubic"], - ) - pixel_values = processor(images=image, return_tensors="pt").pixel_values - - # original processing pipeline - image_transforms = Compose( - [ - Resize(image_size, interpolation=pillow_resamplings["bicubic"]), - CenterCrop(crop_size), - ToTensor(), - Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ] - ) - original_pixel_values = image_transforms(image).unsqueeze(0) - - assert torch.allclose(original_pixel_values, pixel_values) - - outputs = model(pixel_values) - logits = outputs.logits - - expected_shape = (1, 1000) - - if "l1" in model_name: - expected_logits = torch.Tensor( - [-0.1312, 0.4353, -1.0499, -0.5124, 0.4183, -0.6793, -1.3777, -0.0893, -0.7358, -2.4328] - ) - assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - elif "l3" in model_name: - expected_logits = torch.Tensor( - [-1.3150, -1.5456, -1.2556, -0.8496, -0.7127, -0.7897, -0.9728, -0.3052, 0.3751, -0.3127] - ) - assert torch.allclose(logits[0, :10], expected_logits, atol=1e-3) - assert logits.shape == expected_shape - elif "l7" in model_name: - expected_logits = torch.Tensor( - [-1.0283, -1.4131, -0.5644, -1.3115, -0.5785, -1.2049, -0.7528, 0.1992, -0.3822, -0.0878] - ) - assert logits.shape == expected_shape - else: - raise ValueError( - f"Unknown model checkpoint: {checkpoint_path}. Supported version of efficientformer are l1, l3 and l7" - ) - - # Save Checkpoints - Path(pytorch_dump_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_path) - print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}") - processor.save_pretrained(pytorch_dump_path) - print(f"Processor successfully saved at {pytorch_dump_path}") - - if push_to_hub: - print("Pushing model to the hub...") - - model.push_to_hub( - repo_id=f"Bearnardd/{pytorch_dump_path}", - commit_message="Add model", - use_temp_dir=True, - ) - processor.push_to_hub( - repo_id=f"Bearnardd/{pytorch_dump_path}", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--pytorch_model_path", - default=None, - type=str, - required=True, - help="Path to EfficientFormer pytorch checkpoint.", - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The json file for EfficientFormer model config.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - parser.add_argument( - "--no-push_to_hub", - dest="push_to_hub", - action="store_false", - help="Do not push model and image processor to the hub", - ) - parser.set_defaults(push_to_hub=True) - - args = parser.parse_args() - convert_efficientformer_checkpoint( - checkpoint_path=args.pytorch_model_path, - efficientformer_config_file=args.config_file, - pytorch_dump_path=args.pytorch_dump_path, - push_to_hub=args.push_to_hub, - ) diff --git a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 8aa927d821..0000000000 --- a/src/transformers/models/deprecated/gptsan_japanese/convert_gptsan_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,181 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Convert GPTSANJapanese checkpoints from the original repository to pytorch model.""" - -import argparse -import json -import os -from collections import OrderedDict - -import numpy as np -import tensorflow as tf -import torch - - -def convert_tf_gptsan_to_pt(args): - parameter_file = os.path.join(args.tf_model_dir, "parameters.json") - params = json.loads(open(parameter_file).read()) - if not params: - raise ValueError( - f"It seems that the json file at {parameter_file} is empty. Make sure you have a correct json file." - ) - if not args.output.endswith(".pt"): - args.output = args.output + ".pt" - new_state = OrderedDict() - with tf.device("/CPU:0"): - reader = tf.train.load_checkpoint(args.tf_model_dir) - shapes = reader.get_variable_to_shape_map() - for key_name in shapes.keys(): - vnp = reader.get_tensor(key_name).astype(np.float16) - if key_name.endswith("/adam_m") or key_name.endswith("/adam_v"): - continue - if key_name.startswith("pasts/"): - if key_name.startswith("pasts/mlp"): - player = int(key_name[9]) - elif key_name.startswith("pasts/out"): - player = 8 - name = "model.sqout.%d.weight" % (player * 2) # enter to nn.Sequential with Tanh, so 2 at a time - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/moe"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/switch_gating/kernel"): - name = "model.blocks.%d.feed_forward.mlp.router.classifier.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/softmlp/kernel"): - name = "model.blocks.%d.feed_forward.soft_bypass_mlp.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/wo/kernel") or key_name.endswith("/wi/kernel"): - nlayer = key_name[-9:-7] - for i in range(16): - name = "model.blocks.%d.feed_forward.mlp.experts.expert_%d.%s.weight" % (player, i, nlayer) - state = ( - vnp[i].transpose([1, 0]).copy() - ) # In Mesh-Tensorflow, it is one array, so it is divided - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/mlp"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/p1/kernel"): - name = "model.blocks.%d.feed_forward.mlp.wi.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p1/bias"): - name = "model.blocks.%d.feed_forward.mlp.wi.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p2/kernel"): - name = "model.blocks.%d.feed_forward.mlp.wo.weight" % player - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.endswith("/p2/bias"): - name = "model.blocks.%d.feed_forward.mlp.wo.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/ln"): - player = int(key_name[8:].split("/")[0]) - if key_name.endswith("/b"): - name = "model.blocks.%d.feed_forward.norm.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/g"): - name = "model.blocks.%d.feed_forward.norm.weight" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/att"): - player = int(key_name[9:].split("/")[0]) - if key_name.endswith("/qkv/kernel"): - state = vnp.copy() # Compute same dimension as Mesh-tensorflow using einsum - state_q = state[:, 0, :, :] - state_k = state[:, 1, :, :] - state_v = state[:, 2, :, :] - state_q = ( - state_q.reshape([state_q.shape[0], state_q.shape[1] * state_q.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - state_k = ( - state_k.reshape([state_k.shape[0], state_k.shape[1] * state_k.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - state_v = ( - state_v.reshape([state_v.shape[0], state_v.shape[1] * state_v.shape[2]]) - .transpose([1, 0]) - .copy() - ) # Mesh-Tensorflow is a diagonal matrix - name = "model.blocks.%d.self_attn.self_attn.q_proj.weight" % player - new_state[name] = torch.tensor(state_q) - name = "model.blocks.%d.self_attn.self_attn.k_proj.weight" % player - new_state[name] = torch.tensor(state_k) - name = "model.blocks.%d.self_attn.self_attn.v_proj.weight" % player - new_state[name] = torch.tensor(state_v) - elif key_name.endswith("/o/kernel"): - name = "model.blocks.%d.self_attn.self_attn.out_proj.weight" % player - state = ( - vnp.reshape([vnp.shape[0] * vnp.shape[1], vnp.shape[2]]).transpose([1, 0]).copy() - ) # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/an"): - player = int(key_name[8:].split("/")[0]) - if key_name.endswith("/b"): - name = "model.blocks.%d.self_attn.norm.bias" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif key_name.endswith("/g"): - name = "model.blocks.%d.self_attn.norm.weight" % player - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - elif ( - key_name.startswith("model/wte") - or key_name.startswith("model/wpe") - or key_name.startswith("model/ete") - ): - nlayer = {"wte": "embed_tokens", "wpe": "position_embeddings", "ete": "extra_position_embeddings"}[ - key_name[-3:] - ] - name = "model.%s.weight" % nlayer - state = vnp.copy() # same in embedded - new_state[name] = torch.tensor(state) - if key_name.startswith("model/wte"): - name = "lm_head.weight" - state = vnp.copy() # same in embedded - new_state[name] = torch.tensor(state) - elif key_name.startswith("model/wob"): - name = "final_logits_bias" - state = vnp.copy() # same in embedded - state = state.reshape((1, -1)) - new_state[name] = torch.tensor(state) - elif key_name == "model/dense/kernel": - name = "model.last_project.weight" - state = vnp.transpose([1, 0]).copy() # Mesh-Tensorflow is a diagonal matrix - new_state[name] = torch.tensor(state) - elif key_name == "model/dense_1/bias": - name = "model.last_project.bias" - state = vnp.copy() # same because it is one dimensional - new_state[name] = torch.tensor(state) - torch.save(new_state, args.output) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="model converter.", formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--tf_model_dir", metavar="PATH", type=str, required=True, help="import model") - parser.add_argument("--output", metavar="PATH", type=str, required=True, help="output model") - args = parser.parse_args() - convert_tf_gptsan_to_pt(args) diff --git a/src/transformers/models/deprecated/jukebox/convert_jukebox.py b/src/transformers/models/deprecated/jukebox/convert_jukebox.py deleted file mode 100644 index 3380e38693..0000000000 --- a/src/transformers/models/deprecated/jukebox/convert_jukebox.py +++ /dev/null @@ -1,279 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Jukebox checkpoints""" - -import argparse -import json -import os -from pathlib import Path - -import requests -import torch - -from transformers import JukeboxConfig, JukeboxModel -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -PREFIX = "https://openaipublic.azureedge.net/jukebox/models/" -MODEL_MAPPING = { - "jukebox-1b-lyrics": [ - "5b/vqvae.pth.tar", - "5b/prior_level_0.pth.tar", - "5b/prior_level_1.pth.tar", - "1b_lyrics/prior_level_2.pth.tar", - ], - "jukebox-5b-lyrics": [ - "5b/vqvae.pth.tar", - "5b/prior_level_0.pth.tar", - "5b/prior_level_1.pth.tar", - "5b_lyrics/prior_level_2.pth.tar", - ], -} - - -def replace_key(key): - if key.endswith(".model.1.bias") and len(key.split(".")) > 10: - key = key.replace(".model.1.bias", ".conv1d_1.bias") - elif key.endswith(".model.1.weight") and len(key.split(".")) > 10: - key = key.replace(".model.1.weight", ".conv1d_1.weight") - elif key.endswith(".model.3.bias") and len(key.split(".")) > 10: - key = key.replace(".model.3.bias", ".conv1d_2.bias") - elif key.endswith(".model.3.weight") and len(key.split(".")) > 10: - key = key.replace(".model.3.weight", ".conv1d_2.weight") - - if "conditioner_blocks.0." in key: - key = key.replace("conditioner_blocks.0", "conditioner_blocks") - - if "prime_prior" in key: - key = key.replace("prime_prior", "encoder") - - if ".emb." in key and "total" not in key and "absolute" not in key and "relative" not in key: - key = key.replace(".emb.", ".") - - if key.endswith("k"): # replace vqvae.X.k with vqvae.X.codebook - return key.replace(".k", ".codebook") - if "y_emb." in key: - return key.replace("y_emb.", "metadata_embedding.") - - if "x_emb.emb." in key: - key = key.replace("0.x_emb.emb", "embed_tokens") - - if "prime_state_ln" in key: - return key.replace("prime_state_ln", "encoder.final_layer_norm") - if ".ln" in key: - return key.replace(".ln", ".layer_norm") - if "_ln" in key: - return key.replace("_ln", "_layer_norm") - - if "prime_state_proj" in key: - return key.replace("prime_state_proj", "encoder.proj_in") - if "prime_x_out" in key: - return key.replace("prime_x_out", "encoder.lm_head") - if "prior.x_out" in key: - return key.replace("x_out", "fc_proj_out") - if "x_emb" in key: - return key.replace("x_emb", "embed_tokens") - - return key - - -def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping): - new_dict = {} - import re - - re_encoder_block_conv_in = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)") - re_encoder_block_resnet = re.compile( - r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_encoder_block_proj_out = re.compile(r"encoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)") - - re_decoder_block_conv_out = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).(bias|weight)") - re_decoder_block_resnet = re.compile( - r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_decoder_block_proj_in = re.compile(r"decoders.(\d*).level_blocks.(\d*).model.(\d*).(bias|weight)") - - re_prior_cond_conv_out = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).(bias|weight)") - re_prior_cond_resnet = re.compile( - r"conditioner_blocks.(\d*).cond.model.(\d*).(\d).model.(\d*).model.(\d*).(bias|weight)" - ) - re_prior_cond_proj_in = re.compile(r"conditioner_blocks.(\d*).cond.model.(\d*).(bias|weight)") - - for original_key, value in state_dict.items(): - # rename vqvae.encoder keys - if re_encoder_block_conv_in.fullmatch(original_key): - regex_match = re_encoder_block_conv_in.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}.{groups[-1]}" - key = re_encoder_block_conv_in.sub(re_new_key, original_key) - - elif re_encoder_block_resnet.fullmatch(original_key): - regex_match = re_encoder_block_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"encoders.{groups[0]}.level_blocks.{groups[1]}.downsample_block.{block_index}." - resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_encoder_block_resnet.sub(re_new_key, original_key) - - elif re_encoder_block_proj_out.fullmatch(original_key): - regex_match = re_encoder_block_proj_out.match(original_key) - groups = regex_match.groups() - re_new_key = f"encoders.{groups[0]}.level_blocks.{groups[1]}.proj_out.{groups[-1]}" - key = re_encoder_block_proj_out.sub(re_new_key, original_key) - - # rename vqvae.decoder keys - elif re_decoder_block_conv_out.fullmatch(original_key): - regex_match = re_decoder_block_conv_out.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - 2 - re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}.{groups[-1]}" - key = re_decoder_block_conv_out.sub(re_new_key, original_key) - - elif re_decoder_block_resnet.fullmatch(original_key): - regex_match = re_decoder_block_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[2]) * 2 + int(groups[3]) - 2 - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"decoders.{groups[0]}.level_blocks.{groups[1]}.upsample_block.{block_index}." - resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_decoder_block_resnet.sub(re_new_key, original_key) - - elif re_decoder_block_proj_in.fullmatch(original_key): - regex_match = re_decoder_block_proj_in.match(original_key) - groups = regex_match.groups() - re_new_key = f"decoders.{groups[0]}.level_blocks.{groups[1]}.proj_in.{groups[-1]}" - key = re_decoder_block_proj_in.sub(re_new_key, original_key) - - # rename prior cond.model to upsampler.upsample_block and resnet - elif re_prior_cond_conv_out.fullmatch(original_key): - regex_match = re_prior_cond_conv_out.match(original_key) - groups = regex_match.groups() - block_index = int(groups[1]) * 2 + int(groups[2]) - 2 - re_new_key = f"conditioner_blocks.upsampler.upsample_block.{block_index}.{groups[-1]}" - key = re_prior_cond_conv_out.sub(re_new_key, original_key) - - elif re_prior_cond_resnet.fullmatch(original_key): - regex_match = re_prior_cond_resnet.match(original_key) - groups = regex_match.groups() - block_index = int(groups[1]) * 2 + int(groups[2]) - 2 - conv_index = {"1": 1, "3": 2}[groups[-2]] - prefix = f"conditioner_blocks.upsampler.upsample_block.{block_index}." - resnet_block = f"resnet_block.{groups[-3]}.conv1d_{conv_index}.{groups[-1]}" - re_new_key = prefix + resnet_block - key = re_prior_cond_resnet.sub(re_new_key, original_key) - - elif re_prior_cond_proj_in.fullmatch(original_key): - regex_match = re_prior_cond_proj_in.match(original_key) - groups = regex_match.groups() - re_new_key = f"conditioner_blocks.upsampler.proj_in.{groups[-1]}" - key = re_prior_cond_proj_in.sub(re_new_key, original_key) - - # keep original key - else: - key = original_key - - key = replace_key(key) - - if f"{key_prefix}.{key}" not in model_state_dict or key is None: - print(f"failed converting {original_key} to {key}, does not match") - - # handle mismatched shape - elif value.shape != model_state_dict[f"{key_prefix}.{key}"].shape: - val = model_state_dict[f"{key_prefix}.{key}"] - print(f"{original_key}-> {key} : \nshape {val.shape} and {value.shape}, do not match") - key = original_key - - mapping[key] = original_key - new_dict[key] = value - - return new_dict - - -@torch.no_grad() -def convert_openai_checkpoint(model_name=None, pytorch_dump_folder_path=None): - """ - Copy/paste/tweak model's weights to our Jukebox structure. - """ - for file in MODEL_MAPPING[model_name]: - if not os.path.isfile(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}"): - r = requests.get(f"{PREFIX}{file}", allow_redirects=True) - os.makedirs(f"{pytorch_dump_folder_path}/", exist_ok=True) - open(f"{pytorch_dump_folder_path}/{file.split('/')[-1]}", "wb").write(r.content) - - model_to_convert = MODEL_MAPPING[model_name.split("/")[-1]] - - config = JukeboxConfig.from_pretrained(model_name) - model = JukeboxModel(config) - - weight_dict = [] - mapping = {} - for i, dict_name in enumerate(model_to_convert): - old_dic = torch.load(f"{pytorch_dump_folder_path}/{dict_name.split('/')[-1]}", weights_only=True)["model"] - - new_dic = {} - for k in old_dic.keys(): - if k.endswith(".b"): - new_dic[k.replace("b", "bias")] = old_dic[k] - elif k.endswith(".w"): - new_dic[k.replace("w", "weight")] = old_dic[k] - elif "level_2" not in dict_name and "cond.model." in k: - new_dic[k.replace(".blocks.", ".model.")] = old_dic[k] - else: - new_dic[k] = old_dic[k] - - key_prefix = "vqvae" if i == 0 else f"priors.{3 - i}" - new_dic = fix_jukebox_keys(new_dic, model.state_dict(), key_prefix, mapping) - weight_dict.append(new_dic) - - vqvae_state_dict = weight_dict.pop(0) - model.vqvae.load_state_dict(vqvae_state_dict) - for i in range(len(weight_dict)): - model.priors[i].load_state_dict(weight_dict[2 - i]) - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - with open(f"{pytorch_dump_folder_path}/mapping.json", "w") as txtfile: - json.dump(mapping, txtfile) - - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - return weight_dict - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="jukebox-5b-lyrics", - type=str, - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="jukebox-5b-lyrics-converted", - type=str, - help="Path to the output PyTorch model directory.", - ) - args = parser.parse_args() - convert_openai_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index c6dbb12890..0000000000 --- a/src/transformers/models/deprecated/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,298 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Convert Mega pretrained checkpoint. Built to convert the Masked LM checkpoint located at -https://huggingface.co/mnaylor/mega-wikitext-103 - -Requirements: - - clone the Mega repo and install fairseq from there - 1. git clone https://github.com/facebookresearch/mega.git - 2. cd mega && pip install -e - - clone the pretrained weights for the original implementation from the hugging face repo - * use this location as the path for pretrained weights -""" - -import argparse - -# utilities to import the model weights and config file -import os -import pickle as pkl - -# PyTorch + new model classes -import torch -from torch import nn - -from transformers import AutoTokenizer, MegaConfig, MegaForMaskedLM - - -# import the EncoderLayer class used to pretrain -# !! NOTE !! this requires the version of fairseq that is built when you install the Mega source -try: - from fairseq.modules.mega_layer import MegaEncoderLayer -except ImportError: - raise ImportError("You need to install the version of fairseq from the Mega repo!") - - -# define the wrapper classes used to train the MLM (see colab notebook below) -# https://colab.research.google.com/drive/1qfUO6o5HRdxBblWlw058HVyvaEPhPpH8?usp=sharing -# MegaLM outputs hidden states -class MegaLM(nn.Module): - "The base class for our Mega encoder - given input IDs, embed text and return encoder output" - - def __init__(self, mega_args, depth, vocab_size): - super().__init__() - self.mega_args = mega_args - self.embedding_layer = nn.Embedding(vocab_size, self.mega_args.encoder_embed_dim) - self.encoders = nn.ModuleList([MegaEncoderLayer(self.mega_args) for _ in range(depth)]) - self.depth = depth - - def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0): - """ - Code for a forward pass - expects input_ids and attention_mask to come from a Hugging Face tokenizer as PyTorch - tensors, and returns a tensor of size (batch, n_classes) containing classification logits - - Other options: - - batch_first: boolean indicating whether the batch dimension is first in input_ids (default: True, which - aligns with the HF tokenizer behavior) - - ignore_mask_value: the value in attention_mask that identifies tokens that should be ignored (default: 0, - which aligns with HF tokenizer) - """ - - # Mega expects embeddings to be (time, batch, embedding size), but - # Hugging Face returns tokens as (batch, time) - if batch_first: - input_ids = input_ids.T - - # to make things more confusing, Mega expects the attention mask to - # be (batch, time), but with values of 0 (normal token) and 1 (ignore token) - # which is the opposite of what HF returns - if ignore_mask_value == 0: - attention_mask = 1 - attention_mask - - # get token embeddings from IDs - embeds = self.embedding_layer(input_ids) - - # pass through the Mega layers - # input is (time, batch, encoder dim) and output is the same - for encoder in self.encoders: - embeds = encoder(embeds, attention_mask) - - # return according to the shape specified - if batch_first: - # (T, B, H) --> (B, T, H) - return torch.transpose(embeds, 0, 1) - else: - return embeds - - -# renamed from MegaForMaskedLM to avoid confusion with new module -class OriginalMegaForMaskedLM(nn.Module): - "A wrapper class for doing masked language modeling with Mega" - - def __init__(self, mega_args, depth, vocab_size): - super().__init__() - self.mega = MegaLM(mega_args, depth, vocab_size) - self.mlm_head = nn.Linear(mega_args.encoder_embed_dim, vocab_size) - self.dropout = nn.Dropout(p=0.1) - - def forward(self, input_ids, attention_mask, batch_first=True, ignore_mask_value=0): - """ - Perform a forward pass through the Mega encoder and the masked LM head. Returns logits for each vocabulary - entry. - - If `batch_first` (default to align with Hugging Face tokenizer behavior), output will have the shape (Batch - size, Sequence length, Vocab size); otherwise (S, B, V) - """ - encoder_output = self.mega(input_ids, attention_mask, batch_first, ignore_mask_value) - return self.mlm_head(self.dropout(encoder_output)) - - -# code to convert the checkpoint located in the user-specified location -def convert_checkpoint_to_huggingface(pretrained_checkpoint_path, output_path, includes_tokenizer): - with open(os.path.join(pretrained_checkpoint_path, "model_args.pkl"), "rb") as f: - mega_original_args = pkl.load(f) - - # load the original encoder - original_mlm = OriginalMegaForMaskedLM(**mega_original_args).eval() - - # load its weights - print( - "Original Mega encoder:", - original_mlm.mega.load_state_dict( - torch.load( - os.path.join(pretrained_checkpoint_path, "encoder_weights.pt"), map_location="cpu", weights_only=True - ) - ), - ) - print( - "Original Mega MLM layer:", - original_mlm.mlm_head.load_state_dict( - torch.load( - os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu", weights_only=True - ) - ), - ) - - # create a new config from the old one - hf_config = MegaConfig( - num_hidden_layers=mega_original_args["depth"], - vocab_size=mega_original_args["vocab_size"], - hidden_size=mega_original_args["mega_args"].encoder_embed_dim, - shared_representation_size=mega_original_args["mega_args"].encoder_z_dim, - intermediate_size=mega_original_args["mega_args"].encoder_hidden_dim, - ema_projection_size=mega_original_args["mega_args"].encoder_n_dim, - dropout_prob=mega_original_args["mega_args"].dropout, - attention_probs_dropout_prob=mega_original_args["mega_args"].attention_dropout, - hidden_dropout_prob=mega_original_args["mega_args"].hidden_dropout, - activation=mega_original_args["mega_args"].activation_fn, - attention_activation=mega_original_args["mega_args"].attention_activation_fn, - bidirectional=mega_original_args["mega_args"].bidirectional, - use_chunking=mega_original_args["mega_args"].encoder_chunk_size > 0, - chunk_size=mega_original_args["mega_args"].encoder_chunk_size, - truncation=mega_original_args["mega_args"].truncation_length, - normalization_type=mega_original_args["mega_args"].normalization_type, - normalize_before_mega=True, - norm_affine=True, - use_feature_dropout=mega_original_args["mega_args"].feature_dropout, - relative_positional_bias=mega_original_args["mega_args"].rel_pos_bias, - max_positions=mega_original_args["mega_args"].max_source_positions, - nffn_hidden_size=mega_original_args["mega_args"].encoder_ffn_embed_dim, - normalize_before_ffn=mega_original_args["mega_args"].normalize_before, - # new arguments added for HF implementation - nffn_activation_dropout_prob=0.0, - add_token_type_embeddings=False, - add_lm_hidden_dense_layer=False, - ) - - hf_mlm = MegaForMaskedLM(hf_config).eval() - - # the originl checkpoint just uses nn.Embedding for the word embeddings - # we use a wrapper module for embeddings to add support for positional embeddings - hf_mlm.mega.embedding_layer.word_embeddings.weight = original_mlm.mega.embedding_layer.weight - - # modify the state dictionary of the original checkpoint to account for naming issues in the Hugging Face - # ecosystem -- any names containing "beta" or "gamma" aren't safe to use and are renamed upon _load_pretrained, - # also renaming previously confusing parameter names - original_state_dict = original_mlm.mega.encoders.state_dict() - updated_keys = {} - for module_name in original_state_dict.keys(): - new_module_name = None - # have to handle gamma, beta, and alpha differently due to their use - # in multiple modules within the original repository; - # beta is used in EMA, MovingAverageGatedAttention, and RotaryRelativePositionalBias, and must be renamed due to flax/tf weights - # the EMA sublayer was renamed from "move" to "ema_gate" for readability, so that is also done here - if "beta" in module_name: - # EMA sub-layers were always called "move" in the original repo - if "move.beta" in module_name: - new_module_name = module_name.replace("move.beta", "ema_gate.ema_expansion_matrix") - elif "mega_layer.beta" in module_name: - new_module_name = module_name.replace("beta", "qk_bias") - else: - new_module_name = module_name.replace("beta", "b_param") - # beta is used in EMA and MovingAverageGatedAttention, and must be renamed due to flax/tf weights - elif "gamma" in module_name: - if "move.gamma" in module_name: - new_module_name = module_name.replace("move.gamma", "ema_gate.kernel_projection_matrix") - elif "mega_layer.gamma" in module_name: - new_module_name = module_name.replace("gamma", "qk_weight") - else: - new_module_name = module_name.replace("gamma", "g_param") - # alpha is used in EMA and positional bias; renaming to improve readability - elif "move.alpha" in module_name: - new_module_name = module_name.replace("move.alpha", "ema_gate.decay_factor") - # delta is only used in EMA; renaming to improve readability - elif "move.delta" in module_name: - new_module_name = module_name.replace("move.delta", "ema_gate.damping_factor") - # omega is only used in EMA; renaming to improve readability - elif "omega" in module_name: - new_module_name = module_name.replace("move.omega", "ema_gate.residual_weight") - - if new_module_name: - updated_keys[module_name] = new_module_name - - if len(updated_keys) != 0: - print(f"Renaming these keys: {updated_keys.keys()}") - else: - print("No need to rename state dict entries") - for old, new in updated_keys.items(): - original_state_dict[new] = original_state_dict.pop(old) - - # now attempt to load the state dictionary with updated names - # note that we now call it `mega.layers` instead of `mega.encoders` due to hugging face style - print("HF Mega encoder:", hf_mlm.mega.layers.load_state_dict(original_state_dict)) - - # load the MLM head weights directly - print( - "HF Mega MLM layer:", - hf_mlm.mlm_head.load_state_dict( - torch.load( - os.path.join(pretrained_checkpoint_path, "mlm_head_weights.pt"), map_location="cpu", weights_only=True - ) - ), - ) - - # test on a randomly generated input sequence - input_ids = torch.randint(0, hf_config.vocab_size, size=(4, 256)) - input_mask = torch.ones_like(input_ids) - # mask a few tokens to make sure masking is applied appropriately :) - input_mask[:, -10:] = 0 - - # run forward passes - original_output = original_mlm(input_ids, input_mask, batch_first=True, ignore_mask_value=0) - hf_output = hf_mlm(input_ids, input_mask)[0] - - # print shapes and diff - print(f"original output {original_output.shape}") - print(f"hf output {hf_output.shape}") - print(f"max diff: {(original_output - hf_output).max()}") # 0.0 - success = torch.allclose(original_output, hf_output, atol=1e-3) - - if success: - print("Yay!") - hf_mlm.save_pretrained(output_path) - else: - raise RuntimeError(f"Something's broken :(\nOriginal:\n{original_output}\n\nHF\n{hf_output}\n{hf_mlm}") - - if includes_tokenizer: - print("Transferring tokenizer") - tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint_path) - tokenizer.save_pretrained(output_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--pretrained_checkpoint_path", - default=None, - type=str, - required=True, - help="Point to the directory containing your model weights using the official Mega repo", - ) - - parser.add_argument( - "--output_path", default=None, type=str, required=True, help="Location to save the Hugging Face version" - ) - - parser.add_argument( - "--includes_tokenizer", - action="store_true", - help="Use this flag if there is a Hugging Face tokenizer in the original checkpoint repo", - ) - - args = parser.parse_args() - - convert_checkpoint_to_huggingface(args.pretrained_checkpoint_path, args.output_path, args.includes_tokenizer) diff --git a/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index da7f780667..0000000000 --- a/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,70 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""TrajectoryTransformer pytorch checkpoint conversion""" - -import torch -import trajectory.utils as utils - -from transformers import TrajectoryTransformerModel - - -class Parser(utils.Parser): - dataset: str = "halfcheetah-medium-expert-v2" - config: str = "config.offline" - - -def convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch(logbase, dataset, loadpath, epoch, device): - """Converting Sequential blocks to ModuleList""" - - gpt, gpt_epoch = utils.load_model(logbase, dataset, loadpath, epoch=epoch, device=device) - trajectory_transformer = TrajectoryTransformerModel(gpt.config) - - trajectory_transformer.tok_emb.load_state_dict(gpt.tok_emb.state_dict()) - trajectory_transformer.pos_emb = gpt.pos_emb - trajectory_transformer.drop.load_state_dict(gpt.drop.state_dict()) - trajectory_transformer.ln_f.load_state_dict(gpt.ln_f.state_dict()) - trajectory_transformer.head.load_state_dict(gpt.head.state_dict()) - - for i, block in enumerate(gpt.blocks): - trajectory_transformer.blocks[i].ln1.load_state_dict(gpt.blocks[i].ln1.state_dict()) - trajectory_transformer.blocks[i].ln2.load_state_dict(gpt.blocks[i].ln2.state_dict()) - trajectory_transformer.blocks[i].attn.load_state_dict(gpt.blocks[i].attn.state_dict()) - - trajectory_transformer.blocks[i].l1.load_state_dict(gpt.blocks[i].mlp[0].state_dict()) - trajectory_transformer.blocks[i].act.load_state_dict(gpt.blocks[i].mlp[1].state_dict()) - trajectory_transformer.blocks[i].l2.load_state_dict(gpt.blocks[i].mlp[2].state_dict()) - trajectory_transformer.blocks[i].drop.load_state_dict(gpt.blocks[i].mlp[3].state_dict()) - - torch.save(trajectory_transformer.state_dict(), "pytorch_model.bin") - - -if __name__ == "__main__": - """ - To run this script you will need to install the original repository to run the original model. You can find it - here: https://github.com/jannerm/trajectory-transformer From this repository code you can also download the - original pytorch checkpoints. - - Run with the command: - - ```sh - >>> python convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py --dataset - ... --gpt_loadpath - ``` - """ - - args = Parser().parse_args("plan") - convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch( - args.logbase, args.dataset, args.gpt_loadpath, args.gpt_epoch, args.device - ) diff --git a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index 2c7b687c4d..0000000000 --- a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Transformer XL checkpoint and datasets.""" - -import argparse -import os -import pickle -import sys - -import torch - -from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl -from transformers.models.deprecated.transfo_xl import tokenization_transfo_xl as data_utils -from transformers.models.deprecated.transfo_xl.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - -# We do this to be able to load python 2 datasets pickles -# See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 -data_utils.Vocab = data_utils.TransfoXLTokenizer -data_utils.Corpus = data_utils.TransfoXLCorpus -sys.modules["data_utils"] = data_utils -sys.modules["vocabulary"] = data_utils - - -def convert_transfo_xl_checkpoint_to_pytorch( - tf_checkpoint_path, transfo_xl_config_file, pytorch_dump_folder_path, transfo_xl_dataset_file -): - if transfo_xl_dataset_file: - # Convert a pre-processed corpus (see original TensorFlow repo) - with open(transfo_xl_dataset_file, "rb") as fp: - corpus = pickle.load(fp, encoding="latin1") - # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) - pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES["pretrained_vocab_file"] - print(f"Save vocabulary to {pytorch_vocab_dump_path}") - corpus_vocab_dict = corpus.vocab.__dict__ - torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) - - corpus_dict_no_vocab = corpus.__dict__ - corpus_dict_no_vocab.pop("vocab", None) - pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME - print(f"Save dataset to {pytorch_dataset_dump_path}") - torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) - - if tf_checkpoint_path: - # Convert a pre-trained TensorFlow model - config_path = os.path.abspath(transfo_xl_config_file) - tf_path = os.path.abspath(tf_checkpoint_path) - - print(f"Converting Transformer XL checkpoint from {tf_path} with config at {config_path}.") - # Initialise PyTorch model - if transfo_xl_config_file == "": - config = TransfoXLConfig() - else: - config = TransfoXLConfig.from_json_file(transfo_xl_config_file) - print(f"Building PyTorch model from configuration: {config}") - model = TransfoXLLMHeadModel(config) - - model = load_tf_weights_in_transfo_xl(model, config, tf_path) - # Save pytorch-model - pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) - pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) - print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=True, - help="Path to the folder to store the PyTorch model or dataset/vocab.", - ) - parser.add_argument( - "--tf_checkpoint_path", - default="", - type=str, - help="An optional path to a TensorFlow checkpoint path to be converted.", - ) - parser.add_argument( - "--transfo_xl_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained BERT model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--transfo_xl_dataset_file", - default="", - type=str, - help="An optional dataset file to be converted in a vocabulary.\n" - "Given the files are in the pickle format, please be wary of passing it files you trust.", - ) - args = parser.parse_args() - convert_transfo_xl_checkpoint_to_pytorch( - args.tf_checkpoint_path, - args.transfo_xl_config_file, - args.pytorch_dump_folder_path, - args.transfo_xl_dataset_file, - ) diff --git a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py deleted file mode 100644 index b509d60d12..0000000000 --- a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py +++ /dev/null @@ -1,290 +0,0 @@ -# coding=utf-8 -# Copyright 2022 BNRist (Tsinghua University), TKLNDST (Nankai University) and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert VAN checkpoints from the original repository. - -URL: https://github.com/Visual-Attention-Network/VAN-Classification""" - -import argparse -import json -import sys -from dataclasses import dataclass, field -from functools import partial -from pathlib import Path -from typing import List, Optional - -import torch -import torch.nn as nn -from huggingface_hub import cached_download, hf_hub_download -from torch import Tensor - -from transformers import AutoImageProcessor, VanConfig, VanForImageClassification -from transformers.models.deprecated.van.modeling_van import VanLayerScaling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -@dataclass -class Tracker: - module: nn.Module - traced: List[nn.Module] = field(default_factory=list) - handles: list = field(default_factory=list) - - def _forward_hook(self, m, inputs: Tensor, outputs: Tensor): - has_not_submodules = len(list(m.modules())) == 1 or isinstance(m, nn.Conv2d) or isinstance(m, nn.BatchNorm2d) - if has_not_submodules: - if not isinstance(m, VanLayerScaling): - self.traced.append(m) - - def __call__(self, x: Tensor): - for m in self.module.modules(): - self.handles.append(m.register_forward_hook(self._forward_hook)) - self.module(x) - [x.remove() for x in self.handles] - return self - - @property - def parametrized(self): - # check the len of the state_dict keys to see if we have learnable params - return list(filter(lambda x: len(list(x.state_dict().keys())) > 0, self.traced)) - - -@dataclass -class ModuleTransfer: - src: nn.Module - dest: nn.Module - verbose: int = 0 - src_skip: List = field(default_factory=list) - dest_skip: List = field(default_factory=list) - - def __call__(self, x: Tensor): - """ - Transfer the weights of `self.src` to `self.dest` by performing a forward pass using `x` as input. Under the - hood we tracked all the operations in both modules. - """ - dest_traced = Tracker(self.dest)(x).parametrized - src_traced = Tracker(self.src)(x).parametrized - - src_traced = list(filter(lambda x: type(x) not in self.src_skip, src_traced)) - dest_traced = list(filter(lambda x: type(x) not in self.dest_skip, dest_traced)) - - if len(dest_traced) != len(src_traced): - raise Exception( - f"Numbers of operations are different. Source module has {len(src_traced)} operations while" - f" destination module has {len(dest_traced)}." - ) - - for dest_m, src_m in zip(dest_traced, src_traced): - dest_m.load_state_dict(src_m.state_dict()) - if self.verbose == 1: - print(f"Transferred from={src_m} to={dest_m}") - - -def copy_parameters(from_model: nn.Module, our_model: nn.Module) -> nn.Module: - # nn.Parameter cannot be tracked by the Tracker, thus we need to manually convert them - from_state_dict = from_model.state_dict() - our_state_dict = our_model.state_dict() - config = our_model.config - all_keys = [] - for stage_idx in range(len(config.hidden_sizes)): - for block_id in range(config.depths[stage_idx]): - from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_1" - to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.attention_scaling.weight" - - all_keys.append((from_key, to_key)) - from_key = f"block{stage_idx + 1}.{block_id}.layer_scale_2" - to_key = f"van.encoder.stages.{stage_idx}.layers.{block_id}.mlp_scaling.weight" - - all_keys.append((from_key, to_key)) - - for from_key, to_key in all_keys: - our_state_dict[to_key] = from_state_dict.pop(from_key) - - our_model.load_state_dict(our_state_dict) - return our_model - - -def convert_weight_and_push( - name: str, - config: VanConfig, - checkpoint: str, - from_model: nn.Module, - save_directory: Path, - push_to_hub: bool = True, -): - print(f"Downloading weights for {name}...") - checkpoint_path = cached_download(checkpoint) - print(f"Converting {name}...") - from_state_dict = torch.load(checkpoint_path, weights_only=True)["state_dict"] - from_model.load_state_dict(from_state_dict) - from_model.eval() - with torch.no_grad(): - our_model = VanForImageClassification(config).eval() - module_transfer = ModuleTransfer(src=from_model, dest=our_model) - x = torch.randn((1, 3, 224, 224)) - module_transfer(x) - our_model = copy_parameters(from_model, our_model) - - if not torch.allclose(from_model(x), our_model(x).logits): - raise ValueError("The model logits don't match the original one.") - - checkpoint_name = name - print(checkpoint_name) - - if push_to_hub: - our_model.push_to_hub( - repo_path_or_name=save_directory / checkpoint_name, - commit_message="Add model", - use_temp_dir=True, - ) - - # we can use the convnext one - image_processor = AutoImageProcessor.from_pretrained("facebook/convnext-base-224-22k-1k") - image_processor.push_to_hub( - repo_path_or_name=save_directory / checkpoint_name, - commit_message="Add image processor", - use_temp_dir=True, - ) - - print(f"Pushed {checkpoint_name}") - - -def convert_weights_and_push(save_directory: Path, model_name: Optional[str] = None, push_to_hub: bool = True): - filename = "imagenet-1k-id2label.json" - num_labels = 1000 - - repo_id = "huggingface/label-files" - num_labels = num_labels - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - id2label = id2label - label2id = {v: k for k, v in id2label.items()} - - ImageNetPreTrainedConfig = partial(VanConfig, num_labels=num_labels, id2label=id2label, label2id=label2id) - - names_to_config = { - "van-tiny": ImageNetPreTrainedConfig( - hidden_sizes=[32, 64, 160, 256], - depths=[3, 3, 5, 2], - mlp_ratios=[8, 8, 4, 4], - ), - "van-small": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[2, 2, 4, 2], - mlp_ratios=[8, 8, 4, 4], - ), - "van-base": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[3, 3, 12, 3], - mlp_ratios=[8, 8, 4, 4], - ), - "van-large": ImageNetPreTrainedConfig( - hidden_sizes=[64, 128, 320, 512], - depths=[3, 5, 27, 3], - mlp_ratios=[8, 8, 4, 4], - ), - } - - names_to_original_models = { - "van-tiny": van_tiny, - "van-small": van_small, - "van-base": van_base, - "van-large": van_large, - } - - names_to_original_checkpoints = { - "van-tiny": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Tiny-original/resolve/main/van_tiny_754.pth.tar" - ), - "van-small": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Small-original/resolve/main/van_small_811.pth.tar" - ), - "van-base": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Base-original/resolve/main/van_base_828.pth.tar" - ), - "van-large": ( - "https://huggingface.co/Visual-Attention-Network/VAN-Large-original/resolve/main/van_large_839.pth.tar" - ), - } - - if model_name: - convert_weight_and_push( - model_name, - names_to_config[model_name], - checkpoint=names_to_original_checkpoints[model_name], - from_model=names_to_original_models[model_name](), - save_directory=save_directory, - push_to_hub=push_to_hub, - ) - else: - for model_name, config in names_to_config.items(): - convert_weight_and_push( - model_name, - config, - checkpoint=names_to_original_checkpoints[model_name], - from_model=names_to_original_models[model_name](), - save_directory=save_directory, - push_to_hub=push_to_hub, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model-name", - default=None, - type=str, - help=( - "The name of the model you wish to convert, it must be one of the supported resnet* architecture," - " currently: van-tiny/small/base/large. If `None`, all of them will the converted." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=Path, - required=True, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--van_dir", - required=True, - type=Path, - help=( - "A path to VAN's original implementation directory. You can download from here:" - " https://github.com/Visual-Attention-Network/VAN-Classification" - ), - ) - parser.add_argument( - "--push_to_hub", - default=True, - type=bool, - required=False, - help="If True, push model and image processor to the hub.", - ) - - args = parser.parse_args() - pytorch_dump_folder_path: Path = args.pytorch_dump_folder_path - pytorch_dump_folder_path.mkdir(exist_ok=True, parents=True) - van_dir = args.van_dir - # append the path to the parents to maskformer dir - sys.path.append(str(van_dir.parent)) - from van.models.van import van_base, van_large, van_small, van_tiny - - convert_weights_and_push(pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py deleted file mode 100644 index 1d717d74c9..0000000000 --- a/src/transformers/models/deprecated/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ /dev/null @@ -1,282 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ViT hybrid checkpoints from the timm library.""" - -import argparse -import json -from pathlib import Path - -import requests -import timm -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from timm.data import resolve_data_config -from timm.data.transforms_factory import create_transform - -from transformers import ( - BitConfig, - ViTHybridConfig, - ViTHybridForImageClassification, - ViTHybridImageProcessor, - ViTHybridModel, -) -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, base_model=False): - rename_keys = [] - - # fmt: off - # stem: - rename_keys.append(("cls_token", "vit.embeddings.cls_token")) - rename_keys.append(("pos_embed", "vit.embeddings.position_embeddings")) - - rename_keys.append(("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias")) - - # backbone - rename_keys.append(("patch_embed.backbone.stem.conv.weight", "vit.embeddings.patch_embeddings.backbone.bit.embedder.convolution.weight")) - rename_keys.append(("patch_embed.backbone.stem.norm.weight", "vit.embeddings.patch_embeddings.backbone.bit.embedder.norm.weight")) - rename_keys.append(("patch_embed.backbone.stem.norm.bias", "vit.embeddings.patch_embeddings.backbone.bit.embedder.norm.bias")) - - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv1.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv1.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm1.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm1.bias")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv2.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv2.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm2.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm2.bias")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.conv3.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.conv3.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.{layer_idx}.norm3.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.{layer_idx}.norm3.bias")) - - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.conv.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.conv.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.weight", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.norm.weight")) - rename_keys.append((f"patch_embed.backbone.stages.{stage_idx}.blocks.0.downsample.norm.bias", f"vit.embeddings.patch_embeddings.backbone.bit.encoder.stages.{stage_idx}.layers.0.downsample.norm.bias")) - - # transformer encoder - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) - - if base_model: - # layernorm + pooler - rename_keys.extend( - [ - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ("pre_logits.fc.weight", "pooler.dense.weight"), - ("pre_logits.fc.bias", "pooler.dense.bias"), - ] - ) - - # if just the base model, we should remove "vit" from all keys that start with "vit" - rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "vit.layernorm.weight"), - ("norm.bias", "vit.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - # fmt: on - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, base_model=False): - for i in range(config.num_hidden_layers): - if base_model: - prefix = "" - else: - prefix = "vit." - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def remove_classification_head_(state_dict): - ignore_keys = ["head.weight", "head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our ViT structure. - """ - - # define default ViT hybrid configuration - backbone_config = BitConfig( - global_padding="same", - layer_type="bottleneck", - depths=(3, 4, 9), - out_features=["stage3"], - embedding_dynamic_padding=True, - ) - config = ViTHybridConfig(backbone_config=backbone_config, image_size=384, num_labels=1000) - base_model = False - - # load original model from timm - timm_model = timm.create_model(vit_name, pretrained=True) - timm_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = timm_model.state_dict() - if base_model: - remove_classification_head_(state_dict) - rename_keys = create_rename_keys(config, base_model) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, base_model) - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load HuggingFace model - if vit_name[-5:] == "in21k": - model = ViTHybridModel(config).eval() - else: - model = ViTHybridForImageClassification(config).eval() - model.load_state_dict(state_dict) - - # create image processor - transform = create_transform(**resolve_data_config({}, model=timm_model)) - timm_transforms = transform.transforms - - pillow_resamplings = { - "bilinear": PILImageResampling.BILINEAR, - "bicubic": PILImageResampling.BICUBIC, - "nearest": PILImageResampling.NEAREST, - } - - processor = ViTHybridImageProcessor( - do_resize=True, - size={"shortest_edge": timm_transforms[0].size}, - resample=pillow_resamplings[timm_transforms[0].interpolation.value], - do_center_crop=True, - crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, - do_normalize=True, - image_mean=timm_transforms[-1].mean.tolist(), - image_std=timm_transforms[-1].std.tolist(), - ) - - image = prepare_img() - timm_pixel_values = transform(image).unsqueeze(0) - pixel_values = processor(image, return_tensors="pt").pixel_values - - # verify pixel values - assert torch.allclose(timm_pixel_values, pixel_values) - - # verify logits - with torch.no_grad(): - outputs = model(pixel_values) - logits = outputs.logits - - print("Predicted class:", logits.argmax(-1).item()) - if base_model: - timm_pooled_output = timm_model.forward_features(pixel_values) - assert timm_pooled_output.shape == outputs.pooler_output.shape - assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3) - else: - timm_logits = timm_model(pixel_values) - assert timm_logits.shape == outputs.logits.shape - assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor to the hub {vit_name}") - model.push_to_hub(f"ybelkada/{vit_name}") - processor.push_to_hub(f"ybelkada/{vit_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--vit_name", - default="vit_base_r50_s16_384", - type=str, - help="Name of the hybrid ViT timm model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether to upload the model to the HuggingFace hub." - ) - - args = parser.parse_args() - convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py deleted file mode 100644 index f07a76b2b2..0000000000 --- a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py +++ /dev/null @@ -1,368 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Depth Anything checkpoints from the original repository. URL: -https://github.com/LiheYoung/Depth-Anything""" - -import argparse -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "small" in model_name: - out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 64 - neck_hidden_sizes = [48, 96, 192, 384] - elif "base" in model_name: - out_indices = [3, 6, 9, 12] if "v2" in model_name else [9, 10, 11, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 128 - neck_hidden_sizes = [96, 192, 384, 768] - elif "large" in model_name: - out_indices = [5, 12, 18, 24] if "v2" in model_name else [21, 22, 23, 24] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 256 - neck_hidden_sizes = [256, 512, 1024, 1024] - else: - raise NotImplementedError(f"Model not supported: {model_name}") - - if "metric" in model_name: - depth_estimation_type = "metric" - max_depth = 20 if "indoor" in model_name else 80 - else: - depth_estimation_type = "relative" - max_depth = None - - config = DepthAnythingConfig( - reassemble_hidden_size=backbone_config.hidden_size, - patch_size=backbone_config.patch_size, - backbone_config=backbone_config, - fusion_hidden_size=fusion_hidden_size, - neck_hidden_sizes=neck_hidden_sizes, - depth_estimation_type=depth_estimation_type, - max_depth=max_depth, - ) - - return config - - -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("pretrained.mask_token", "backbone.embeddings.mask_token")) - rename_keys.append(("pretrained.pos_embed", "backbone.embeddings.position_embeddings")) - rename_keys.append(("pretrained.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transformer encoder - for i in range(config.backbone_config.num_hidden_layers): - rename_keys.append((f"pretrained.blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"pretrained.blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) - rename_keys.append((f"pretrained.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"pretrained.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"pretrained.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"pretrained.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"pretrained.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias")) - rename_keys.append((f"pretrained.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - - # Head - rename_keys.append(("pretrained.norm.weight", "backbone.layernorm.weight")) - rename_keys.append(("pretrained.norm.bias", "backbone.layernorm.bias")) - - # activation postprocessing (readout projections + resize blocks) - # Depth Anything does not use CLS token => readout_projects not required - - for i in range(4): - rename_keys.append((f"depth_head.projects.{i}.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"depth_head.projects.{i}.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - if i != 2: - rename_keys.append((f"depth_head.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"depth_head.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"depth_head.scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"depth_head.scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - rename_keys.append(("depth_head.scratch.output_conv1.weight", "head.conv1.weight")) - rename_keys.append(("depth_head.scratch.output_conv1.bias", "head.conv1.bias")) - rename_keys.append(("depth_head.scratch.output_conv2.0.weight", "head.conv2.weight")) - rename_keys.append(("depth_head.scratch.output_conv2.0.bias", "head.conv2.bias")) - rename_keys.append(("depth_head.scratch.output_conv2.2.weight", "head.conv3.weight")) - rename_keys.append(("depth_head.scratch.output_conv2.2.bias", "head.conv3.bias")) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - hidden_size = config.backbone_config.hidden_size - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"pretrained.blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -name_to_checkpoint = { - "depth-anything-small": "pytorch_model.bin", - "depth-anything-base": "pytorch_model.bin", - "depth-anything-large": "pytorch_model.bin", - "depth-anything-v2-small": "depth_anything_v2_vits.pth", - "depth-anything-v2-base": "depth_anything_v2_vitb.pth", - "depth-anything-v2-large": "depth_anything_v2_vitl.pth", - "depth-anything-v2-metric-indoor-small": "depth_anything_v2_metric_hypersim_vits.pth", - "depth-anything-v2-metric-indoor-base": "depth_anything_v2_metric_hypersim_vitb.pth", - "depth-anything-v2-metric-indoor-large": "depth_anything_v2_metric_hypersim_vitl.pth", - "depth-anything-v2-metric-outdoor-small": "depth_anything_v2_metric_vkitti_vits.pth", - "depth-anything-v2-metric-outdoor-base": "depth_anything_v2_metric_vkitti_vitb.pth", - "depth-anything-v2-metric-outdoor-large": "depth_anything_v2_metric_vkitti_vitl.pth", - # v2-giant pending -} - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - # define DPT configuration - config = get_dpt_config(model_name) - - model_name_to_repo = { - "depth-anything-small": "LiheYoung/depth_anything_vits14", - "depth-anything-base": "LiheYoung/depth_anything_vitb14", - "depth-anything-large": "LiheYoung/depth_anything_vitl14", - "depth-anything-v2-small": "depth-anything/Depth-Anything-V2-Small", - "depth-anything-v2-base": "depth-anything/Depth-Anything-V2-Base", - "depth-anything-v2-large": "depth-anything/Depth-Anything-V2-Large", - "depth-anything-v2-metric-indoor-small": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Small", - "depth-anything-v2-metric-indoor-base": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Base", - "depth-anything-v2-metric-indoor-large": "depth-anything/Depth-Anything-V2-Metric-Hypersim-Large", - "depth-anything-v2-metric-outdoor-small": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Small", - "depth-anything-v2-metric-outdoor-base": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Base", - "depth-anything-v2-metric-outdoor-large": "depth-anything/Depth-Anything-V2-Metric-VKITTI-Large", - } - - # load original state_dict - repo_id = model_name_to_repo[model_name] - filename = name_to_checkpoint[model_name] - filepath = hf_hub_download( - repo_id=repo_id, - filename=f"{filename}", - ) - - state_dict = torch.load(filepath, map_location="cpu", weights_only=True) - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DepthAnythingForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - processor = DPTImageProcessor( - do_resize=True, - size={"height": 518, "width": 518}, - ensure_multiple_of=14, - keep_aspect_ratio=True, - do_rescale=True, - do_normalize=True, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - ) - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - pixel_values = processor(image, return_tensors="pt").pixel_values - - # Verify forward pass - with torch.no_grad(): - outputs = model(pixel_values) - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values:", predicted_depth[0, :3, :3]) - - # assert logits - if verify_logits: - expected_shape = torch.Size([1, 518, 686]) - if model_name == "depth-anything-small": - expected_slice = torch.tensor( - [[8.8204, 8.6468, 8.6195], [8.3313, 8.6027, 8.7526], [8.6526, 8.6866, 8.7453]], - ) - elif model_name == "depth-anything-base": - expected_slice = torch.tensor( - [[26.3997, 26.3004, 26.3928], [26.2260, 26.2092, 26.3427], [26.0719, 26.0483, 26.1254]], - ) - elif model_name == "depth-anything-large": - expected_slice = torch.tensor( - [[87.9968, 87.7493, 88.2704], [87.1927, 87.6611, 87.3640], [86.7789, 86.9469, 86.7991]] - ) - elif model_name == "depth-anything-v2-small": - expected_slice = torch.tensor( - [[2.6751, 2.6211, 2.6571], [2.5820, 2.6138, 2.6271], [2.6160, 2.6141, 2.6306]] - ) - elif model_name == "depth-anything-v2-base": - expected_slice = torch.tensor( - [[4.3576, 4.3723, 4.3908], [4.3231, 4.3146, 4.3611], [4.3016, 4.3170, 4.3121]] - ) - elif model_name == "depth-anything-v2-large": - expected_slice = torch.tensor( - [[162.2751, 161.8504, 162.8788], [160.3138, 160.8050, 161.9835], [159.3812, 159.9884, 160.0768]] - ) - elif model_name == "depth-anything-v2-metric-indoor-small": - expected_slice = torch.tensor( - [[1.3349, 1.2946, 1.2801], [1.2793, 1.2337, 1.2899], [1.2629, 1.2218, 1.2476]] - ) - elif model_name == "depth-anything-v2-metric-indoor-base": - expected_slice = torch.tensor( - [[1.4601, 1.3824, 1.4904], [1.5031, 1.4349, 1.4274], [1.4570, 1.4578, 1.4200]] - ) - elif model_name == "depth-anything-v2-metric-indoor-large": - expected_slice = torch.tensor( - [[1.5040, 1.5019, 1.5218], [1.5087, 1.5195, 1.5149], [1.5437, 1.5128, 1.5252]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-small": - expected_slice = torch.tensor( - [[9.5804, 8.0339, 7.7386], [7.9890, 7.2464, 7.7149], [7.7021, 7.2330, 7.3304]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-base": - expected_slice = torch.tensor( - [[10.2916, 9.0933, 8.8622], [9.1964, 9.3393, 9.0644], [8.9618, 9.4201, 9.2262]] - ) - elif model_name == "depth-anything-v2-metric-outdoor-large": - expected_slice = torch.tensor( - [[14.0137, 13.3627, 13.1080], [13.2522, 13.3943, 13.3705], [13.0581, 13.4505, 13.3925]] - ) - else: - raise ValueError("Not supported") - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"{model_name.title()}-hf") - processor.push_to_hub(repo_id=f"{model_name.title()}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="depth-anything-small", - type=str, - choices=name_to_checkpoint.keys(), - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - parser.add_argument( - "--verify_logits", - action="store_false", - required=False, - help="Whether to verify the logits after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py b/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py deleted file mode 100644 index 47cec7afac..0000000000 --- a/src/transformers/models/depth_anything/convert_distill_any_depth_to_hf.py +++ /dev/null @@ -1,246 +0,0 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Distill Any Depth checkpoints from the original repository. URL: -https://github.com/Westlake-AGI-Lab/Distill-Any-Depth""" - -import argparse -import re -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from safetensors.torch import load_file - -from transformers import DepthAnythingConfig, DepthAnythingForDepthEstimation, Dinov2Config, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - r"(backbone|pretrained)\.cls_token": r"backbone.embeddings.cls_token", - r"(backbone|pretrained)\.mask_token": r"backbone.embeddings.mask_token", - r"(backbone|pretrained)\.pos_embed": r"backbone.embeddings.position_embeddings", - r"(backbone|pretrained)\.patch_embed\.proj\.(weight|bias)": r"backbone.embeddings.patch_embeddings.projection.\2", - r"(backbone|pretrained)\.norm\.(weight|bias)": r"backbone.layernorm.\2", - r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.proj\.(weight|bias)": r"backbone.encoder.layer.\4.attention.output.dense.\5", - r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.ls(1|2)\.gamma": r"backbone.encoder.layer.\4.layer_scale\5.lambda1", - r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.mlp\.fc(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.mlp.fc\5.\6", - r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.norm(1|2)\.(weight|bias)": r"backbone.encoder.layer.\4.norm\5.\6", - r"depth_head\.projects\.(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.projection.\2", - r"depth_head\.resize_layers\.(?!2)(\d+)\.(weight|bias)": r"neck.reassemble_stage.layers.\1.resize.\2", - r"depth_head\.scratch\.layer(\d+)_rn\.weight": lambda m: f"neck.convs.{int(m[1]) - 1}.weight", - r"depth_head\.scratch\.output_conv(\d+)(?:\.(\d+))?\.(weight|bias)": lambda m: ( - f"head.conv{int(m[1]) + (int(m[2]) // 2 if m[2] else 0)}.{m[3]}" if m[1] == "2" else f"head.conv{m[1]}.{m[3]}" - ), - r"depth_head\.scratch\.refinenet(\d+)\.out_conv\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.projection.{m[2]}", - r"depth_head\.scratch\.refinenet(\d+)\.resConfUnit(\d+)\.conv(\d+)\.(weight|bias)": lambda m: f"neck.fusion_stage.layers.{3 - (int(m[1]) - 1)}.residual_layer{m[2]}.convolution{m[3]}.{m[4]}", -} - - -def get_dpt_config(model_name): - if "small" in model_name: - out_indices = [3, 6, 9, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 64 - neck_hidden_sizes = [48, 96, 192, 384] - elif "base" in model_name: - out_indices = [3, 6, 9, 12] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 128 - neck_hidden_sizes = [96, 192, 384, 768] - elif "large" in model_name: - out_indices = [5, 12, 18, 24] - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=out_indices, apply_layernorm=True, reshape_hidden_states=False - ) - fusion_hidden_size = 256 - neck_hidden_sizes = [256, 512, 1024, 1024] - else: - raise NotImplementedError(f"Model not supported: {model_name}") - - depth_estimation_type = "relative" - max_depth = None - - config = DepthAnythingConfig( - reassemble_hidden_size=backbone_config.hidden_size, - patch_size=backbone_config.patch_size, - backbone_config=backbone_config, - fusion_hidden_size=fusion_hidden_size, - neck_hidden_sizes=neck_hidden_sizes, - depth_estimation_type=depth_estimation_type, - max_depth=max_depth, - ) - - return config - - -def convert_key_pattern(key, mapping): - for pattern, replacement in mapping.items(): - match = re.fullmatch(pattern, key) - if match: - if callable(replacement): - return replacement(match) - return re.sub(pattern, replacement, key) - return None - - -def convert_keys(state_dict, config): - new_state_dict = {} - qkv_pattern = r"(backbone|pretrained)(\.blocks(\.\d+)?)?\.(\d+)\.attn\.qkv\.(weight|bias)" - qkv_keys = [k for k in list(state_dict.keys()) if re.match(qkv_pattern, k)] - for old_key in qkv_keys: - value = state_dict.pop(old_key) - match = re.match(qkv_pattern, old_key) - _, _, _, layer, attr = match.groups() - hidden_size = config.backbone_config.hidden_size - q = value[:hidden_size] - k = value[hidden_size : hidden_size * 2] - v = value[-hidden_size:] - - for proj, tensor in zip(["query", "key", "value"], [q, k, v]): - new_key = f"backbone.encoder.layer.{layer}.attention.attention.{proj}.{attr}" - new_state_dict[new_key] = tensor - - for old_key in list(state_dict.keys()): - value = state_dict.pop(old_key) - new_key = convert_key_pattern(old_key, ORIGINAL_TO_CONVERTED_KEY_MAPPING) - - new_state_dict[new_key] = value - - return new_state_dict - - -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - return Image.open(requests.get(url, stream=True).raw) - - -name_to_checkpoint = { - "distill-any-depth-small": "small/model.safetensors", - "distill-any-depth-base": "base/model.safetensors", - "distill-any-depth-large": "large/model.safetensors", -} - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): - config = get_dpt_config(model_name) - - repo_id = "xingyang1/Distill-Any-Depth" - filepath = hf_hub_download(repo_id=repo_id, filename=name_to_checkpoint[model_name]) - state_dict = load_file(filepath) - - converted_state_dict = convert_keys(state_dict, config) - - model = DepthAnythingForDepthEstimation(config) - model.load_state_dict(converted_state_dict) - model.eval() - - processor = DPTImageProcessor( - do_resize=True, - size={"height": 518, "width": 518}, - ensure_multiple_of=14, - keep_aspect_ratio=True, - do_rescale=True, - do_normalize=True, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.229, 0.224, 0.225], - ) - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - pixel_values = processor(image, return_tensors="pt").pixel_values - - with torch.no_grad(): - outputs = model(pixel_values) - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values:", predicted_depth[0, :3, :3]) - - if verify_logits: - print("Verifying logits...") - expected_shape = torch.Size([1, 518, 686]) - - if model_name == "distill-any-depth-small": - expected_slice = torch.tensor( - [[2.5653, 2.5249, 2.5570], [2.4897, 2.5235, 2.5355], [2.5255, 2.5261, 2.5422]] - ) - elif model_name == "distill-any-depth-base": - expected_slice = torch.tensor( - [[4.8976, 4.9075, 4.9403], [4.8872, 4.8906, 4.9448], [4.8712, 4.8898, 4.8838]] - ) - elif model_name == "distill-any-depth-large": - expected_slice = torch.tensor( - [[55.1067, 51.1828, 51.6803], [51.9098, 50.7529, 51.4494], [50.1745, 50.5491, 50.8818]] - ) - else: - raise ValueError("Not supported") - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"{model_name.title()}-hf") - processor.push_to_hub(repo_id=f"{model_name.title()}-hf") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name", - default="distill-any-depth-small", - type=str, - choices=name_to_checkpoint.keys(), - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - parser.add_argument( - "--verify_logits", - action="store_true", - required=False, - help="Whether to verify the logits after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py deleted file mode 100644 index 655bbdc023..0000000000 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import gc -import os -from typing import Optional - -import regex as re -import torch -from huggingface_hub import hf_hub_download - -from transformers import ( - DepthProConfig, - DepthProForDepthEstimation, - DepthProImageProcessorFast, -) - - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - - # encoder - r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.model.embeddings.cls_token", - r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.model.embeddings.position_embeddings", - r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.embeddings.patch_embeddings.projection.\2", - r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.norm\3.\4", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.attention.(query|key|value).\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.output.dense.\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.layer_scale\3.lambda1", - r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.mlp.fc\3.\4", - r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.layernorm.\2", - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.neck.fuse_image_with_low_res.\1", - - # fov - r"fov.encoder.0.cls_token": r"fov_model.fov_encoder.model.embeddings.cls_token", - r"fov.encoder.0.pos_embed": r"fov_model.fov_encoder.model.embeddings.position_embeddings", - r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.fov_encoder.model.embeddings.patch_embeddings.projection.\1", - r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.norm\2.\3", - r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.attention.(query|key|value).\2", - r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.output.dense.\2", - r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.fov_encoder.model.encoder.layer.\1.layer_scale\2.lambda1", - r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.mlp.fc\2.\3", - r"fov.encoder.0.norm.(weight|bias)": r"fov_model.fov_encoder.model.layernorm.\1", - r"fov.downsample.0.(weight|bias)": r"fov_model.conv.\1", - r"fov.encoder.1.(weight|bias)": r"fov_model.fov_encoder.neck.\1", - r"fov.head.(\d+).(weight|bias)": r"fov_model.head.layers.\1.\2", - - # head - r"head.(\d+).(weight|bias)": r"head.layers.\1.\2", - - # upsamples - r"encoder.upsample_lowres.(weight|bias)": r"depth_pro.neck.feature_upsample.image_block.layers.0.\1", - r"encoder.upsample_latent(\d+).(\d+).(weight|bias)": lambda match: ( - f"depth_pro.neck.feature_upsample.intermediate.{1-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" - ), - r"encoder.upsample(\d+).(\d+).(weight|bias)": lambda match: ( - f"depth_pro.neck.feature_upsample.scaled_images.{2-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" - ), - - # projections between encoder and fusion - r"decoder.convs.(\d+).weight": lambda match: ( - f"depth_pro.neck.feature_projection.projections.{4-int(match.group(1))}.weight" - ), - - # fusion stage - r"decoder.fusions.([1234]).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( - f"fusion_stage.intermediate.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" - ), - r"decoder.fusions.0.resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( - f"fusion_stage.final.residual_layer{match.group(1)}.convolution{(int(match.group(2))+1)//2}.{match.group(3)}" - ), - r"decoder.fusions.([1234]).out_conv.(weight|bias)": lambda match: ( - f"fusion_stage.intermediate.{4-int(match.group(1))}.projection.{match.group(2)}" - ), - r"decoder.fusions.0.out_conv.(weight|bias)": lambda match: ( - f"fusion_stage.final.projection.{match.group(1)}" - ), - r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( - f"fusion_stage.intermediate.{4-int(match.group(1))}.deconv.{match.group(2)}" - ), -} -# fmt: on - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # an empty line - continue - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -def get_qkv_state_dict(key, parameter): - """ - new key which looks like this - xxxx.(q|k|v).xxx (m, n) - - is converted to - xxxx.q.xxxx (m//3, n) - xxxx.k.xxxx (m//3, n) - xxxx.v.xxxx (m//3, n) - """ - qkv_state_dict = {} - placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)" - replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] - replacements_vals = torch.split( - parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0 - ) - for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): - qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val - return qkv_state_dict - - -def write_model( - hf_repo_id: str, - output_dir: str, - safe_serialization: bool = True, -): - os.makedirs(output_dir, exist_ok=True) - - # ------------------------------------------------------------ - # Create and save config - # ------------------------------------------------------------ - - # create config - backbone_config = { - "model_type": "dinov2", - "num_hidden_layers": 24, - "patch_size": 16, - "hidden_size": 1024, - "num_attention_heads": 16, - "image_size": 384, - "use_mask_token": False, - } - config = DepthProConfig( - # original implementation uses same config for all 3 models - image_model_config=backbone_config, - patch_model_config=backbone_config, - fov_model_config=backbone_config, - use_fov_model=True, - ) - - # save config - config.save_pretrained(output_dir) - print("Model config saved successfully...") - - # ------------------------------------------------------------ - # Convert weights - # ------------------------------------------------------------ - - # download and load state_dict from hf repo - file_path = hf_hub_download(hf_repo_id, "depth_pro.pt") - loaded = torch.load(file_path, weights_only=True) - - print("Converting model...") - all_keys = list(loaded.keys()) - new_keys = convert_old_keys_to_new_keys(all_keys) - - state_dict = {} - for key in all_keys: - new_key = new_keys[key] - current_parameter = loaded.pop(key) - - if "qkv" in key: - qkv_state_dict = get_qkv_state_dict(new_key, current_parameter) - state_dict.update(qkv_state_dict) - else: - state_dict[new_key] = current_parameter - - print("Loading the checkpoint in a DepthPro model.") - model = DepthProForDepthEstimation(config) - model.load_state_dict(state_dict, strict=True, assign=True) - print("Checkpoint loaded successfully.") - - print("Saving the model.") - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - del state_dict, model - - # Safety check: reload the converted model - gc.collect() - print("Reloading the model to check if it's saved correctly.") - model = DepthProForDepthEstimation.from_pretrained(output_dir, device_map="auto") - print("Model reloaded successfully.") - return model - - -def write_image_processor(output_dir: str): - image_processor = DepthProImageProcessorFast() - image_processor.save_pretrained(output_dir) - return image_processor - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--hf_repo_id", - default="apple/DepthPro", - help="Location of official weights from apple on HF", - ) - parser.add_argument( - "--output_dir", - default="apple_DepthPro", - help="Location to write the converted model and processor", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." - ) - parser.add_argument( - "--push_to_hub", - action=argparse.BooleanOptionalAction, - help="Whether or not to push the converted model to the huggingface hub.", - ) - parser.add_argument( - "--hub_repo_id", - default="apple/DepthPro-hf", - help="Huggingface hub repo to write the converted model and processor", - ) - args = parser.parse_args() - - model = write_model( - hf_repo_id=args.hf_repo_id, - output_dir=args.output_dir, - safe_serialization=args.safe_serialization, - ) - - image_processor = write_image_processor( - output_dir=args.output_dir, - ) - - if args.push_to_hub: - print("Pushing to hub...") - model.push_to_hub(args.hub_repo_id) - image_processor.push_to_hub(args.hub_repo_id) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index ba98514501..0000000000 --- a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,277 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETR checkpoints with timm backbone.""" - -import argparse -import json -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# here we list all keys to be renamed (original name on the left, our name on the right) -rename_keys = [] -for i in range(6): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", f"encoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", f"decoder.layers.{i}.self_attn.out_proj.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - -# convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads -rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ] -) - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def rename_backbone_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if "backbone.0.body" in key: - new_key = key.replace("backbone.0.body", "backbone.conv_encoder.model") - new_state_dict[new_key] = value - else: - new_state_dict[key] = value - - return new_state_dict - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "detr." - - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # next: transformer decoder (which is a bit more complex because it also includes cross-attention) - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # read in weights + bias of input projection layer of cross-attention - in_proj_weight_cross_attn = state_dict.pop( - f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" - ) - in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) of cross-attention to the state dict - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_detr_checkpoint(model_name, pytorch_dump_folder_path): - """ - Copy/paste/tweak model's weights to our DETR structure. - """ - - # load default config - config = DetrConfig() - # set backbone and dilation attributes - if "resnet101" in model_name: - config.backbone = "resnet101" - if "dc5" in model_name: - config.dilation = True - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load image processor - format = "coco_panoptic" if is_panoptic else "coco_detection" - image_processor = DetrImageProcessor(format=format) - - # prepare image - img = prepare_img() - encoding = image_processor(images=img, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - logger.info(f"Converting model {model_name}...") - - # load original model from torch hub - detr = torch.hub.load("facebookresearch/detr", model_name, pretrained=True).eval() - state_dict = detr.state_dict() - # rename keys - for src, dest in rename_keys: - if is_panoptic: - src = "detr." + src - rename_key(state_dict, src, dest) - state_dict = rename_backbone_keys(state_dict) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "detr.model." if is_panoptic else "model." - for key in state_dict.copy().keys(): - if is_panoptic: - if ( - key.startswith("detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["detr." + key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - # finally, create HuggingFace model and load state dict - model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - # verify our conversion - original_outputs = detr(pixel_values) - outputs = model(pixel_values) - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-4) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-4) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", default="detr_resnet50", type=str, help="Name of the DETR model you'd like to convert." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - args = parser.parse_args() - convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/detr/convert_detr_to_pytorch.py b/src/transformers/models/detr/convert_detr_to_pytorch.py deleted file mode 100644 index 6ba6a0e292..0000000000 --- a/src/transformers/models/detr/convert_detr_to_pytorch.py +++ /dev/null @@ -1,385 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DETR checkpoints with native (Transformers) backbone.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor, ResNetConfig -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_detr_config(model_name): - # initialize config - if "resnet-50" in model_name: - backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50") - elif "resnet-101" in model_name: - backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-101") - else: - raise ValueError("Model name should include either resnet50 or resnet101") - - config = DetrConfig(use_timm_backbone=False, backbone_config=backbone_config) - - # set label attributes - is_panoptic = "panoptic" in model_name - if is_panoptic: - config.num_labels = 250 - else: - config.num_labels = 91 - repo_id = "huggingface/label-files" - filename = "coco-detection-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - return config, is_panoptic - - -def create_rename_keys(config): - # here we list all keys to be renamed (original name on the left, our name on the right) - rename_keys = [] - - # stem - # fmt: off - rename_keys.append(("backbone.0.body.conv1.weight", "backbone.conv_encoder.model.embedder.embedder.convolution.weight")) - rename_keys.append(("backbone.0.body.bn1.weight", "backbone.conv_encoder.model.embedder.embedder.normalization.weight")) - rename_keys.append(("backbone.0.body.bn1.bias", "backbone.conv_encoder.model.embedder.embedder.normalization.bias")) - rename_keys.append(("backbone.0.body.bn1.running_mean", "backbone.conv_encoder.model.embedder.embedder.normalization.running_mean")) - rename_keys.append(("backbone.0.body.bn1.running_var", "backbone.conv_encoder.model.embedder.embedder.normalization.running_var")) - # stages - for stage_idx in range(len(config.backbone_config.depths)): - for layer_idx in range(config.backbone_config.depths[stage_idx]): - # shortcut - if layer_idx == 0: - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.0.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.bias", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_mean", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.downsample.1.running_var", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.shortcut.normalization.running_var", - ) - ) - # 3 convs - for i in range(3): - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.conv{i+1}.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.convolution.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.weight", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.weight", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.bias", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.bias", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_mean", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_mean", - ) - ) - rename_keys.append( - ( - f"backbone.0.body.layer{stage_idx + 1}.{layer_idx}.bn{i+1}.running_var", - f"backbone.conv_encoder.model.encoder.stages.{stage_idx}.layers.{layer_idx}.layer.{i}.normalization.running_var", - ) - ) - # fmt: on - - for i in range(config.encoder_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append( - ( - f"transformer.encoder.layers.{i}.self_attn.out_proj.weight", - f"encoder.layers.{i}.self_attn.out_proj.weight", - ) - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.self_attn.out_proj.bias", f"encoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.weight", f"encoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear1.bias", f"encoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.weight", f"encoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.encoder.layers.{i}.linear2.bias", f"encoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.weight", f"encoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm1.bias", f"encoder.layers.{i}.self_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.encoder.layers.{i}.norm2.weight", f"encoder.layers.{i}.final_layer_norm.weight") - ) - rename_keys.append((f"transformer.encoder.layers.{i}.norm2.bias", f"encoder.layers.{i}.final_layer_norm.bias")) - # decoder layers: 2 times output projection, 2 feedforward neural networks and 3 layernorms - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.self_attn.out_proj.weight", - f"decoder.layers.{i}.self_attn.out_proj.weight", - ) - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.self_attn.out_proj.bias", f"decoder.layers.{i}.self_attn.out_proj.bias") - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.weight", - f"decoder.layers.{i}.encoder_attn.out_proj.weight", - ) - ) - rename_keys.append( - ( - f"transformer.decoder.layers.{i}.multihead_attn.out_proj.bias", - f"decoder.layers.{i}.encoder_attn.out_proj.bias", - ) - ) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.weight", f"decoder.layers.{i}.fc1.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear1.bias", f"decoder.layers.{i}.fc1.bias")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.weight", f"decoder.layers.{i}.fc2.weight")) - rename_keys.append((f"transformer.decoder.layers.{i}.linear2.bias", f"decoder.layers.{i}.fc2.bias")) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.weight", f"decoder.layers.{i}.self_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm1.bias", f"decoder.layers.{i}.self_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.weight", f"decoder.layers.{i}.encoder_attn_layer_norm.weight") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm2.bias", f"decoder.layers.{i}.encoder_attn_layer_norm.bias") - ) - rename_keys.append( - (f"transformer.decoder.layers.{i}.norm3.weight", f"decoder.layers.{i}.final_layer_norm.weight") - ) - rename_keys.append((f"transformer.decoder.layers.{i}.norm3.bias", f"decoder.layers.{i}.final_layer_norm.bias")) - - # convolutional projection + query embeddings + layernorm of decoder + class and bounding box heads - rename_keys.extend( - [ - ("input_proj.weight", "input_projection.weight"), - ("input_proj.bias", "input_projection.bias"), - ("query_embed.weight", "query_position_embeddings.weight"), - ("transformer.decoder.norm.weight", "decoder.layernorm.weight"), - ("transformer.decoder.norm.bias", "decoder.layernorm.bias"), - ("class_embed.weight", "class_labels_classifier.weight"), - ("class_embed.bias", "class_labels_classifier.bias"), - ("bbox_embed.layers.0.weight", "bbox_predictor.layers.0.weight"), - ("bbox_embed.layers.0.bias", "bbox_predictor.layers.0.bias"), - ("bbox_embed.layers.1.weight", "bbox_predictor.layers.1.weight"), - ("bbox_embed.layers.1.bias", "bbox_predictor.layers.1.bias"), - ("bbox_embed.layers.2.weight", "bbox_predictor.layers.2.weight"), - ("bbox_embed.layers.2.bias", "bbox_predictor.layers.2.bias"), - ] - ) - - return rename_keys - - -def rename_key(state_dict, old, new): - val = state_dict.pop(old) - state_dict[new] = val - - -def read_in_q_k_v(state_dict, is_panoptic=False): - prefix = "" - if is_panoptic: - prefix = "detr." - - # first: transformer encoder - for i in range(6): - # read in weights + bias of input projection layer (in PyTorch's MultiHeadAttention, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.encoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # next: transformer decoder (which is a bit more complex because it also includes cross-attention) - for i in range(6): - # read in weights + bias of input projection layer of self-attention - in_proj_weight = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"decoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :] - state_dict[f"decoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :] - state_dict[f"decoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :] - state_dict[f"decoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:] - # read in weights + bias of input projection layer of cross-attention - in_proj_weight_cross_attn = state_dict.pop( - f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_weight" - ) - in_proj_bias_cross_attn = state_dict.pop(f"{prefix}transformer.decoder.layers.{i}.multihead_attn.in_proj_bias") - # next, add query, keys and values (in that order) of cross-attention to the state dict - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :] - state_dict[f"decoder.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :] - state_dict[f"decoder.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :] - state_dict[f"decoder.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - - return im - - -@torch.no_grad() -def convert_detr_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DETR structure. - """ - - # load default config - config, is_panoptic = get_detr_config(model_name) - - # load original model from torch hub - model_name_to_original_name = { - "detr-resnet-50": "detr_resnet50", - "detr-resnet-101": "detr_resnet101", - } - logger.info(f"Converting model {model_name}...") - detr = torch.hub.load("facebookresearch/detr", model_name_to_original_name[model_name], pretrained=True).eval() - state_dict = detr.state_dict() - # rename keys - for src, dest in create_rename_keys(config): - if is_panoptic: - src = "detr." + src - rename_key(state_dict, src, dest) - # query, key and value matrices need special treatment - read_in_q_k_v(state_dict, is_panoptic=is_panoptic) - # important: we need to prepend a prefix to each of the base model keys as the head models use different attributes for them - prefix = "detr.model." if is_panoptic else "model." - for key in state_dict.copy().keys(): - if is_panoptic: - if ( - key.startswith("detr") - and not key.startswith("class_labels_classifier") - and not key.startswith("bbox_predictor") - ): - val = state_dict.pop(key) - state_dict["detr.model" + key[4:]] = val - elif "class_labels_classifier" in key or "bbox_predictor" in key: - val = state_dict.pop(key) - state_dict["detr." + key] = val - elif key.startswith("bbox_attention") or key.startswith("mask_head"): - continue - else: - val = state_dict.pop(key) - state_dict[prefix + key] = val - else: - if not key.startswith("class_labels_classifier") and not key.startswith("bbox_predictor"): - val = state_dict.pop(key) - state_dict[prefix + key] = val - - # finally, create HuggingFace model and load state dict - model = DetrForSegmentation(config) if is_panoptic else DetrForObjectDetection(config) - model.load_state_dict(state_dict) - model.eval() - - # verify our conversion on an image - format = "coco_panoptic" if is_panoptic else "coco_detection" - processor = DetrImageProcessor(format=format) - - encoding = processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - - original_outputs = detr(pixel_values) - outputs = model(pixel_values) - - assert torch.allclose(outputs.logits, original_outputs["pred_logits"], atol=1e-3) - assert torch.allclose(outputs.pred_boxes, original_outputs["pred_boxes"], atol=1e-3) - if is_panoptic: - assert torch.allclose(outputs.pred_masks, original_outputs["pred_masks"], atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - # Save model and image processor - logger.info(f"Saving PyTorch model and image processor to {pytorch_dump_folder_path}...") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Upload model and image processor to the hub - logger.info("Uploading PyTorch model and image processor to the hub...") - model.push_to_hub(f"nielsr/{model_name}") - processor.push_to_hub(f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--model_name", - default="detr-resnet-50", - type=str, - choices=["detr-resnet-50", "detr-resnet-101"], - help="Name of the DETR model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to the hub or not.") - args = parser.parse_args() - convert_detr_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 03f38084cf..0000000000 --- a/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch - -from transformers.utils import WEIGHTS_NAME - - -DIALOGPT_MODELS = ["small", "medium", "large"] - -OLD_KEY = "lm_head.decoder.weight" -NEW_KEY = "lm_head.weight" - - -def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str): - d = torch.load(checkpoint_path, weights_only=True) - d[NEW_KEY] = d.pop(OLD_KEY) - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--dialogpt_path", default=".", type=str) - args = parser.parse_args() - for MODEL in DIALOGPT_MODELS: - checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl") - pytorch_dump_folder_path = f"./DialoGPT-{MODEL}" - convert_dialogpt_checkpoint( - checkpoint_path, - pytorch_dump_folder_path, - ) diff --git a/src/transformers/models/dinov2/convert_dinov2_to_hf.py b/src/transformers/models/dinov2/convert_dinov2_to_hf.py deleted file mode 100644 index d716191b2f..0000000000 --- a/src/transformers/models/dinov2/convert_dinov2_to_hf.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 checkpoints from the original repository. - -URL: https://github.com/facebookresearch/dinov2/tree/main -""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -import torch.nn as nn -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dinov2_config(model_name, image_classifier=False): - config = Dinov2Config(image_size=518, patch_size=14) - - # size of the architecture - if "vits" in model_name: - config.hidden_size = 384 - config.num_attention_heads = 6 - elif "vitb" in model_name: - pass - elif "vitl" in model_name: - config.hidden_size = 1024 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif "vitg" in model_name: - config.use_swiglu_ffn = True - config.hidden_size = 1536 - config.num_hidden_layers = 40 - config.num_attention_heads = 24 - else: - raise ValueError("Model not supported") - - if image_classifier: - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - config.id2label = {int(k): v for k, v in config.id2label.items()} - - return config - - -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("mask_token", "embeddings.mask_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -@torch.no_grad() -def convert_dinov2_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DINOv2 structure. - """ - - # define default Dinov2 configuration - image_classifier = "1layer" in model_name - config = get_dinov2_config(model_name, image_classifier=image_classifier) - - # load original model from torch hub - original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", "")) - original_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config) - - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2ForImageClassification(config).eval() - model.dinov2.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", - "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", - "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", - "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2Model(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. - ), - ] - ) - - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension - - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - - assert torch.allclose(original_pixel_values, pixel_values) - - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14": "dinov2-small", - "dinov2_vitb14": "dinov2-base", - "dinov2_vitl14": "dinov2-large", - "dinov2_vitg14": "dinov2-giant", - "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", - "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", - "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", - "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"facebook/{name}") - processor.push_to_hub(f"facebook/{name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dinov2_vitb14", - type=str, - choices=[ - "dinov2_vits14", - "dinov2_vitb14", - "dinov2_vitl14", - "dinov2_vitg14", - "dinov2_vits14_1layer", - "dinov2_vitb14_1layer", - "dinov2_vitl14_1layer", - "dinov2_vitg14_1layer", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_dinov2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py b/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py deleted file mode 100644 index 0ff2697f74..0000000000 --- a/src/transformers/models/dinov2_with_registers/convert_dinov2_with_registers_to_hf.py +++ /dev/null @@ -1,291 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 with Registers checkpoints from the original repository. - -URL: https://github.com/facebookresearch/dinov2/tree/main -""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -import torch.nn as nn -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import ( - BitImageProcessor, - Dinov2WithRegistersConfig, - Dinov2WithRegistersForImageClassification, - Dinov2WithRegistersModel, -) -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dinov2_with_registers_config(model_name, image_classifier=False): - config = Dinov2WithRegistersConfig(image_size=518, patch_size=14) - - # size of the architecture - if "vits" in model_name: - config.hidden_size = 384 - config.num_attention_heads = 6 - elif "vitb" in model_name: - pass - elif "vitl" in model_name: - config.hidden_size = 1024 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif "vitg" in model_name: - config.use_swiglu_ffn = True - config.hidden_size = 1536 - config.num_hidden_layers = 40 - config.num_attention_heads = 24 - else: - raise ValueError("Model not supported") - - if image_classifier: - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - config.id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - config.id2label = {int(k): v for k, v in config.id2label.items()} - - return config - - -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("mask_token", "embeddings.mask_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("register_tokens", "embeddings.register_tokens")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -@torch.no_grad() -def convert_dinov2_with_registers_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our Dinov2WithRegisters structure. - """ - - # define default Dinov2WithRegisters configuration - image_classifier = "1layer" in model_name - config = get_dinov2_with_registers_config(model_name, image_classifier=image_classifier) - - # load original model from torch hub - original_model = torch.hub.load("facebookresearch/dinov2", model_name.replace("_1layer", "")) - original_model.eval() - - # load state_dict of original model, remove and rename some keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config) - - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2WithRegistersForImageClassification(config).eval() - model.dinov2_with_registers.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_reg4_linear_head.pth", - "dinov2_vitb14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_reg4_linear_head.pth", - "dinov2_vitl14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_reg4_linear_head.pth", - "dinov2_vitg14_reg_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2WithRegistersModel(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. - ), - ] - ) - - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension - - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - - assert torch.allclose(original_pixel_values, pixel_values) - - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14_reg": "dinov2-with-registers-small", - "dinov2_vitb14_reg": "dinov2-with-registers-base", - "dinov2_vitl14_reg": "dinov2-with-registers-large", - "dinov2_vitg14_reg": "dinov2-with-registers-giant", - "dinov2_vits14_reg_1layer": "dinov2-with-registers-small-imagenet1k-1-layer", - "dinov2_vitb14_reg_1layer": "dinov2-with-registers-base-imagenet1k-1-layer", - "dinov2_vitl14_reg_1layer": "dinov2-with-registers-large-imagenet1k-1-layer", - "dinov2_vitg14_reg_1layer": "dinov2-with-registers-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"nielsr/{name}") - processor.push_to_hub(f"nielsr/{name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dinov2_vits14_reg", - type=str, - choices=[ - "dinov2_vits14_reg", - "dinov2_vitb14_reg", - "dinov2_vitl14_reg", - "dinov2_vitg14_reg", - "dinov2_vits14_reg_1layer", - "dinov2_vitb14_reg_1layer", - "dinov2_vitl14_reg_1layer", - "dinov2_vitg14_reg_1layer", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - - args = parser.parse_args() - convert_dinov2_with_registers_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py b/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py deleted file mode 100644 index 40c5b22e3b..0000000000 --- a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py +++ /dev/null @@ -1,230 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DiT checkpoints from the unilm repository.""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import BeitConfig, BeitForImageClassification, BeitForMaskedImageModeling, BeitImageProcessor -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, has_lm_head=False, is_semantic=False): - prefix = "backbone." if is_semantic else "" - - rename_keys = [] - for i in range(config.num_hidden_layers): - # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight") - ) - rename_keys.append( - (f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias") - ) - rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias")) - - # projection layer + position embeddings - rename_keys.extend( - [ - (f"{prefix}cls_token", "beit.embeddings.cls_token"), - (f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"), - (f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"), - (f"{prefix}pos_embed", "beit.embeddings.position_embeddings"), - ] - ) - - if has_lm_head: - # mask token + layernorm - rename_keys.extend( - [ - ("mask_token", "beit.embeddings.mask_token"), - ("norm.weight", "layernorm.weight"), - ("norm.bias", "layernorm.bias"), - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("fc_norm.weight", "beit.pooler.layernorm.weight"), - ("fc_norm.bias", "beit.pooler.layernorm.bias"), - ("head.weight", "classifier.weight"), - ("head.bias", "classifier.bias"), - ] - ) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False): - for i in range(config.num_hidden_layers): - prefix = "backbone." if is_semantic else "" - # queries, keys and values - in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") - - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ - : config.hidden_size, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - # gamma_1 and gamma_2 - # we call them lambda because otherwise they are renamed when using .from_pretrained - gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") - gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") - - state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1 - state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2 - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dit_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our BEiT structure. - """ - - # define default BEiT configuration - has_lm_head = False if "rvlcdip" in checkpoint_url else True - config = BeitConfig(use_absolute_position_embeddings=True, use_mask_token=has_lm_head) - - # size of the architecture - if "large" in checkpoint_url or "dit-l" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - - # labels - if "rvlcdip" in checkpoint_url: - config.num_labels = 16 - repo_id = "huggingface/label-files" - filename = "rvlcdip-id2label.json" - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - - # load state_dict of original model, remove and rename some keys - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - - rename_keys = create_rename_keys(config, has_lm_head=has_lm_head) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head) - - # load HuggingFace model - model = BeitForMaskedImageModeling(config) if has_lm_head else BeitForImageClassification(config) - model.eval() - model.load_state_dict(state_dict) - - # Check outputs on an image - image_processor = BeitImageProcessor( - size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False - ) - image = prepare_img() - - encoding = image_processor(images=image, return_tensors="pt") - pixel_values = encoding["pixel_values"] - - outputs = model(pixel_values) - logits = outputs.logits - - # verify logits - expected_shape = [1, 16] if "rvlcdip" in checkpoint_url else [1, 196, 8192] - assert logits.shape == torch.Size(expected_shape), "Shape of logits not as expected" - - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - if has_lm_head: - model_name = "dit-base" if "base" in checkpoint_url else "dit-large" - else: - model_name = "dit-base-finetuned-rvlcdip" if "dit-b" in checkpoint_url else "dit-large-finetuned-rvlcdip" - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_url", - default="https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth", - type=str, - help="URL to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - args = parser.parse_args() - convert_dit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py deleted file mode 100644 index f6f14f6d08..0000000000 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ /dev/null @@ -1,234 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Donut checkpoints using the original `donut-python` library. URL: https://github.com/clovaai/donut""" - -import argparse - -import torch -from datasets import load_dataset -from donut import DonutModel - -from transformers import ( - DonutImageProcessor, - DonutProcessor, - DonutSwinConfig, - DonutSwinModel, - MBartConfig, - MBartForCausalLM, - VisionEncoderDecoderModel, - XLMRobertaTokenizerFast, -) - - -def get_configs(model): - original_config = model.config - - encoder_config = DonutSwinConfig( - image_size=original_config.input_size, - patch_size=4, - depths=original_config.encoder_layer, - num_heads=[4, 8, 16, 32], - window_size=original_config.window_size, - embed_dim=128, - ) - decoder_config = MBartConfig( - is_decoder=True, - is_encoder_decoder=False, - add_cross_attention=True, - decoder_layers=original_config.decoder_layer, - max_position_embeddings=original_config.max_position_embeddings, - vocab_size=len( - model.decoder.tokenizer - ), # several special tokens are added to the vocab of XLMRobertaTokenizer, see repo on the hub (added_tokens.json) - scale_embedding=True, - add_final_layer_norm=True, - ) - - return encoder_config, decoder_config - - -def rename_key(name): - if "encoder.model" in name: - name = name.replace("encoder.model", "encoder") - if "decoder.model" in name: - name = name.replace("decoder.model", "decoder") - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - if "patch_embed.norm" in name: - name = name.replace("patch_embed.norm", "embeddings.norm") - if name.startswith("encoder"): - if "layers" in name: - name = "encoder." + name - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "attn" in name and "mask" not in name: - name = name.replace("attn", "attention.self") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - - if name == "encoder.norm.weight": - name = "encoder.layernorm.weight" - if name == "encoder.norm.bias": - name = "encoder.layernorm.bias" - - return name - - -def convert_state_dict(orig_state_dict, model): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - key_split = key.split(".") - layer_num = int(key_split[3]) - block_num = int(key_split[5]) - dim = model.encoder.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size - - if "weight" in key: - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" - ] = val[:dim, :] - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = ( - val[dim : dim * 2, :] - ) - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" - ] = val[-dim:, :] - else: - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = ( - val[:dim] - ) - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"] = ( - val[dim : dim * 2] - ) - orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"] = ( - val[-dim:] - ) - elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: - # HuggingFace implementation doesn't use attn_mask buffer - # and model doesn't use final LayerNorms for the encoder - pass - else: - orig_state_dict[rename_key(key)] = val - - return orig_state_dict - - -def convert_donut_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - # load original model - original_model = DonutModel.from_pretrained(model_name).eval() - - # load HuggingFace model - encoder_config, decoder_config = get_configs(original_model) - encoder = DonutSwinModel(encoder_config) - decoder = MBartForCausalLM(decoder_config) - model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder) - model.eval() - - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict(state_dict, model) - model.load_state_dict(new_state_dict) - - # verify results on scanned document - dataset = load_dataset("hf-internal-testing/example-documents") # no-script - image = dataset["test"][0]["image"].convert("RGB") - - tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name, from_slow=True) - image_processor = DonutImageProcessor( - do_align_long_axis=original_model.config.align_long_axis, size=original_model.config.input_size[::-1] - ) - processor = DonutProcessor(image_processor, tokenizer) - pixel_values = processor(image, return_tensors="pt").pixel_values - - if model_name == "naver-clova-ix/donut-base-finetuned-docvqa": - task_prompt = "{user_input}" - question = "When is the coffee break?" - task_prompt = task_prompt.replace("{user_input}", question) - elif model_name == "naver-clova-ix/donut-base-finetuned-rvlcdip": - task_prompt = "" - elif model_name in [ - "naver-clova-ix/donut-base-finetuned-cord-v1", - "naver-clova-ix/donut-base-finetuned-cord-v1-2560", - ]: - task_prompt = "" - elif model_name == "naver-clova-ix/donut-base-finetuned-cord-v2": - task_prompt = "s_cord-v2>" - elif model_name == "naver-clova-ix/donut-base-finetuned-zhtrainticket": - task_prompt = "" - elif model_name in ["naver-clova-ix/donut-proto", "naver-clova-ix/donut-base"]: - # use a random prompt - task_prompt = "hello world" - else: - raise ValueError("Model name not supported") - prompt_tensors = original_model.decoder.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")[ - "input_ids" - ] - - original_patch_embed = original_model.encoder.model.patch_embed(pixel_values) - patch_embeddings, _ = model.encoder.embeddings(pixel_values) - assert torch.allclose(original_patch_embed, patch_embeddings, atol=1e-3) - - # verify encoder hidden states - original_last_hidden_state = original_model.encoder(pixel_values) - last_hidden_state = model.encoder(pixel_values).last_hidden_state - assert torch.allclose(original_last_hidden_state, last_hidden_state, atol=1e-2) - - # verify decoder hidden states - original_logits = original_model(pixel_values, prompt_tensors, None).logits - logits = model(pixel_values, decoder_input_ids=prompt_tensors).logits - assert torch.allclose(original_logits, logits, atol=1e-3) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model") - processor.push_to_hub("nielsr/" + model_name.split("/")[-1], commit_message="Update model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="naver-clova-ix/donut-base-finetuned-docvqa", - required=False, - type=str, - help="Name of the original model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - required=False, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the converted model and processor to the đŸ€— hub.", - ) - - args = parser.parse_args() - convert_donut_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py b/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py deleted file mode 100644 index 5151c0972a..0000000000 --- a/src/transformers/models/dpr/convert_dpr_original_checkpoint_to_pytorch.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import collections -from pathlib import Path - -import torch -from torch.serialization import default_restore_location - -from transformers import BertConfig, DPRConfig, DPRContextEncoder, DPRQuestionEncoder, DPRReader - - -CheckpointState = collections.namedtuple( - "CheckpointState", ["model_dict", "optimizer_dict", "scheduler_dict", "offset", "epoch", "encoder_params"] -) - - -def load_states_from_checkpoint(model_file: str) -> CheckpointState: - print(f"Reading saved model from {model_file}") - state_dict = torch.load( - model_file, map_location=lambda s, l: default_restore_location(s, "cpu"), weights_only=True - ) - return CheckpointState(**state_dict) - - -class DPRState: - def __init__(self, src_file: Path): - self.src_file = src_file - - def load_dpr_model(self): - raise NotImplementedError - - @staticmethod - def from_type(comp_type: str, *args, **kwargs) -> "DPRState": - if comp_type.startswith("c"): - return DPRContextEncoderState(*args, **kwargs) - if comp_type.startswith("q"): - return DPRQuestionEncoderState(*args, **kwargs) - if comp_type.startswith("r"): - return DPRReaderState(*args, **kwargs) - else: - raise ValueError("Component type must be either 'ctx_encoder', 'question_encoder' or 'reader'.") - - -class DPRContextEncoderState(DPRState): - def load_dpr_model(self): - model = DPRContextEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR biencoder from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - encoder, prefix = model.ctx_encoder, "ctx_model." - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = {"bert_model.embeddings.position_ids": model.ctx_encoder.bert_model.embeddings.position_ids} - for key, value in saved_state.model_dict.items(): - if key.startswith(prefix): - key = key[len(prefix) :] - if not key.startswith("encode_proj."): - key = "bert_model." + key - state_dict[key] = value - encoder.load_state_dict(state_dict) - return model - - -class DPRQuestionEncoderState(DPRState): - def load_dpr_model(self): - model = DPRQuestionEncoder(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR biencoder from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - encoder, prefix = model.question_encoder, "question_model." - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = {"bert_model.embeddings.position_ids": model.question_encoder.bert_model.embeddings.position_ids} - for key, value in saved_state.model_dict.items(): - if key.startswith(prefix): - key = key[len(prefix) :] - if not key.startswith("encode_proj."): - key = "bert_model." + key - state_dict[key] = value - encoder.load_state_dict(state_dict) - return model - - -class DPRReaderState(DPRState): - def load_dpr_model(self): - model = DPRReader(DPRConfig(**BertConfig.get_config_dict("google-bert/bert-base-uncased")[0])) - print(f"Loading DPR reader from {self.src_file}") - saved_state = load_states_from_checkpoint(self.src_file) - # Fix changes from https://github.com/huggingface/transformers/commit/614fef1691edb806de976756d4948ecbcd0c0ca3 - state_dict = { - "encoder.bert_model.embeddings.position_ids": model.span_predictor.encoder.bert_model.embeddings.position_ids - } - for key, value in saved_state.model_dict.items(): - if key.startswith("encoder.") and not key.startswith("encoder.encode_proj"): - key = "encoder.bert_model." + key[len("encoder.") :] - state_dict[key] = value - model.span_predictor.load_state_dict(state_dict) - return model - - -def convert(comp_type: str, src_file: Path, dest_dir: Path): - dest_dir = Path(dest_dir) - dest_dir.mkdir(exist_ok=True) - - dpr_state = DPRState.from_type(comp_type, src_file=src_file) - model = dpr_state.load_dpr_model() - model.save_pretrained(dest_dir) - model.from_pretrained(dest_dir) # sanity check - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--type", type=str, help="Type of the component to convert: 'ctx_encoder', 'question_encoder' or 'reader'." - ) - parser.add_argument( - "--src", - type=str, - help=( - "Path to the dpr checkpoint file. They can be downloaded from the official DPR repo" - " https://github.com/facebookresearch/DPR. Note that in the official repo, both encoders are stored in the" - " 'retriever' checkpoints." - ), - ) - parser.add_argument("--dest", type=str, default=None, help="Path to the output PyTorch model directory.") - args = parser.parse_args() - - src_file = Path(args.src) - dest_dir = f"converted-{src_file.name}" if args.dest is None else args.dest - dest_dir = Path(dest_dir) - assert src_file.exists() - assert args.type is not None, ( - "Please specify the component type of the DPR model to convert: 'ctx_encoder', 'question_encoder' or 'reader'." - ) - convert(args.type, src_file, dest_dir) diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py deleted file mode 100644 index 21aa2b4897..0000000000 --- a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py +++ /dev/null @@ -1,383 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DINOv2 + DPT checkpoints from the original repository. URL: -https://github.com/facebookresearch/dinov2/tree/main""" - -import argparse -import itertools -import math -from pathlib import Path - -import requests -import torch -from PIL import Image -from torchvision import transforms - -from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "small" in model_name: - # equivalent to stage 3, stage 6, stage 9, stage 12 - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-small", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [48, 96, 192, 384] - elif "base" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-base", out_indices=[3, 6, 9, 12], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [96, 192, 384, 768] - elif "large" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-large", out_indices=[5, 12, 18, 24], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [128, 256, 512, 1024] - elif "giant" in model_name: - backbone_config = Dinov2Config.from_pretrained( - "facebook/dinov2-giant", out_indices=[10, 20, 30, 40], apply_layernorm=False, reshape_hidden_states=False - ) - neck_hidden_sizes = [192, 384, 768, 1536] - else: - raise NotImplementedError("To do") - - config = DPTConfig( - backbone_config=backbone_config, - neck_hidden_sizes=neck_hidden_sizes, - use_bias_in_fusion_residual=False, - add_projection=True, - ) - - return config - - -# here we list all DPT keys to be renamed (original name on the left, our name on the right) -def create_rename_keys_dpt(config): - rename_keys = [] - - # fmt: off - # activation postprocessing (projections, readout projections + resize blocks) - for i in range(4): - rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.projects.{i}.conv.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.readout_projects.{i}.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias")) - - if i != 2: - rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"decode_head.reassemble_blocks.resize_layers.{i}.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # fusion layers - for i in range(4): - rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.weight", f"neck.fusion_stage.layers.{i}.projection.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.project.conv.bias", f"neck.fusion_stage.layers.{i}.projection.bias")) - if i != 0: - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution1.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit1.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer1.convolution2.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv1.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution1.weight")) - rename_keys.append((f"decode_head.fusion_blocks.{i}.res_conv_unit2.conv2.conv.weight", f"neck.fusion_stage.layers.{i}.residual_layer2.convolution2.weight")) - - # neck convolutions - for i in range(4): - rename_keys.append((f"decode_head.convs.{i}.conv.weight", f"neck.convs.{i}.weight")) - - # head - rename_keys.append(("decode_head.project.conv.weight", "head.projection.weight")) - rename_keys.append(("decode_head.project.conv.bias", "head.projection.bias")) - - for i in range(0, 5, 2): - rename_keys.append((f"decode_head.conv_depth.head.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"decode_head.conv_depth.head.{i}.bias", f"head.head.{i}.bias")) - # fmt: on - - return rename_keys - - -# here we list all backbone keys to be renamed (original name on the left, our name on the right) -def create_rename_keys_backbone(config): - rename_keys = [] - - # fmt: off - # patch embedding layer - rename_keys.append(("cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("mask_token", "backbone.embeddings.mask_token")) - rename_keys.append(("pos_embed", "backbone.embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transformer encoder - for i in range(config.backbone_config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.norm2.bias")) - # MLP - if config.backbone_config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"backbone.encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"backbone.encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"backbone.encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"backbone.encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"backbone.encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"backbone.encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - # fmt: on - - rename_keys.append(("norm.weight", "backbone.layernorm.weight")) - rename_keys.append(("norm.bias", "backbone.layernorm.bias")) - - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - hidden_size = config.backbone_config.hidden_size - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[:hidden_size] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-hidden_size:] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "https://dl.fbaipublicfiles.com/dinov2/images/example.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -name_to_url = { - "dpt-dinov2-small-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_nyu_dpt_head.pth", - "dpt-dinov2-small-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_kitti_dpt_head.pth", - "dpt-dinov2-base-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_nyu_dpt_head.pth", - "dpt-dinov2-base-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_kitti_dpt_head.pth", - "dpt-dinov2-large-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_nyu_dpt_head.pth", - "dpt-dinov2-large-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_kitti_dpt_head.pth", - "dpt-dinov2-giant-nyu": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_nyu_dpt_head.pth", - "dpt-dinov2-giant-kitti": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_kitti_dpt_head.pth", -} - - -def get_original_pixel_values(image): - class CenterPadding: - def __init__(self, multiple): - super().__init__() - self.multiple = multiple - - def _get_pad(self, size): - new_size = math.ceil(size / self.multiple) * self.multiple - pad_size = new_size - size - pad_size_left = pad_size // 2 - pad_size_right = pad_size - pad_size_left - return pad_size_left, pad_size_right - - def __call__(self, img): - pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in img.shape[-2:][::-1])) - output = torch.nn.functional.pad(img, pads) - return output - - def __repr__(self): - return self.__class__.__name__ + "()" - - def make_depth_transform() -> transforms.Compose: - return transforms.Compose( - [ - transforms.ToTensor(), - lambda x: 255.0 * x[:3], # Discard alpha component and scale by 255 - transforms.Normalize( - mean=(123.675, 116.28, 103.53), - std=(58.395, 57.12, 57.375), - ), - CenterPadding(multiple=14), - ] - ) - - transform = make_depth_transform() - original_pixel_values = transform(image).unsqueeze(0) - - return original_pixel_values - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config = get_dpt_config(model_name) - - # load original DPT state_dict from URL - print("URL:", checkpoint_url) - dpt_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["state_dict"] - # rename keys - rename_keys = create_rename_keys_dpt(config) - for src, dest in rename_keys: - rename_key(dpt_state_dict, src, dest) - - # load original backbone state_dict from URL - if "small" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14") - elif "base" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14") - elif "large" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitl14") - elif "giant" in model_name: - original_model = torch.hub.load("facebookresearch/dinov2", "dinov2_vitg14") - else: - raise NotImplementedError("To do") - original_model.eval() - backbone_state_dict = original_model.state_dict() - - # rename keys - rename_keys = create_rename_keys_backbone(config) - for src, dest in rename_keys: - rename_key(backbone_state_dict, src, dest) - - # read in qkv matrices - read_in_q_k_v(backbone_state_dict, config) - - for key, val in backbone_state_dict.copy().items(): - val = backbone_state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - backbone_state_dict[key] = val - - # merge state_dicts - state_dict = {**backbone_state_dict, **dpt_state_dict} - - # load HuggingFace model - model = DPTForDepthEstimation(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - assert missing_keys == [ - "neck.fusion_stage.layers.0.residual_layer1.convolution1.weight", - "neck.fusion_stage.layers.0.residual_layer1.convolution2.weight", - ] - model.eval() - - # Verify image processor - processor = DPTImageProcessor( - do_resize=False, - do_rescale=False, - do_pad=True, - size_divisor=14, - do_normalize=True, - image_mean=(123.675, 116.28, 103.53), - image_std=(58.395, 57.12, 57.375), - ) - - image = prepare_img() - pixel_values = processor(image, return_tensors="pt").pixel_values.float() - original_pixel_values = get_original_pixel_values(image) - - assert torch.allclose(pixel_values, original_pixel_values) - - # Verify forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - if verify_logits: - if model_name == "dpt-dinov2-small-nyu": - expected_shape = torch.Size([1, 576, 736]) - expected_slice = torch.tensor( - [[3.3576, 3.4741, 3.4345], [3.4324, 3.5012, 3.2775], [3.2560, 3.3563, 3.2354]] - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-5) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"facebook/{model_name}") - processor.push_to_hub(repo_id=f"facebook/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-dinov2-small-nyu", - type=str, - choices=name_to_url.keys(), - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - parser.add_argument( - "--verify_logits", - action="store_true", - required=False, - help="Path to the output PyTorch model directory.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.verify_logits) diff --git a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py deleted file mode 100644 index c4ff8a3eb7..0000000000 --- a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT 3.1 checkpoints from the MiDaS repository. URL: https://github.com/isl-org/MiDaS""" - -import argparse -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import BeitConfig, DPTConfig, DPTForDepthEstimation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - hidden_size = 768 - num_hidden_layers = 12 - num_attention_heads = 12 - intermediate_size = 3072 - out_features = ["stage3", "stage6", "stage9", "stage12"] # beit-base-384 uses [2, 5, 8, 11] - - if "large" in model_name: - hidden_size = 1024 - num_hidden_layers = 24 - num_attention_heads = 16 - intermediate_size = 4096 - out_features = ["stage6", "stage12", "stage18", "stage24"] # beit-large-512 uses [5, 11, 17, 23] - - if "512" in model_name: - image_size = 512 - elif "384" in model_name: - image_size = 384 - else: - raise ValueError("Model not supported") - - backbone_config = BeitConfig( - image_size=image_size, - num_hidden_layers=num_hidden_layers, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_attention_heads=num_attention_heads, - use_relative_position_bias=True, - reshape_hidden_states=False, - out_features=out_features, - ) - - neck_hidden_sizes = [256, 512, 1024, 1024] if "large" in model_name else [96, 192, 384, 768] - config = DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes) - - return config, image_size - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.model.cls_token", "backbone.embeddings.cls_token")) - rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - - # Transformer encoder - for i in range(config.backbone_config.num_hidden_layers): - rename_keys.append((f"pretrained.model.blocks.{i}.gamma_1", f"backbone.encoder.layer.{i}.lambda_1")) - rename_keys.append((f"pretrained.model.blocks.{i}.gamma_2", f"backbone.encoder.layer.{i}.lambda_2")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.layernorm_before.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.layernorm_before.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.layernorm_after.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.layernorm_after.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.output.dense.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_bias_table", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table")) - rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_index", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index")) - - # activation postprocessing (readout projections + resize blocks) - for i in range(4): - rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias")) - - rename_keys.append((f"pretrained.act_postprocess{i+1}.3.weight", f"neck.reassemble_stage.layers.{i}.projection.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.3.bias", f"neck.reassemble_stage.layers.{i}.projection.bias")) - - if i != 2: - rename_keys.append((f"pretrained.act_postprocess{i+1}.4.weight", f"neck.reassemble_stage.layers.{i}.resize.weight")) - rename_keys.append((f"pretrained.act_postprocess{i+1}.4.bias", f"neck.reassemble_stage.layers.{i}.resize.bias")) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - for i in range(0, 5, 2): - rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias")) - - return rename_keys - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - hidden_size = config.backbone_config.hidden_size - for i in range(config.backbone_config.num_hidden_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"pretrained.model.blocks.{i}.attn.qkv.weight") - q_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.v_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = q_bias - state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = v_bias - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - name_to_url = { - "dpt-beit-large-512": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt", - "dpt-beit-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt", - "dpt-beit-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt", - } - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config, image_size = get_dpt_config(model_name) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForDepthEstimation(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - assert missing_keys == [] - # assert unexpected_keys == ["pretrained.model.fc_norm.weight", "pretrained.model.fc_norm.bias"] - model.eval() - - # Check outputs on an image - # We set `keep_aspect_ratio=False` as our current BEiT does not support arbitrary window sizes - processor = DPTImageProcessor( - size={"height": image_size, "width": image_size}, keep_aspect_ratio=False, ensure_multiple_of=32 - ) - - image = prepare_img() - pixel_values = processor(image, return_tensors="pt").pixel_values - - print("First values of pixel values:", pixel_values[0, 0, :3, :3]) - print("Mean of pixel values:", pixel_values.mean().item()) - print("Shape of pixel values:", pixel_values.shape) - - import requests - from PIL import Image - from torchvision import transforms - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - transforms = transforms.Compose( - [ - transforms.Resize((image_size, image_size)), - transforms.ToTensor(), - ] - ) - pixel_values = transforms(image).unsqueeze(0) - - # forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - # TODO there's still a small difference with the original logits - if model_name == "dpt-beit-large-512": - # OK, checked - expected_shape = torch.Size([1, 512, 512]) - expected_slice = torch.tensor( - [[2804.6260, 2792.5708, 2812.9263], [2772.0288, 2780.1118, 2796.2529], [2748.1094, 2766.6558, 2766.9834]] - ) - elif model_name == "dpt-beit-large-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [[1783.2273, 1780.5729, 1792.6453], [1759.9817, 1765.5359, 1778.5002], [1739.1633, 1754.7903, 1757.1990]], - ) - elif model_name == "dpt-beit-base-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [[2898.4482, 2891.3750, 2904.8079], [2858.6685, 2877.2615, 2894.4507], [2842.1235, 2854.1023, 2861.6328]], - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"nielsr/{model_name}") - processor.push_to_hub(repo_id=f"nielsr/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-beit-large-512", - type=str, - choices=["dpt-beit-large-512", "dpt-beit-large-384", "dpt-beit-base-384"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py deleted file mode 100644 index ceae9b8471..0000000000 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ /dev/null @@ -1,315 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT checkpoints from the original repository. URL: https://github.com/isl-org/DPT""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(checkpoint_url): - config = DPTConfig(embedding_type="hybrid") - - if "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - config.backbone_out_indices = [5, 11, 17, 23] - config.neck_hidden_sizes = [256, 512, 1024, 1024] - expected_shape = (1, 384, 384) - - if "nyu" in checkpoint_url or "midas" in checkpoint_url: - config.hidden_size = 768 - config.reassemble_factors = [1, 1, 1, 0.5] - config.neck_hidden_sizes = [256, 512, 768, 768] - config.num_labels = 150 - config.patch_size = 16 - expected_shape = (1, 384, 384) - config.use_batch_norm_in_fusion_residual = False - config.readout_type = "project" - - if "ade" in checkpoint_url: - config.use_batch_norm_in_fusion_residual = True - config.hidden_size = 768 - config.reassemble_stage = [1, 1, 1, 0.5] - config.num_labels = 150 - config.patch_size = 16 - repo_id = "huggingface/label-files" - filename = "ade20k-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - expected_shape = [1, 150, 480, 480] - - return config, expected_shape - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if ( - "pretrained.model" in name - and "cls_token" not in name - and "pos_embed" not in name - and "patch_embed" not in name - ): - name = name.replace("pretrained.model", "dpt.encoder") - if "pretrained.model" in name: - name = name.replace("pretrained.model", "dpt.embeddings") - if "patch_embed" in name: - name = name.replace("patch_embed", "") - if "pos_embed" in name: - name = name.replace("pos_embed", "position_embeddings") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "proj" in name and "project" not in name: - name = name.replace("proj", "projection") - if "blocks" in name: - name = name.replace("blocks", "layer") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - if "norm1" in name and "backbone" not in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name and "backbone" not in name: - name = name.replace("norm2", "layernorm_after") - if "scratch.output_conv" in name: - name = name.replace("scratch.output_conv", "head") - if "scratch" in name: - name = name.replace("scratch", "neck") - if "layer1_rn" in name: - name = name.replace("layer1_rn", "convs.0") - if "layer2_rn" in name: - name = name.replace("layer2_rn", "convs.1") - if "layer3_rn" in name: - name = name.replace("layer3_rn", "convs.2") - if "layer4_rn" in name: - name = name.replace("layer4_rn", "convs.3") - if "refinenet" in name: - layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1]) - # tricky here: we need to map 4 to 0, 3 to 1, 2 to 2 and 1 to 3 - name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx - 4)}") - if "out_conv" in name: - name = name.replace("out_conv", "projection") - if "resConfUnit1" in name: - name = name.replace("resConfUnit1", "residual_layer1") - if "resConfUnit2" in name: - name = name.replace("resConfUnit2", "residual_layer2") - if "conv1" in name: - name = name.replace("conv1", "convolution1") - if "conv2" in name: - name = name.replace("conv2", "convolution2") - # readout blocks - if "pretrained.act_postprocess1.0.project.0" in name: - name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0") - if "pretrained.act_postprocess2.0.project.0" in name: - name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0") - if "pretrained.act_postprocess3.0.project.0" in name: - name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0") - if "pretrained.act_postprocess4.0.project.0" in name: - name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0") - - # resize blocks - if "pretrained.act_postprocess1.3" in name: - name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") - if "pretrained.act_postprocess1.4" in name: - name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") - if "pretrained.act_postprocess2.3" in name: - name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") - if "pretrained.act_postprocess2.4" in name: - name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") - if "pretrained.act_postprocess3.3" in name: - name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") - if "pretrained.act_postprocess4.3" in name: - name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") - if "pretrained.act_postprocess4.4" in name: - name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") - if "pretrained" in name: - name = name.replace("pretrained", "dpt") - if "bn" in name: - name = name.replace("bn", "batch_norm") - if "head" in name: - name = name.replace("head", "head.head") - if "encoder.norm" in name: - name = name.replace("encoder.norm", "layernorm") - if "auxlayer" in name: - name = name.replace("auxlayer", "auxiliary_head.head") - if "backbone" in name: - name = name.replace("backbone", "backbone.bit.encoder") - - if ".." in name: - name = name.replace("..", ".") - - if "stem.conv" in name: - name = name.replace("stem.conv", "bit.embedder.convolution") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "convolution" in name and "backbone" in name: - name = name.replace("convolution", "conv") - if "layer" in name and "backbone" in name: - name = name.replace("layer", "layers") - if "backbone.bit.encoder.bit" in name: - name = name.replace("backbone.bit.encoder.bit", "backbone.bit") - if "embedder.conv" in name: - name = name.replace("embedder.conv", "embedder.convolution") - if "backbone.bit.encoder.stem.norm" in name: - name = name.replace("backbone.bit.encoder.stem.norm", "backbone.bit.embedder.norm") - return name - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name, show_prediction): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - # define DPT configuration based on URL - config, expected_shape = get_dpt_config(checkpoint_url) - # load original state_dict from URL - # state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - state_dict = torch.load(checkpoint_url, map_location="cpu", weights_only=True) - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image - size = 480 if "ade" in checkpoint_url else 384 - image_processor = DPTImageProcessor(size=size) - - image = prepare_img() - encoding = image_processor(image, return_tensors="pt") - - # forward pass - outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth - - if show_prediction: - prediction = ( - torch.nn.functional.interpolate( - outputs.unsqueeze(1), - size=(image.size[1], image.size[0]), - mode="bicubic", - align_corners=False, - ) - .squeeze() - .cpu() - .numpy() - ) - - Image.fromarray((prediction / prediction.max()) * 255).show() - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub("ybelkada/dpt-hybrid-midas") - image_processor.push_to_hub("ybelkada/dpt-hybrid-midas") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt", - type=str, - help="URL of the original DPT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - parser.add_argument( - "--model_name", - default="dpt-large", - type=str, - help="Name of the model, in case you're pushing to the hub.", - ) - parser.add_argument( - "--show_prediction", - action="store_true", - ) - - args = parser.parse_args() - convert_dpt_checkpoint( - args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name, args.show_prediction - ) diff --git a/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py b/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py deleted file mode 100644 index 0feebe72d4..0000000000 --- a/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py +++ /dev/null @@ -1,321 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT 3.1 checkpoints from the MiDaS repository. URL: https://github.com/isl-org/MiDaS""" - -import argparse -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTImageProcessor, Swinv2Config -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(model_name): - if "tiny" in model_name: - embed_dim = 96 - depths = (2, 2, 6, 2) - num_heads = (3, 6, 12, 24) - window_size = 16 - # note: for Swinv2-tiny authors used the window_size = 16 variant - # as seen here: https://github.com/isl-org/MiDaS/blob/bdc4ed64c095e026dc0a2f17cabb14d58263decb/midas/backbones/swin2.py#L26 - pretrained_window_sizes = (0, 0, 0, 0) - elif "base" in model_name: - embed_dim = 128 - depths = (2, 2, 18, 2) - num_heads = (4, 8, 16, 32) - window_size = 24 - pretrained_window_sizes = (12, 12, 12, 6) - elif "large" in model_name: - embed_dim = 192 - depths = (2, 2, 18, 2) - num_heads = (6, 12, 24, 48) - window_size = 24 - pretrained_window_sizes = (12, 12, 12, 6) - - if "384" in model_name: - image_size = 384 - elif "256" in model_name: - image_size = 256 - else: - raise ValueError("Model not supported, to do") - - backbone_config = Swinv2Config( - image_size=image_size, - embed_dim=embed_dim, - depths=depths, - window_size=window_size, - pretrained_window_sizes=pretrained_window_sizes, - num_heads=num_heads, - out_features=["stage1", "stage2", "stage3", "stage4"], - ) - - if model_name == "dpt-swinv2-tiny-256": - neck_hidden_sizes = [96, 192, 384, 768] - elif model_name == "dpt-swinv2-base-384": - neck_hidden_sizes = [128, 256, 512, 1024] - elif model_name == "dpt-swinv2-large-384": - neck_hidden_sizes = [192, 384, 768, 1536] - - config = DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes) - - return config, image_size - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - - # fmt: off - # stem - rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("pretrained.model.patch_embed.norm.weight", "backbone.embeddings.norm.weight")) - rename_keys.append(("pretrained.model.patch_embed.norm.bias", "backbone.embeddings.norm.bias")) - - # transformer encoder - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.logit_scale", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.logit_scale")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.0.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.0.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.0.bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.0.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.cpb_mlp.2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.continuous_position_bias_mlp.2.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.q_bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.v_bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.proj.weight", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.attn.proj.bias", f"backbone.encoder.layers.{i}.blocks.{j}.attention.output.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_before.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc1.weight", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc1.bias", f"backbone.encoder.layers.{i}.blocks.{j}.intermediate.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.mlp.fc2.bias", f"backbone.encoder.layers.{i}.blocks.{j}.output.dense.bias")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm2.weight", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.blocks.{j}.norm2.bias", f"backbone.encoder.layers.{i}.blocks.{j}.layernorm_after.bias")) - - # downsample parameters - if i in [0,1,2]: - rename_keys.append((f"pretrained.model.layers.{i}.downsample.reduction.weight", f"backbone.encoder.layers.{i}.downsample.reduction.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.downsample.norm.weight", f"backbone.encoder.layers.{i}.downsample.norm.weight")) - rename_keys.append((f"pretrained.model.layers.{i}.downsample.norm.bias", f"backbone.encoder.layers.{i}.downsample.norm.bias")) - - # note: non-Transformer backbones like Swinv2, LeViT et al don't require activation postprocessing (readout projections + resize blocks) - - # refinenet (tricky here) - mapping = {1:3, 2:2, 3:1, 4:0} - - for i in range(1, 5): - j = mapping[i] - rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight")) - rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight")) - rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias")) - - # scratch convolutions - for i in range(4): - rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight")) - - # head - for i in range(0, 5, 2): - rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight")) - rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias")) - - return rename_keys - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, model): - for i in range(len(config.backbone_config.depths)): - for j in range(config.backbone_config.depths[i]): - dim = model.backbone.encoder.layers[i].blocks[j].attention.self.all_head_size - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"pretrained.model.layers.{i}.blocks.{j}.attn.qkv.weight") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :] - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"backbone.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[ - -dim:, : - ] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, verify_logits, push_to_hub): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - name_to_url = { - "dpt-swinv2-tiny-256": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt", - "dpt-swinv2-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt", - "dpt-swinv2-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt", - } - - # define DPT configuration based on URL - checkpoint_url = name_to_url[model_name] - config, image_size = get_dpt_config(model_name) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - - # load HuggingFace model - model = DPTForDepthEstimation(config) - - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - # read in qkv matrices - read_in_q_k_v(state_dict, config, model) - - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - model.eval() - - # Check outputs on an image - processor = DPTImageProcessor(size={"height": image_size, "width": image_size}) - - image = prepare_img() - processor(image, return_tensors="pt") - - if verify_logits: - from torchvision import transforms - - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - transforms = transforms.Compose( - [ - transforms.Resize((image_size, image_size)), - transforms.ToTensor(), - ] - ) - pixel_values = transforms(image).unsqueeze(0) - - # forward pass - with torch.no_grad(): - outputs = model(pixel_values) - - predicted_depth = outputs.predicted_depth - - print("Shape of predicted depth:", predicted_depth.shape) - print("First values of predicted depth:", predicted_depth[0, :3, :3]) - - # assert logits - if model_name == "dpt-swinv2-base-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [ - [1998.5575, 1997.3887, 2009.2981], - [1952.8607, 1979.6488, 2001.0854], - [1953.7697, 1961.7711, 1968.8904], - ], - ) - elif model_name == "dpt-swinv2-tiny-256": - # OK, checked - expected_shape = torch.Size([1, 256, 256]) - expected_slice = torch.tensor( - [[978.9163, 976.5215, 978.5349], [974.1859, 971.7249, 975.8046], [971.3419, 970.3118, 971.6830]], - ) - elif model_name == "dpt-swinv2-large-384": - # OK, checked - expected_shape = torch.Size([1, 384, 384]) - expected_slice = torch.tensor( - [ - [1203.7206, 1200.1495, 1197.8234], - [1196.2484, 1183.5033, 1186.4640], - [1178.8131, 1182.3260, 1174.3975], - ], - ) - - assert predicted_depth.shape == torch.Size(expected_shape) - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model and processor to hub...") - model.push_to_hub(repo_id=f"Intel/{model_name}") - processor.push_to_hub(repo_id=f"Intel/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="dpt-swinv2-base-384", - type=str, - choices=["dpt-swinv2-tiny-256", "dpt-swinv2-base-384", "dpt-swinv2-large-384"], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--verify_logits", - action="store_true", - help="Whether to verify logits after conversion.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub after conversion.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.verify_logits, args.push_to_hub) diff --git a/src/transformers/models/dpt/convert_dpt_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_to_pytorch.py deleted file mode 100644 index 55e0a444e8..0000000000 --- a/src/transformers/models/dpt/convert_dpt_to_pytorch.py +++ /dev/null @@ -1,285 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DPT checkpoints from the original repository. URL: https://github.com/isl-org/DPT""" - -import argparse -import json -from pathlib import Path - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers import DPTConfig, DPTForDepthEstimation, DPTForSemanticSegmentation, DPTImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_dpt_config(checkpoint_url): - config = DPTConfig() - - if "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - config.backbone_out_indices = [5, 11, 17, 23] - config.neck_hidden_sizes = [256, 512, 1024, 1024] - expected_shape = (1, 384, 384) - - if "ade" in checkpoint_url: - config.use_batch_norm_in_fusion_residual = True - - config.num_labels = 150 - repo_id = "huggingface/label-files" - filename = "ade20k-id2label.json" - id2label = json.loads(Path(hf_hub_download(repo_id, filename, repo_type="dataset")).read_text()) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - expected_shape = [1, 150, 480, 480] - - return config, expected_shape - - -def remove_ignore_keys_(state_dict): - ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(name): - if ( - "pretrained.model" in name - and "cls_token" not in name - and "pos_embed" not in name - and "patch_embed" not in name - ): - name = name.replace("pretrained.model", "dpt.encoder") - if "pretrained.model" in name: - name = name.replace("pretrained.model", "dpt.embeddings") - if "patch_embed" in name: - name = name.replace("patch_embed", "patch_embeddings") - if "pos_embed" in name: - name = name.replace("pos_embed", "position_embeddings") - if "attn.proj" in name: - name = name.replace("attn.proj", "attention.output.dense") - if "proj" in name and "project" not in name: - name = name.replace("proj", "projection") - if "blocks" in name: - name = name.replace("blocks", "layer") - if "mlp.fc1" in name: - name = name.replace("mlp.fc1", "intermediate.dense") - if "mlp.fc2" in name: - name = name.replace("mlp.fc2", "output.dense") - if "norm1" in name: - name = name.replace("norm1", "layernorm_before") - if "norm2" in name: - name = name.replace("norm2", "layernorm_after") - if "scratch.output_conv" in name: - name = name.replace("scratch.output_conv", "head") - if "scratch" in name: - name = name.replace("scratch", "neck") - if "layer1_rn" in name: - name = name.replace("layer1_rn", "convs.0") - if "layer2_rn" in name: - name = name.replace("layer2_rn", "convs.1") - if "layer3_rn" in name: - name = name.replace("layer3_rn", "convs.2") - if "layer4_rn" in name: - name = name.replace("layer4_rn", "convs.3") - if "refinenet" in name: - layer_idx = int(name[len("neck.refinenet") : len("neck.refinenet") + 1]) - # tricky here: we need to map 4 to 0, 3 to 1, 2 to 2 and 1 to 3 - name = name.replace(f"refinenet{layer_idx}", f"fusion_stage.layers.{abs(layer_idx - 4)}") - if "out_conv" in name: - name = name.replace("out_conv", "projection") - if "resConfUnit1" in name: - name = name.replace("resConfUnit1", "residual_layer1") - if "resConfUnit2" in name: - name = name.replace("resConfUnit2", "residual_layer2") - if "conv1" in name: - name = name.replace("conv1", "convolution1") - if "conv2" in name: - name = name.replace("conv2", "convolution2") - # readout blocks - if "pretrained.act_postprocess1.0.project.0" in name: - name = name.replace("pretrained.act_postprocess1.0.project.0", "neck.reassemble_stage.readout_projects.0.0") - if "pretrained.act_postprocess2.0.project.0" in name: - name = name.replace("pretrained.act_postprocess2.0.project.0", "neck.reassemble_stage.readout_projects.1.0") - if "pretrained.act_postprocess3.0.project.0" in name: - name = name.replace("pretrained.act_postprocess3.0.project.0", "neck.reassemble_stage.readout_projects.2.0") - if "pretrained.act_postprocess4.0.project.0" in name: - name = name.replace("pretrained.act_postprocess4.0.project.0", "neck.reassemble_stage.readout_projects.3.0") - # resize blocks - if "pretrained.act_postprocess1.3" in name: - name = name.replace("pretrained.act_postprocess1.3", "neck.reassemble_stage.layers.0.projection") - if "pretrained.act_postprocess1.4" in name: - name = name.replace("pretrained.act_postprocess1.4", "neck.reassemble_stage.layers.0.resize") - if "pretrained.act_postprocess2.3" in name: - name = name.replace("pretrained.act_postprocess2.3", "neck.reassemble_stage.layers.1.projection") - if "pretrained.act_postprocess2.4" in name: - name = name.replace("pretrained.act_postprocess2.4", "neck.reassemble_stage.layers.1.resize") - if "pretrained.act_postprocess3.3" in name: - name = name.replace("pretrained.act_postprocess3.3", "neck.reassemble_stage.layers.2.projection") - if "pretrained.act_postprocess4.3" in name: - name = name.replace("pretrained.act_postprocess4.3", "neck.reassemble_stage.layers.3.projection") - if "pretrained.act_postprocess4.4" in name: - name = name.replace("pretrained.act_postprocess4.4", "neck.reassemble_stage.layers.3.resize") - if "pretrained" in name: - name = name.replace("pretrained", "dpt") - if "bn" in name: - name = name.replace("bn", "batch_norm") - if "head" in name: - name = name.replace("head", "head.head") - if "encoder.norm" in name: - name = name.replace("encoder.norm", "layernorm") - if "auxlayer" in name: - name = name.replace("auxlayer", "auxiliary_head.head") - - return name - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"dpt.encoder.layer.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ - -config.hidden_size :, : - ] - state_dict[f"dpt.encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_dpt_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub, model_name): - """ - Copy/paste/tweak model's weights to our DPT structure. - """ - - # define DPT configuration based on URL - config, expected_shape = get_dpt_config(checkpoint_url) - # load original state_dict from URL - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") - # remove certain keys - remove_ignore_keys_(state_dict) - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - # read in qkv matrices - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = DPTForSemanticSegmentation(config) if "ade" in checkpoint_url else DPTForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # Check outputs on an image - size = 480 if "ade" in checkpoint_url else 384 - image_processor = DPTImageProcessor(size=size) - - image = prepare_img() - encoding = image_processor(image, return_tensors="pt") - - # forward pass - outputs = model(**encoding).logits if "ade" in checkpoint_url else model(**encoding).predicted_depth - - # Assert logits - expected_slice = torch.tensor([[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]]) - if "ade" in checkpoint_url: - expected_slice = torch.tensor([[4.0480, 4.2420, 4.4360], [4.3124, 4.5693, 4.8261], [4.5768, 4.8965, 5.2163]]) - assert outputs.shape == torch.Size(expected_shape) - assert ( - torch.allclose(outputs[0, 0, :3, :3], expected_slice, atol=1e-4) - if "ade" in checkpoint_url - else torch.allclose(outputs[0, :3, :3], expected_slice) - ) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing model to hub...") - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--checkpoint_url", - default="https://github.com/intel-isl/DPT/releases/download/1_0/dpt_large-midas-2f21e586.pt", - type=str, - help="URL of the original DPT checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - required=False, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - ) - parser.add_argument( - "--model_name", - default="dpt-large", - type=str, - required=False, - help="Name of the model, in case you're pushing to the hub.", - ) - - args = parser.parse_args() - convert_dpt_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name) diff --git a/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py b/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py deleted file mode 100644 index e9988524ac..0000000000 --- a/src/transformers/models/efficientnet/convert_efficientnet_to_pytorch.py +++ /dev/null @@ -1,339 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert EfficientNet checkpoints from the original repository. - -URL: https://github.com/keras-team/keras/blob/v2.11.0/keras/applications/efficientnet.py""" - -import argparse -import json -import os - -import numpy as np -import PIL -import requests -import tensorflow.keras.applications.efficientnet as efficientnet -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from tensorflow.keras.preprocessing import image - -from transformers import ( - EfficientNetConfig, - EfficientNetForImageClassification, - EfficientNetImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -model_classes = { - "b0": efficientnet.EfficientNetB0, - "b1": efficientnet.EfficientNetB1, - "b2": efficientnet.EfficientNetB2, - "b3": efficientnet.EfficientNetB3, - "b4": efficientnet.EfficientNetB4, - "b5": efficientnet.EfficientNetB5, - "b6": efficientnet.EfficientNetB6, - "b7": efficientnet.EfficientNetB7, -} - -CONFIG_MAP = { - "b0": { - "hidden_dim": 1280, - "width_coef": 1.0, - "depth_coef": 1.0, - "image_size": 224, - "dropout_rate": 0.2, - "dw_padding": [], - }, - "b1": { - "hidden_dim": 1280, - "width_coef": 1.0, - "depth_coef": 1.1, - "image_size": 240, - "dropout_rate": 0.2, - "dw_padding": [16], - }, - "b2": { - "hidden_dim": 1408, - "width_coef": 1.1, - "depth_coef": 1.2, - "image_size": 260, - "dropout_rate": 0.3, - "dw_padding": [5, 8, 16], - }, - "b3": { - "hidden_dim": 1536, - "width_coef": 1.2, - "depth_coef": 1.4, - "image_size": 300, - "dropout_rate": 0.3, - "dw_padding": [5, 18], - }, - "b4": { - "hidden_dim": 1792, - "width_coef": 1.4, - "depth_coef": 1.8, - "image_size": 380, - "dropout_rate": 0.4, - "dw_padding": [6], - }, - "b5": { - "hidden_dim": 2048, - "width_coef": 1.6, - "depth_coef": 2.2, - "image_size": 456, - "dropout_rate": 0.4, - "dw_padding": [13, 27], - }, - "b6": { - "hidden_dim": 2304, - "width_coef": 1.8, - "depth_coef": 2.6, - "image_size": 528, - "dropout_rate": 0.5, - "dw_padding": [31], - }, - "b7": { - "hidden_dim": 2560, - "width_coef": 2.0, - "depth_coef": 3.1, - "image_size": 600, - "dropout_rate": 0.5, - "dw_padding": [18], - }, -} - - -def get_efficientnet_config(model_name): - config = EfficientNetConfig() - config.hidden_dim = CONFIG_MAP[model_name]["hidden_dim"] - config.width_coefficient = CONFIG_MAP[model_name]["width_coef"] - config.depth_coefficient = CONFIG_MAP[model_name]["depth_coef"] - config.image_size = CONFIG_MAP[model_name]["image_size"] - config.dropout_rate = CONFIG_MAP[model_name]["dropout_rate"] - config.depthwise_padding = CONFIG_MAP[model_name]["dw_padding"] - - repo_id = "huggingface/label-files" - filename = "imagenet-1k-id2label.json" - config.num_labels = 1000 - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} - return config - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def convert_image_processor(model_name): - size = CONFIG_MAP[model_name]["image_size"] - preprocessor = EfficientNetImageProcessor( - size={"height": size, "width": size}, - image_mean=[0.485, 0.456, 0.406], - image_std=[0.47853944, 0.4732864, 0.47434163], - do_center_crop=False, - ) - return preprocessor - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def rename_keys(original_param_names): - block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")] - block_names = sorted(set(block_names)) - num_blocks = len(block_names) - block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))} - - rename_keys = [] - rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight")) - rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight")) - rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias")) - rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean")) - rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var")) - - for b in block_names: - hf_b = block_name_mapping[b] - rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight")) - rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight")) - rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias")) - rename_keys.append( - (f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var") - ) - rename_keys.append( - (f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight") - ) - rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight")) - rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias")) - rename_keys.append( - (f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean") - ) - rename_keys.append( - (f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var") - ) - - rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight")) - rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias")) - rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight")) - rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias")) - rename_keys.append( - (f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight") - ) - rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight")) - rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias")) - rename_keys.append( - (f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean") - ) - rename_keys.append( - (f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var") - ) - - rename_keys.append(("top_conv/kernel:0", "encoder.top_conv.weight")) - rename_keys.append(("top_bn/gamma:0", "encoder.top_bn.weight")) - rename_keys.append(("top_bn/beta:0", "encoder.top_bn.bias")) - rename_keys.append(("top_bn/moving_mean:0", "encoder.top_bn.running_mean")) - rename_keys.append(("top_bn/moving_variance:0", "encoder.top_bn.running_var")) - - key_mapping = {} - for item in rename_keys: - if item[0] in original_param_names: - key_mapping[item[0]] = "efficientnet." + item[1] - - key_mapping["predictions/kernel:0"] = "classifier.weight" - key_mapping["predictions/bias:0"] = "classifier.bias" - return key_mapping - - -def replace_params(hf_params, tf_params, key_mapping): - for key, value in tf_params.items(): - if "normalization" in key: - continue - - hf_key = key_mapping[key] - if "_conv" in key and "kernel" in key: - new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1) - elif "depthwise_kernel" in key: - new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1) - elif "kernel" in key: - new_hf_value = torch.from_numpy(np.transpose(value)) - else: - new_hf_value = torch.from_numpy(value) - - # Replace HF parameters with original TF model parameters - assert hf_params[hf_key].shape == new_hf_value.shape - hf_params[hf_key].copy_(new_hf_value) - - -@torch.no_grad() -def convert_efficientnet_checkpoint(model_name, pytorch_dump_folder_path, save_model, push_to_hub): - """ - Copy/paste/tweak model's weights to our EfficientNet structure. - """ - # Load original model - original_model = model_classes[model_name]( - include_top=True, - weights="imagenet", - input_tensor=None, - input_shape=None, - pooling=None, - classes=1000, - classifier_activation="softmax", - ) - - tf_params = original_model.trainable_variables - tf_non_train_params = original_model.non_trainable_variables - tf_params = {param.name: param.numpy() for param in tf_params} - for param in tf_non_train_params: - tf_params[param.name] = param.numpy() - tf_param_names = list(tf_params.keys()) - - # Load HuggingFace model - config = get_efficientnet_config(model_name) - hf_model = EfficientNetForImageClassification(config).eval() - hf_params = hf_model.state_dict() - - # Create src-to-dst parameter name mapping dictionary - print("Converting parameters...") - key_mapping = rename_keys(tf_param_names) - replace_params(hf_params, tf_params, key_mapping) - - # Initialize preprocessor and preprocess input image - preprocessor = convert_image_processor(model_name) - inputs = preprocessor(images=prepare_img(), return_tensors="pt") - - # HF model inference - hf_model.eval() - with torch.no_grad(): - outputs = hf_model(**inputs) - hf_logits = outputs.logits.detach().numpy() - - # Original model inference - original_model.trainable = False - image_size = CONFIG_MAP[model_name]["image_size"] - img = prepare_img().resize((image_size, image_size), resample=PIL.Image.NEAREST) - x = image.img_to_array(img) - x = np.expand_dims(x, axis=0) - original_logits = original_model.predict(x) - - # Check whether original and HF model outputs match -> np.allclose - assert np.allclose(original_logits, hf_logits, atol=1e-3), "The predicted logits are not the same." - print("Model outputs match!") - - if save_model: - # Create folder to save model - if not os.path.isdir(pytorch_dump_folder_path): - os.mkdir(pytorch_dump_folder_path) - # Save converted model and image processor - hf_model.save_pretrained(pytorch_dump_folder_path) - preprocessor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - # Push model and image processor to hub - print(f"Pushing converted {model_name} to the hub...") - model_name = f"efficientnet-{model_name}" - preprocessor.push_to_hub(model_name) - hf_model.push_to_hub(model_name) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="b0", - type=str, - help="Version name of the EfficientNet model you want to convert, select from [b0, b1, b2, b3, b4, b5, b6, b7].", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default="hf_model", - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument("--save_model", action="store_true", help="Save model to local") - parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") - - args = parser.parse_args() - convert_efficientnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub) diff --git a/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py deleted file mode 100644 index b0abc30cd7..0000000000 --- a/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,79 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ELECTRA checkpoint.""" - -import argparse - -import torch - -from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): - # Initialise PyTorch model - config = ElectraConfig.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - - if discriminator_or_generator == "discriminator": - model = ElectraForPreTraining(config) - elif discriminator_or_generator == "generator": - model = ElectraForMaskedLM(config) - else: - raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") - - # Load weights from tf checkpoint - load_tf_weights_in_electra( - model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator - ) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--discriminator_or_generator", - default=None, - type=str, - required=True, - help=( - "Whether to export the generator or the discriminator. Should be a string, either 'discriminator' or " - "'generator'." - ), - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator - ) diff --git a/src/transformers/models/emu3/convert_emu3_weights_to_hf.py b/src/transformers/models/emu3/convert_emu3_weights_to_hf.py deleted file mode 100644 index 8ac8db7e42..0000000000 --- a/src/transformers/models/emu3/convert_emu3_weights_to_hf.py +++ /dev/null @@ -1,448 +0,0 @@ -# Copyright 2024 The Emu team, BAAI and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import json -import os -import re -from typing import Dict, Optional - -import requests -import torch -from accelerate import init_empty_weights -from PIL import Image - -from transformers import ( - AutoModel, - AutoModelForCausalLM, - AutoTokenizer, - Emu3Config, - Emu3ForConditionalGeneration, - Emu3ImageProcessor, - Emu3Processor, - Emu3TextConfig, - GenerationConfig, -) -from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode - - -""" -Sample usage: - -``` -python src/transformers/models/emu3/convert_emu3_weights_to_hf.py \ - --vq_model_id BAAI/Emu3-VisionTokenizer --llm_model_id BAAI/Emu3-Chat --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import Emu3ForConditionalGeneration, Emu3Processor - -model = Emu3ForConditionalGeneration.from_pretrained("/output/path") -processor = Emu3Processor.from_pretrained("/output/path") -``` - -""" - - -byte_encoder = bytes_to_unicode() -CHAT_TEMPLATE = "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}" - - -# Tiktoken to HF conversion, thanks for Xenova -def token_bytes_to_string(b): - return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) - - -# Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960 -def bpe(mergeable_ranks: Dict[bytes, int], token: bytes, max_rank: Optional[int] = None): - parts = [bytes([b]) for b in token] - while True: - min_idx = None - min_rank = None - for i, pair in enumerate(zip(parts[:-1], parts[1:])): - rank = mergeable_ranks.get(pair[0] + pair[1]) - if rank is not None and (min_rank is None or rank < min_rank): - min_idx = i - min_rank = rank - if min_rank is None or (max_rank is not None and min_rank >= max_rank): - break - assert min_idx is not None - parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :] - return parts - - -def generate_vocab_and_merges(encoder): - mergeable_ranks = encoder._mergeable_ranks - - merges = [] - vocab = {} - for token, rank in mergeable_ranks.items(): - vocab[token_bytes_to_string(token)] = rank - - if len(token) == 1: - continue - merged = tuple(bpe(mergeable_ranks, token, max_rank=rank)) - assert len(merged) == 2 - merges.append(" ".join(map(token_bytes_to_string, merged))) - - # Also add special tokens - vocab.update(encoder._special_tokens) - return vocab, merges - - -def convert_tiktoken(tokenizer, output_dir): - encoder = tokenizer.tokenizer - vocab, merges = generate_vocab_and_merges(encoder) - added_tokens = [ - { - "id": id, - "content": content, - "single_word": False, - "lstrip": False, - "rstrip": False, - "normalized": False, - "special": True, - } - for content, id in encoder._special_tokens.items() - if content != "<|extra_0|>" - ] - - # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer_config.json - tokenizer_config_template = { - "add_prefix_space": False, - "bos_token": "<|extra_203|>", - "clean_up_tokenization_spaces": False, - "eos_token": "<|extra_204|>", - "pad_token": "<|endoftext|>", - } - tokenizer_config_template.update({"tokenizer_class": "GPT2Tokenizer"}) - tokenizer_config_template = dict(sorted(tokenizer_config_template.items(), key=lambda x: x[0])) - - # add placeholder image token by taking one of the reserved tokens - reserved_token_id = vocab["<|extra_0|>"] - vocab[""] = reserved_token_id - del vocab["<|extra_0|>"] - added_tokens.append( - { - "id": reserved_token_id, - "content": "", - "single_word": False, - "lstrip": False, - "rstrip": False, - "normalized": False, - "special": True, - } - ) - - os.makedirs(output_dir, exist_ok=True) - - pre_tokenizer = { - "type": "ByteLevel", - "add_prefix_space": False, - "trim_offsets": True, - "use_regex": True, - } - - # https://huggingface.co/Xenova/gpt2/raw/main/tokenizer.json - tokenizer_template = { - "version": "1.0", - "truncation": None, - "padding": None, - "added_tokens": added_tokens, - "normalizer": None, - "pre_tokenizer": pre_tokenizer, - "post_processor": None, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": True, - "trim_offsets": True, - "use_regex": True, - }, - "model": { - "type": "BPE", - "dropout": None, - "unk_token": None, - "continuing_subword_prefix": "", - "end_of_word_suffix": "", - "fuse_unk": False, - "byte_fallback": False, - "vocab": vocab, - "merges": merges, - }, - } - - # Save to files - with open(os.path.join(output_dir, "vocab.json"), "w", encoding="utf-8") as fp: - json.dump(vocab, fp, indent=2, ensure_ascii=False) - - with open(os.path.join(output_dir, "tokenizer.json"), "w", encoding="utf-8") as fp: - json.dump(tokenizer_template, fp, indent=2, ensure_ascii=False) - - with open(os.path.join(output_dir, "tokenizer_config.json"), "w", encoding="utf-8") as fp: - json.dump(tokenizer_config_template, fp, indent=2, ensure_ascii=False) - - with open(os.path.join(output_dir, "special_tokens_map.json"), "w", encoding="utf-8") as fp: - json.dump( - { - "bos_token": "<|extra_203|>", - "eos_token": "<|extra_204|>", - "pad_token": "<|endoftext|>", - }, - fp, - indent=2, - ensure_ascii=False, - ) - - with open(os.path.join(output_dir, "merges.txt"), "w", encoding="utf-8") as fp: - fp.write("#version: 0.2\n") - fp.write("\n".join(merges)) - - -KEYS_TO_MODIFY_MAPPING = { - "^encoder": "model.vqmodel.encoder", - "^decoder": "model.vqmodel.decoder", - "^post_quant_conv": "model.vqmodel.post_quant_conv", - "^quant_conv": "model.vqmodel.quant_conv", - "^quantize": "model.vqmodel.quantize", - "^model": "text_model.model", - r"lm_head\.weight": "text_model.lm_head.weight", - r"^text_model\.model\.vqmodel": "vqmodel", - # rename QKV proj for the VQ-VAE model because we use SiglipAttention - r"\.q\.": ".q_proj.", - r"\.k\.": ".k_proj.", - r"\.v\.": ".v_proj.", - r"\.proj_out\.": ".out_proj.", - # move the attention norms outside of attention modules - r"mid\.attn_1\.norm\.": "mid.attn_norm.", - r"attn\.0\.norm\.": "attn_norms.0.", - r"attn\.1\.norm\.": "attn_norms.1.", - r"attn\.2\.norm\.": "attn_norms.2.", - r"attn\.3\.norm\.": "attn_norms.3.", - # isolate down/mid/up into separate classes for readability - r"\.down\.": ".down_block.down.", - r"\.up\.": ".up_block.up.", - r"\.mid\.": ".middle_block.", -} - - -def convert_state_dict_to_hf(old_state_dict, new_state_dict): - for key, value in old_state_dict.items(): - # convert conv layers in attn to linear - if ( - any(key.endswith(name) for name in ["q.weight", "k.weight", "v.weight", "proj_out.weight"]) - and value.ndim == 4 - ): - value = value.squeeze() - - for old_pattern, new_pattern in KEYS_TO_MODIFY_MAPPING.items(): - key = re.sub(old_pattern, new_pattern, key) - - new_state_dict[key] = value - return new_state_dict - - -def convert_model(vq_model_id, llm_model_id, output_dir, hub_model_id=None, test_inference=False): - os.makedirs(output_dir, exist_ok=True) - - # Convert and save processor - tokenizer_tiktoken = AutoTokenizer.from_pretrained(llm_model_id, trust_remote_code=True) - convert_tiktoken(tokenizer_tiktoken, output_dir) - extra_special_tokens = extra_special_tokens = { - "image_token": "", - "boi_token": "<|image start|>", - "eoi_token": "<|image end|>", - "image_wrapper_token": "<|image token|>", - "eof_token": "<|extra_201|>", - } - tokenizer_converted = AutoTokenizer.from_pretrained(output_dir, extra_special_tokens=extra_special_tokens) - tokenizer_converted.padding_side = "left" - - image_processor = Emu3ImageProcessor.from_pretrained(vq_model_id) - processor = Emu3Processor(image_processor, tokenizer_converted, chat_template=CHAT_TEMPLATE) - processor.save_pretrained(output_dir) - - # load models - model_llm = AutoModelForCausalLM.from_pretrained( - llm_model_id, - trust_remote_code=True, - ) - model_vqgan = AutoModel.from_pretrained(vq_model_id, trust_remote_code=True) - with open(f"{output_dir}/tokenizer.json", "r") as file: - tokenizer_config = json.load(file) - vocabulary_map = tokenizer_config["model"]["vocab"] - - text_config = Emu3TextConfig( - max_position_embeddings=model_llm.config.max_position_embeddings, - rope_scaling={"rope_type": "default"}, - ) - config = Emu3Config(text_config=text_config, vocabulary_map=vocabulary_map) - - with init_empty_weights(): - model = Emu3ForConditionalGeneration(config=config) - model.generation_config = GenerationConfig( - do_sample=True, - top_k=2048, - max_new_tokens=50_000, - pad_token_id=processor.tokenizer.pad_token_id, - eos_token_id=processor.tokenizer.eos_token_id, - ) - - state_dict = {} - state_dict = convert_state_dict_to_hf(model_llm.state_dict(), state_dict) - state_dict = convert_state_dict_to_hf(model_vqgan.state_dict(), state_dict) - - model.load_state_dict(state_dict, assign=True, strict=True) - model.save_pretrained(output_dir, safe_serialization=True) - - if hub_model_id is not None: - model.push_to_hub(hub_model_id) - processor.push_to_hub(hub_model_id) - - if test_inference and llm_model_id.endswith("Chat"): - # Short inference on a few examples to check if generation makes sense - print("Loading the checkpoint in a Emu3 model...") - print("*" * 100) - model = Emu3ForConditionalGeneration.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto") - processor = Emu3Processor.from_pretrained(output_dir) - - conversation = [ - { - "role": "system", - "content": [ - {"type": "text", "text": "You are a helpful assistant."}, - ], - }, - { - "role": "user", - "content": [ - {"type": "text", "text": "Please tell me about this art work and its artist."}, - {"type": "image"}, - ], - }, - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - - image = Image.open( - requests.get( - "https://uploads4.wikiart.org/images/paul-klee/death-for-the-idea-1915.jpg!Large.jpg", stream=True - ).raw - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.bfloat16) - length = inputs.input_ids.shape[1] - - out = model.generate(**inputs, max_new_tokens=40, do_sample=False) - generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] - - print(f"Generation for single-image: {generated_text}") - print("*" * 100) - elif test_inference and llm_model_id.endswith("Gen"): - processor = Emu3Processor.from_pretrained(output_dir) - model = Emu3ForConditionalGeneration.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto") - - inputs = processor( - text=[ - "a portrait of young girl. masterpiece, film grained, best quality.", - "a dog running under the rain", - ], - padding=True, - return_tensors="pt", - return_for_image_generation=True, - ) - inputs = inputs.to(device="cuda:0", dtype=torch.bfloat16) - - neg_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry." - neg_inputs = processor(text=[neg_prompt] * 2, return_tensors="pt").to(device="cuda:0") - - image_sizes = inputs.pop("image_sizes") - HEIGHT, WIDTH = image_sizes[0] - VISUAL_TOKENS = model.vocabulary_mapping.image_tokens - - def prefix_allowed_tokens_fn(batch_id, input_ids): - height, width = HEIGHT, WIDTH - visual_tokens = VISUAL_TOKENS - image_token_id = processor.tokenizer.encode("<|image token|>", return_tensors="pt")[0].to(model.device) - eoi_token_id = processor.tokenizer.encode("<|image end|>", return_tensors="pt")[0] - eos_token_id = processor.tokenizer.encode("<|extra_204|>", return_tensors="pt")[0] - pad_token_id = processor.tokenizer.encode("<|endoftext|>", return_tensors="pt")[0] - eol_token_id = processor.tokenizer.encode("<|extra_200|>", return_tensors="pt")[0] - eof_token_id = processor.tokenizer.encode("<|extra_201|>", return_tensors="pt")[0] - - position = torch.nonzero(input_ids == image_token_id, as_tuple=True)[0][0] - offset = input_ids.shape[0] - position - if offset % (width + 1) == 0: - return (eol_token_id,) - elif offset == (width + 1) * height + 1: - return (eof_token_id,) - elif offset == (width + 1) * height + 2: - return (eoi_token_id,) - elif offset == (width + 1) * height + 3: - return (eos_token_id,) - elif offset > (width + 1) * height + 3: - return (pad_token_id,) - else: - return visual_tokens - - out = model.generate( - **inputs, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - negative_prompt_ids=neg_inputs.input_ids, - negative_prompt_attention_mask=neg_inputs.attention_mask, - ) - - image = model.decode_image_tokens(out[:, inputs.input_ids.shape[1] :], height=HEIGHT, width=WIDTH) - images = processor.postprocess( - list(image.float()), return_tensors="PIL.Image.Image" - ) # internally we convert to np but it's not supported in bf16 precision - for i, image in enumerate(images["pixel_values"]): - image.save(f"result_{i}.png") - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--vq_model_id", - help="Model ID of Emu3 VQ-VAE on the hub", - default="BAAI/Emu3-VisionTokenizer", - ) - parser.add_argument( - "--llm_model_id", - help="Model ID of Emu3 bacbone LLM on the hub", - default="BAAI/Emu3-Chat", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model", - ) - parser.add_argument( - "--hub_model_id", - help="Model ID in the hub where to push the model.", - ) - parser.add_argument( - "--test_inference", - action="store_true", - help="Whether to load the model for generation to test it's converted correctly.", - ) - args = parser.parse_args() - convert_model( - vq_model_id=args.vq_model_id, - llm_model_id=args.llm_model_id, - output_dir=args.output_dir, - hub_model_id=args.hub_model_id, - test_inference=args.test_inference, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py b/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py deleted file mode 100644 index f1fb016870..0000000000 --- a/src/transformers/models/encodec/convert_encodec_checkpoint_to_pytorch.py +++ /dev/null @@ -1,365 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert EnCodec checkpoints.""" - -import argparse - -import torch - -from transformers import ( - EncodecConfig, - EncodecFeatureExtractor, - EncodecModel, - logging, -) - - -# checkpoints downloaded from: -# https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th -# https://huggingface.co/facebook/musicgen-small/resolve/main/compression_state_dict.bin -# https://dl.fbaipublicfiles.com/encodec/v0/encodec_48khz-7e698e3e.th - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.encodec") - -MAPPING_QUANTIZER = { - "quantizer.vq.layers.*._codebook.inited": "quantizer.layers.*.codebook.inited", - "quantizer.vq.layers.*._codebook.cluster_size": "quantizer.layers.*.codebook.cluster_size", - "quantizer.vq.layers.*._codebook.embed": "quantizer.layers.*.codebook.embed", - "quantizer.vq.layers.*._codebook.embed_avg": "quantizer.layers.*.codebook.embed_avg", -} -MAPPING_ENCODER = { - "encoder.model.0.conv.conv": "encoder.layers.0.conv", - "encoder.model.1.block.1.conv.conv": "encoder.layers.1.block.1.conv", - "encoder.model.1.block.3.conv.conv": "encoder.layers.1.block.3.conv", - "encoder.model.1.shortcut.conv.conv": "encoder.layers.1.shortcut.conv", - "encoder.model.3.conv.conv": "encoder.layers.3.conv", - "encoder.model.4.block.1.conv.conv": "encoder.layers.4.block.1.conv", - "encoder.model.4.block.3.conv.conv": "encoder.layers.4.block.3.conv", - "encoder.model.4.shortcut.conv.conv": "encoder.layers.4.shortcut.conv", - "encoder.model.6.conv.conv": "encoder.layers.6.conv", - "encoder.model.7.block.1.conv.conv": "encoder.layers.7.block.1.conv", - "encoder.model.7.block.3.conv.conv": "encoder.layers.7.block.3.conv", - "encoder.model.7.shortcut.conv.conv": "encoder.layers.7.shortcut.conv", - "encoder.model.9.conv.conv": "encoder.layers.9.conv", - "encoder.model.10.block.1.conv.conv": "encoder.layers.10.block.1.conv", - "encoder.model.10.block.3.conv.conv": "encoder.layers.10.block.3.conv", - "encoder.model.10.shortcut.conv.conv": "encoder.layers.10.shortcut.conv", - "encoder.model.12.conv.conv": "encoder.layers.12.conv", - "encoder.model.13.lstm": "encoder.layers.13.lstm", - "encoder.model.15.conv.conv": "encoder.layers.15.conv", -} -MAPPING_ENCODER_48K = { - "encoder.model.0.conv.norm": "encoder.layers.0.norm", - "encoder.model.1.block.1.conv.norm": "encoder.layers.1.block.1.norm", - "encoder.model.1.block.3.conv.norm": "encoder.layers.1.block.3.norm", - "encoder.model.1.shortcut.conv.norm": "encoder.layers.1.shortcut.norm", - "encoder.model.3.conv.norm": "encoder.layers.3.norm", - "encoder.model.4.block.1.conv.norm": "encoder.layers.4.block.1.norm", - "encoder.model.4.block.3.conv.norm": "encoder.layers.4.block.3.norm", - "encoder.model.4.shortcut.conv.norm": "encoder.layers.4.shortcut.norm", - "encoder.model.6.conv.norm": "encoder.layers.6.norm", - "encoder.model.7.block.1.conv.norm": "encoder.layers.7.block.1.norm", - "encoder.model.7.block.3.conv.norm": "encoder.layers.7.block.3.norm", - "encoder.model.7.shortcut.conv.norm": "encoder.layers.7.shortcut.norm", - "encoder.model.9.conv.norm": "encoder.layers.9.norm", - "encoder.model.10.block.1.conv.norm": "encoder.layers.10.block.1.norm", - "encoder.model.10.block.3.conv.norm": "encoder.layers.10.block.3.norm", - "encoder.model.10.shortcut.conv.norm": "encoder.layers.10.shortcut.norm", - "encoder.model.12.conv.norm": "encoder.layers.12.norm", - "encoder.model.15.conv.norm": "encoder.layers.15.norm", -} -MAPPING_DECODER = { - "decoder.model.0.conv.conv": "decoder.layers.0.conv", - "decoder.model.1.lstm": "decoder.layers.1.lstm", - "decoder.model.3.convtr.convtr": "decoder.layers.3.conv", - "decoder.model.4.block.1.conv.conv": "decoder.layers.4.block.1.conv", - "decoder.model.4.block.3.conv.conv": "decoder.layers.4.block.3.conv", - "decoder.model.4.shortcut.conv.conv": "decoder.layers.4.shortcut.conv", - "decoder.model.6.convtr.convtr": "decoder.layers.6.conv", - "decoder.model.7.block.1.conv.conv": "decoder.layers.7.block.1.conv", - "decoder.model.7.block.3.conv.conv": "decoder.layers.7.block.3.conv", - "decoder.model.7.shortcut.conv.conv": "decoder.layers.7.shortcut.conv", - "decoder.model.9.convtr.convtr": "decoder.layers.9.conv", - "decoder.model.10.block.1.conv.conv": "decoder.layers.10.block.1.conv", - "decoder.model.10.block.3.conv.conv": "decoder.layers.10.block.3.conv", - "decoder.model.10.shortcut.conv.conv": "decoder.layers.10.shortcut.conv", - "decoder.model.12.convtr.convtr": "decoder.layers.12.conv", - "decoder.model.13.block.1.conv.conv": "decoder.layers.13.block.1.conv", - "decoder.model.13.block.3.conv.conv": "decoder.layers.13.block.3.conv", - "decoder.model.13.shortcut.conv.conv": "decoder.layers.13.shortcut.conv", - "decoder.model.15.conv.conv": "decoder.layers.15.conv", -} -MAPPING_DECODER_48K = { - "decoder.model.0.conv.norm": "decoder.layers.0.norm", - "decoder.model.3.convtr.norm": "decoder.layers.3.norm", - "decoder.model.4.block.1.conv.norm": "decoder.layers.4.block.1.norm", - "decoder.model.4.block.3.conv.norm": "decoder.layers.4.block.3.norm", - "decoder.model.4.shortcut.conv.norm": "decoder.layers.4.shortcut.norm", - "decoder.model.6.convtr.norm": "decoder.layers.6.norm", - "decoder.model.7.block.1.conv.norm": "decoder.layers.7.block.1.norm", - "decoder.model.7.block.3.conv.norm": "decoder.layers.7.block.3.norm", - "decoder.model.7.shortcut.conv.norm": "decoder.layers.7.shortcut.norm", - "decoder.model.9.convtr.norm": "decoder.layers.9.norm", - "decoder.model.10.block.1.conv.norm": "decoder.layers.10.block.1.norm", - "decoder.model.10.block.3.conv.norm": "decoder.layers.10.block.3.norm", - "decoder.model.10.shortcut.conv.norm": "decoder.layers.10.shortcut.norm", - "decoder.model.12.convtr.norm": "decoder.layers.12.norm", - "decoder.model.13.block.1.conv.norm": "decoder.layers.13.block.1.norm", - "decoder.model.13.block.3.conv.norm": "decoder.layers.13.block.3.norm", - "decoder.model.13.shortcut.conv.norm": "decoder.layers.13.shortcut.norm", - "decoder.model.15.conv.norm": "decoder.layers.15.norm", -} -MAPPING_24K = { - **MAPPING_QUANTIZER, - **MAPPING_ENCODER, - **MAPPING_DECODER, -} -MAPPING_48K = { - **MAPPING_QUANTIZER, - **MAPPING_ENCODER, - **MAPPING_ENCODER_48K, - **MAPPING_DECODER, - **MAPPING_DECODER_48K, -} -TOP_LEVEL_KEYS = [] -IGNORE_KEYS = [] - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - if hf_shape != value.shape: - raise ValueError( - f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - elif weight_type == "running_mean": - hf_pointer.running_mean.data = value - elif weight_type == "running_var": - hf_pointer.running_var.data = value - elif weight_type == "num_batches_tracked": - hf_pointer.num_batches_tracked.data = value - elif weight_type == "weight_ih_l0": - hf_pointer.weight_ih_l0.data = value - elif weight_type == "weight_hh_l0": - hf_pointer.weight_hh_l0.data = value - elif weight_type == "bias_ih_l0": - hf_pointer.bias_ih_l0.data = value - elif weight_type == "bias_hh_l0": - hf_pointer.bias_hh_l0.data = value - elif weight_type == "weight_ih_l1": - hf_pointer.weight_ih_l1.data = value - elif weight_type == "weight_hh_l1": - hf_pointer.weight_hh_l1.data = value - elif weight_type == "bias_ih_l1": - hf_pointer.bias_ih_l1.data = value - elif weight_type == "bias_hh_l1": - hf_pointer.bias_hh_l1.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.") - - -def should_ignore(name, ignore_keys): - for key in ignore_keys: - if key.endswith(".*"): - if name.startswith(key[:-1]): - return True - elif ".*." in key: - prefix, suffix = key.split(".*.") - if prefix in name and suffix in name: - return True - elif key in name: - return True - return False - - -def recursively_load_weights(orig_dict, hf_model, model_name): - unused_weights = [] - - if model_name in ["encodec_24khz", "encodec_32khz"]: - MAPPING = MAPPING_24K - elif model_name == "encodec_48khz": - MAPPING = MAPPING_48K - else: - raise ValueError(f"Unsupported model: {model_name}") - - for name, value in orig_dict.items(): - if should_ignore(name, IGNORE_KEYS): - logger.info(f"{name} was ignored") - continue - - is_used = False - for key, mapped_key in MAPPING.items(): - if "*" in key: - prefix, suffix = key.split(".*.") - if prefix in name and suffix in name: - key = suffix - - if key in name: - # HACK otherwise .embed gets initialized with .embed_avg too - if key.endswith("embed") and name.endswith("embed_avg"): - continue - - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight_ih_l0" in name: - weight_type = "weight_ih_l0" - elif "weight_hh_l0" in name: - weight_type = "weight_hh_l0" - elif "bias_ih_l0" in name: - weight_type = "bias_ih_l0" - elif "bias_hh_l0" in name: - weight_type = "bias_hh_l0" - elif "weight_ih_l1" in name: - weight_type = "weight_ih_l1" - elif "weight_hh_l1" in name: - weight_type = "weight_hh_l1" - elif "bias_ih_l1" in name: - weight_type = "bias_ih_l1" - elif "bias_hh_l1" in name: - weight_type = "bias_hh_l1" - elif "bias" in name: - weight_type = "bias" - elif "weight" in name: - weight_type = "weight" - elif "running_mean" in name: - weight_type = "running_mean" - elif "running_var" in name: - weight_type = "running_var" - elif "num_batches_tracked" in name: - weight_type = "num_batches_tracked" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -@torch.no_grad() -def convert_checkpoint( - model_name, - checkpoint_path, - pytorch_dump_folder_path, - config_path=None, - repo_id=None, -): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = EncodecConfig.from_pretrained(config_path) - else: - config = EncodecConfig() - - if model_name == "encodec_24khz": - pass # config is already correct - elif model_name == "encodec_32khz": - config.upsampling_ratios = [8, 5, 4, 4] - config.target_bandwidths = [2.2] - config.num_filters = 64 - config.sampling_rate = 32_000 - config.codebook_size = 2048 - config.use_causal_conv = False - config.normalize = False - config.use_conv_shortcut = False - elif model_name == "encodec_48khz": - config.upsampling_ratios = [8, 5, 4, 2] - config.target_bandwidths = [3.0, 6.0, 12.0, 24.0] - config.sampling_rate = 48_000 - config.audio_channels = 2 - config.use_causal_conv = False - config.norm_type = "time_group_norm" - config.normalize = True - config.chunk_length_s = 1.0 - config.overlap = 0.01 - else: - raise ValueError(f"Unknown model name: {model_name}") - - model = EncodecModel(config) - - feature_extractor = EncodecFeatureExtractor( - feature_size=config.audio_channels, - sampling_rate=config.sampling_rate, - chunk_length_s=config.chunk_length_s, - overlap=config.overlap, - ) - feature_extractor.save_pretrained(pytorch_dump_folder_path) - - original_checkpoint = torch.load(checkpoint_path, weights_only=True) - if "best_state" in original_checkpoint: - # we might have a training state saved, in which case discard the yaml results and just retain the weights - original_checkpoint = original_checkpoint["best_state"] - recursively_load_weights(original_checkpoint, model, model_name) - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - feature_extractor.push_to_hub(repo_id) - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", - default="encodec_24khz", - type=str, - help="The model to convert. Should be one of 'encodec_24khz', 'encodec_32khz', 'encodec_48khz'.", - ) - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - convert_checkpoint( - args.model, - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.config_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/esm/convert_esm.py b/src/transformers/models/esm/convert_esm.py deleted file mode 100644 index 020dd4e576..0000000000 --- a/src/transformers/models/esm/convert_esm.py +++ /dev/null @@ -1,399 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert ESM checkpoint.""" - -import argparse -import pathlib -from pathlib import Path -from tempfile import TemporaryDirectory - -import esm as esm_module -import torch -from esm.esmfold.v1.misc import batch_encode_sequences as esmfold_encode_sequences -from esm.esmfold.v1.pretrained import esmfold_v1 - -from transformers.models.esm.configuration_esm import EsmConfig, EsmFoldConfig -from transformers.models.esm.modeling_esm import ( - EsmForMaskedLM, - EsmForSequenceClassification, - EsmIntermediate, - EsmLayer, - EsmOutput, - EsmSelfAttention, - EsmSelfOutput, -) -from transformers.models.esm.modeling_esmfold import EsmForProteinFolding -from transformers.models.esm.tokenization_esm import EsmTokenizer -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SAMPLE_DATA = [ - ( - "protein1", - "MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA", - ), - ("protein2", "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLA"), - ("protein3", "MKTVRQERLKSIRILERSKEPVSGAQLAEELSSRQVIVQDIAYLRSLGYNVATPRGYVLAGG"), - ("protein4", "MKTVRQERLKSIRILERSKEPVSGAQLAEELSSRQVIVQDIAYLRSLGYNVATPRGYVLA"), -] - -MODEL_MAPPING = { - "esm1b_t33_650M_UR50S": esm_module.pretrained.esm1b_t33_650M_UR50S, - "esm1v_t33_650M_UR90S_1": esm_module.pretrained.esm1v_t33_650M_UR90S_1, - "esm1v_t33_650M_UR90S_2": esm_module.pretrained.esm1v_t33_650M_UR90S_2, - "esm1v_t33_650M_UR90S_3": esm_module.pretrained.esm1v_t33_650M_UR90S_3, - "esm1v_t33_650M_UR90S_4": esm_module.pretrained.esm1v_t33_650M_UR90S_4, - "esm1v_t33_650M_UR90S_5": esm_module.pretrained.esm1v_t33_650M_UR90S_5, - "esm2_t48_15B_UR50D": esm_module.pretrained.esm2_t48_15B_UR50D, - "esm2_t36_3B_UR50D": esm_module.pretrained.esm2_t36_3B_UR50D, - "esm2_t33_650M_UR50D": esm_module.pretrained.esm2_t33_650M_UR50D, - "esm2_t30_150M_UR50D": esm_module.pretrained.esm2_t30_150M_UR50D, - "esm2_t12_35M_UR50D": esm_module.pretrained.esm2_t12_35M_UR50D, - "esm2_t6_8M_UR50D": esm_module.pretrained.esm2_t6_8M_UR50D, - "esmfold_v1": esmfold_v1, -} - -restypes = list("ARNDCQEGHILKMFPSTWYV") - -restypes_with_x = restypes + ["X"] -restypes_with_extras = restypes_with_x + ["", "", "", "", ""] - - -def get_esmfold_tokenizer(): - with TemporaryDirectory() as tempdir: - vocab = "\n".join(restypes_with_extras) - vocab_file = Path(tempdir) / "vocab.txt" - vocab_file.write_text(vocab) - hf_tokenizer = EsmTokenizer(vocab_file=str(vocab_file)) - hf_tokenizer.pad_token_id = 0 # Overlaps with 'A' but that seems to be what they want - return hf_tokenizer - - -def transfer_and_check_weights(original_module, our_module): - status = our_module.load_state_dict(original_module.state_dict()) - if status.missing_keys: - raise ValueError(f"Missing keys: {status.missing_keys}") - if status.unexpected_keys: - raise ValueError(f"Unexpected keys: {status.unexpected_keys}") - - -def convert_esm_checkpoint_to_pytorch( - model: str, pytorch_dump_folder_path: str, classification_head: bool, push_to_repo: str, auth_token: str -): - """ - Copy/paste/tweak esm's weights to our BERT structure. - """ - if model.startswith("esmfold"): - esm = MODEL_MAPPING[model]() - else: - esm, alphabet = MODEL_MAPPING[model]() - esm.eval() # disable dropout - - if model.startswith("esmfold"): - embed_dim = esm.esm.embed_dim - num_layers = esm.esm.num_layers - num_attention_heads = esm.esm.attention_heads - intermediate_size = 4 * embed_dim - token_dropout = esm.esm.token_dropout - emb_layer_norm_before = False # This code path does not exist in ESM-2 - position_embedding_type = "rotary" - is_folding_model = True - esmfold_config = EsmFoldConfig() - for key, val in esm.cfg.items(): - if hasattr(esmfold_config, key) and key != "trunk": - setattr(esmfold_config, key, val) - for key, val in esm.cfg.trunk.items(): - if hasattr(esmfold_config.trunk, key) and key != "structure_module": - setattr(esmfold_config.trunk, key, val) - for key, val in esm.cfg.trunk.structure_module.items(): - if hasattr(esmfold_config.trunk.structure_module, key): - setattr(esmfold_config.trunk.structure_module, key, val) - elif hasattr(esm, "args"): - # Indicates an ESM-1b or ESM-1v model - embed_dim = esm.args.embed_dim - num_layers = esm.args.layers - num_attention_heads = esm.args.attention_heads - intermediate_size = esm.args.ffn_embed_dim - token_dropout = esm.args.token_dropout - emb_layer_norm_before = True if esm.emb_layer_norm_before else False - position_embedding_type = "absolute" - is_folding_model = False - esmfold_config = None - else: - # Indicates an ESM-2 model - embed_dim = esm.embed_dim - num_layers = esm.num_layers - num_attention_heads = esm.attention_heads - intermediate_size = 4 * embed_dim # This is hardcoded in ESM-2 - token_dropout = esm.token_dropout - emb_layer_norm_before = False # This code path does not exist in ESM-2 - position_embedding_type = "rotary" - is_folding_model = False - esmfold_config = None - - if is_folding_model: - alphabet = esm.esm.alphabet - vocab_list = tuple(alphabet.all_toks) - mask_token_id = alphabet.mask_idx - pad_token_id = alphabet.padding_idx - - if is_folding_model: - original_esm_model = esm.esm - else: - original_esm_model = esm - - config = EsmConfig( - vocab_size=original_esm_model.embed_tokens.num_embeddings, - mask_token_id=mask_token_id, - hidden_size=embed_dim, - num_hidden_layers=num_layers, - num_attention_heads=num_attention_heads, - intermediate_size=intermediate_size, - max_position_embeddings=1026, - layer_norm_eps=1e-5, # PyTorch default used in fairseq - attention_probs_dropout_prob=0.0, - hidden_dropout_prob=0.0, - pad_token_id=pad_token_id, - emb_layer_norm_before=emb_layer_norm_before, - token_dropout=token_dropout, - position_embedding_type=position_embedding_type, - is_folding_model=is_folding_model, - esmfold_config=esmfold_config, - vocab_list=vocab_list, - ) - if classification_head: - config.num_labels = esm.classification_heads["mnli"].out_proj.weight.shape[0] - print("Our ESM config:", config) - - if model.startswith("esmfold"): - model_class = EsmForProteinFolding - elif classification_head: - model_class = EsmForSequenceClassification - else: - model_class = EsmForMaskedLM - model = model_class(config) - model.eval() - - # Now let's copy all the weights. - # Embeddings - model.esm.embeddings.word_embeddings.weight = original_esm_model.embed_tokens.weight - if position_embedding_type == "absolute": - model.esm.embeddings.position_embeddings.weight = original_esm_model.embed_positions.weight - - if config.emb_layer_norm_before: - model.esm.embeddings.layer_norm.weight = original_esm_model.emb_layer_norm_before.weight - model.esm.embeddings.layer_norm.bias = original_esm_model.emb_layer_norm_before.bias - - model.esm.encoder.emb_layer_norm_after.weight = original_esm_model.emb_layer_norm_after.weight - model.esm.encoder.emb_layer_norm_after.bias = original_esm_model.emb_layer_norm_after.bias - - for i in range(config.num_hidden_layers): - # Encoder: start of layer - layer: EsmLayer = model.esm.encoder.layer[i] - # esm_layer: TransformerSentenceEncoderLayer = original_esm_model.layers[i] - esm_layer = original_esm_model.layers[i] - - # self attention - self_attn: EsmSelfAttention = layer.attention.self - assert ( - esm_layer.self_attn.k_proj.weight.data.shape - == esm_layer.self_attn.q_proj.weight.data.shape - == esm_layer.self_attn.v_proj.weight.data.shape - == torch.Size((config.hidden_size, config.hidden_size)) - ) - - self_attn.query.weight.data = esm_layer.self_attn.q_proj.weight - self_attn.query.bias.data = esm_layer.self_attn.q_proj.bias - self_attn.key.weight.data = esm_layer.self_attn.k_proj.weight - self_attn.key.bias.data = esm_layer.self_attn.k_proj.bias - self_attn.value.weight.data = esm_layer.self_attn.v_proj.weight - self_attn.value.bias.data = esm_layer.self_attn.v_proj.bias - - if getattr(esm_layer.self_attn, "rot_emb", None) is not None: - # Matt: Although inv_freq is not a trainable weight, it is computed at model init and cached. - # During the training of ESM-2 the model was converted to float16 precision, which also converts - # the inv_freq tensor, and the loss of precision remains even if the model is loaded later as float32. - # If we recompute inv_freq without this loss of precision then we will get subtly different rotary - # embeddings, which are enough to cause significant discrepancies in model outputs. To avoid this, - # we make sure the new model copies the data from the old inv_freq. - self_attn.rotary_embeddings.inv_freq.data = esm_layer.self_attn.rot_emb.inv_freq - - # LayerNorm changes for pre-activation - layer.attention.LayerNorm.weight = esm_layer.self_attn_layer_norm.weight - layer.attention.LayerNorm.bias = esm_layer.self_attn_layer_norm.bias - layer.LayerNorm.weight = esm_layer.final_layer_norm.weight - layer.LayerNorm.bias = esm_layer.final_layer_norm.bias - - # self-attention output - self_output: EsmSelfOutput = layer.attention.output - assert self_output.dense.weight.shape == esm_layer.self_attn.out_proj.weight.shape - self_output.dense.weight = esm_layer.self_attn.out_proj.weight - self_output.dense.bias = esm_layer.self_attn.out_proj.bias - - # intermediate - intermediate: EsmIntermediate = layer.intermediate - assert intermediate.dense.weight.shape == esm_layer.fc1.weight.shape - intermediate.dense.weight = esm_layer.fc1.weight - intermediate.dense.bias = esm_layer.fc1.bias - - # output - bert_output: EsmOutput = layer.output - assert bert_output.dense.weight.shape == esm_layer.fc2.weight.shape - bert_output.dense.weight = esm_layer.fc2.weight - bert_output.dense.bias = esm_layer.fc2.bias - # end of layer - - if is_folding_model: - model.esm_s_combine.data = esm.esm_s_combine.data - model.af2_to_esm.data = esm.af2_to_esm.data - transfer_and_check_weights(esm.embedding, model.embedding) - transfer_and_check_weights(esm.esm_s_mlp, model.esm_s_mlp) - transfer_and_check_weights(esm.trunk, model.trunk) - transfer_and_check_weights(esm.distogram_head, model.distogram_head) - transfer_and_check_weights(esm.ptm_head, model.ptm_head) - transfer_and_check_weights(esm.lm_head, model.lm_head) - transfer_and_check_weights(esm.lddt_head, model.lddt_head) - - elif classification_head: - model.classifier.dense.weight = esm.esm.classification_heads["mnli"].dense.weight - model.classifier.dense.bias = esm.classification_heads["mnli"].dense.bias - model.classifier.out_proj.weight = esm.classification_heads["mnli"].out_proj.weight - model.classifier.out_proj.bias = esm.classification_heads["mnli"].out_proj.bias - else: - # LM Head - model.lm_head.dense.weight = esm.lm_head.dense.weight - model.lm_head.dense.bias = esm.lm_head.dense.bias - model.lm_head.layer_norm.weight = esm.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = esm.lm_head.layer_norm.bias - model.lm_head.decoder.weight = esm.lm_head.weight - model.lm_head.bias = esm.lm_head.bias - - # Contact prediction head - transfer_and_check_weights(esm.contact_head, model.esm.contact_head) - - # Prepare data (first 2 sequences from ESMStructuralSplitDataset superfamily / 4) - if is_folding_model: - # Folding models aren't trained on masked inputs and don't like mask tokens. - sample_data = SAMPLE_DATA[:2] - else: - sample_data = SAMPLE_DATA - - if is_folding_model: - hf_tokenizer = get_esmfold_tokenizer() - hf_tokens = hf_tokenizer( - [row[1] for row in sample_data], return_tensors="pt", padding=True, add_special_tokens=False - ) - esmfold_aas, esmfold_mask, _, _, _ = esmfold_encode_sequences([row[1] for row in sample_data]) - success = torch.all(hf_tokens["input_ids"] == esmfold_aas) and torch.all( - hf_tokens["attention_mask"] == esmfold_mask - ) - else: - # Let's check that we get the same results. - batch_converter = alphabet.get_batch_converter() - batch_labels, batch_strs, batch_tokens = batch_converter(sample_data) - # Prepare tokenizer and make sure it matches - with TemporaryDirectory() as tempdir: - vocab = "\n".join(alphabet.all_toks) - vocab_file = Path(tempdir) / "vocab.txt" - vocab_file.write_text(vocab) - hf_tokenizer = EsmTokenizer(vocab_file=str(vocab_file)) - - hf_tokens = hf_tokenizer([row[1] for row in sample_data], return_tensors="pt", padding=True) - success = torch.all(hf_tokens["input_ids"] == batch_tokens) - - print("Do both models tokenizers output the same tokens?", "đŸ”„" if success else "đŸ’©") - if not success: - raise Exception("Tokenization does not match!") - - with torch.no_grad(): - if is_folding_model: - # Let's test the model in parts - # ESMFold always converts the ESM stem to float16, which requires float16 ops - # that don't exist on CPU. Therefore, to test it we need to run it on GPU. However, - # ESMFold is what we in the community call a "big boy" and so we desperately avoid putting both the - # original and the converted model on the GPU at the same time. - their_output = esm.cuda().infer([row[1] for row in sample_data]) - our_output = model.cuda()( - input_ids=hf_tokens["input_ids"].cuda(), attention_mask=hf_tokens["attention_mask"].cuda() - ) - else: - our_output = model(**hf_tokens, output_hidden_states=True) - our_output = our_output["logits"] - if classification_head: - their_output = esm.model.classification_heads["mnli"](esm.extract_features(batch_tokens)) - else: - their_output = esm(hf_tokens["input_ids"], repr_layers=list(range(999))) - their_output = their_output["logits"] - - if is_folding_model: - max_absolute_diff = torch.max(torch.abs(our_output["positions"] - their_output["positions"])).item() - success = torch.allclose(our_output["positions"], their_output["positions"], atol=1e-5) - else: - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - success = torch.allclose(our_output, their_output, atol=1e-5) - - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-5 - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - - if not success: - raise Exception("Something went wRoNg") - - if not is_folding_model: - # Let's check contact prediction too - our_output = model.predict_contacts(hf_tokens["input_ids"], hf_tokens["attention_mask"]) - their_output = esm.predict_contacts(hf_tokens["input_ids"]) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - success = torch.allclose(our_output, their_output, atol=1e-5) - - print("Contact prediction testing:") - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-5 - print("Do both models output the same tensors?", "đŸ”„" if success else "đŸ’©") - - if not success: - raise Exception("Something went wRoNg") - - pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - del esm # Free up some memory before continuing - - print(f"Saving tokenizer to {pytorch_dump_folder_path}") - hf_tokenizer.save_pretrained(pytorch_dump_folder_path) - - if push_to_repo: - model.push_to_hub(repo_id=push_to_repo, token_token=auth_token) - hf_tokenizer.push_to_hub(repo_id=push_to_repo, token_token=auth_token) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--classification_head", action="store_true", help="Whether to convert a final classification head." - ) - parser.add_argument("--model", default=None, type=str, required=True, help="Name of model to convert.") - parser.add_argument("--push_to_repo", type=str, help="Repo to upload to (including username!).") - parser.add_argument("--auth_token", type=str, help="HuggingFace auth token.") - args = parser.parse_args() - convert_esm_checkpoint_to_pytorch( - args.model, args.pytorch_dump_folder_path, args.classification_head, args.push_to_repo, args.auth_token - ) diff --git a/src/transformers/models/falcon/convert_custom_code_checkpoint.py b/src/transformers/models/falcon/convert_custom_code_checkpoint.py deleted file mode 100644 index 0da817c3ff..0000000000 --- a/src/transformers/models/falcon/convert_custom_code_checkpoint.py +++ /dev/null @@ -1,74 +0,0 @@ -import json -from argparse import ArgumentParser -from pathlib import Path - - -""" -This script converts Falcon custom code checkpoints to modern Falcon checkpoints that use code in the Transformers -library. After conversion, performance (especially for generation) should improve and the checkpoint can be loaded -without needing trust_remote_code=True. -""" - -if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument( - "--checkpoint_dir", - type=Path, - required=True, - help="Directory containing a custom code checkpoint to convert to a modern Falcon checkpoint.", - ) - args = parser.parse_args() - - if not args.checkpoint_dir.is_dir(): - raise ValueError("--checkpoint_dir argument should be a directory!") - - if ( - not (args.checkpoint_dir / "configuration_RW.py").is_file() - or not (args.checkpoint_dir / "modelling_RW.py").is_file() - ): - raise ValueError( - "The model directory should contain configuration_RW.py and modelling_RW.py files! Are you sure this is a custom code checkpoint?" - ) - (args.checkpoint_dir / "configuration_RW.py").unlink() - (args.checkpoint_dir / "modelling_RW.py").unlink() - - config = args.checkpoint_dir / "config.json" - text = config.read_text() - text = text.replace("RWForCausalLM", "FalconForCausalLM") - text = text.replace("RefinedWebModel", "falcon") - text = text.replace("RefinedWeb", "falcon") - json_config = json.loads(text) - del json_config["auto_map"] - - if "n_head" in json_config: - json_config["num_attention_heads"] = json_config.pop("n_head") - if "n_layer" in json_config: - json_config["num_hidden_layers"] = json_config.pop("n_layer") - if "n_head_kv" in json_config: - json_config["num_kv_heads"] = json_config.pop("n_head_kv") - json_config["new_decoder_architecture"] = True - else: - json_config["new_decoder_architecture"] = False - bos_token_id = json_config.get("bos_token_id", 1) - eos_token_id = json_config.get("eos_token_id", 2) - config.unlink() - config.write_text(json.dumps(json_config, indent=2, sort_keys=True)) - - tokenizer_config = args.checkpoint_dir / "tokenizer_config.json" - if tokenizer_config.is_file(): - text = tokenizer_config.read_text() - json_config = json.loads(text) - if json_config["tokenizer_class"] == "PreTrainedTokenizerFast": - json_config["model_input_names"] = ["input_ids", "attention_mask"] - tokenizer_config.unlink() - tokenizer_config.write_text(json.dumps(json_config, indent=2, sort_keys=True)) - - generation_config_path = args.checkpoint_dir / "generation_config.json" - generation_dict = { - "_from_model_config": True, - "bos_token_id": bos_token_id, - "eos_token_id": eos_token_id, - "transformers_version": "4.33.0.dev0", - } - generation_config_path.write_text(json.dumps(generation_dict, indent=2, sort_keys=True)) - print("Done! Please double-check that the new checkpoint works as expected.") diff --git a/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 3a5bb2d2e2..0000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,210 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FastSpeech2Conformer checkpoint.""" - -import argparse -import json -import re -from pathlib import Path -from tempfile import TemporaryDirectory - -import torch -import yaml - -from transformers import ( - FastSpeech2ConformerConfig, - FastSpeech2ConformerModel, - FastSpeech2ConformerTokenizer, - logging, -) - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - -CONFIG_MAPPING = { - "adim": "hidden_size", - "aheads": "num_attention_heads", - "conformer_dec_kernel_size": "decoder_kernel_size", - "conformer_enc_kernel_size": "encoder_kernel_size", - "decoder_normalize_before": "decoder_normalize_before", - "dlayers": "decoder_layers", - "dunits": "decoder_linear_units", - "duration_predictor_chans": "duration_predictor_channels", - "duration_predictor_kernel_size": "duration_predictor_kernel_size", - "duration_predictor_layers": "duration_predictor_layers", - "elayers": "encoder_layers", - "encoder_normalize_before": "encoder_normalize_before", - "energy_embed_dropout": "energy_embed_dropout", - "energy_embed_kernel_size": "energy_embed_kernel_size", - "energy_predictor_chans": "energy_predictor_channels", - "energy_predictor_dropout": "energy_predictor_dropout", - "energy_predictor_kernel_size": "energy_predictor_kernel_size", - "energy_predictor_layers": "energy_predictor_layers", - "eunits": "encoder_linear_units", - "pitch_embed_dropout": "pitch_embed_dropout", - "pitch_embed_kernel_size": "pitch_embed_kernel_size", - "pitch_predictor_chans": "pitch_predictor_channels", - "pitch_predictor_dropout": "pitch_predictor_dropout", - "pitch_predictor_kernel_size": "pitch_predictor_kernel_size", - "pitch_predictor_layers": "pitch_predictor_layers", - "positionwise_conv_kernel_size": "positionwise_conv_kernel_size", - "postnet_chans": "speech_decoder_postnet_units", - "postnet_filts": "speech_decoder_postnet_kernel", - "postnet_layers": "speech_decoder_postnet_layers", - "reduction_factor": "reduction_factor", - "stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor", - "stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor", - "transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate", - "transformer_dec_dropout_rate": "decoder_dropout_rate", - "transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate", - "transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate", - "transformer_enc_dropout_rate": "encoder_dropout_rate", - "transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate", - "use_cnn_in_conformer": "use_cnn_in_conformer", - "use_macaron_style_in_conformer": "use_macaron_style_in_conformer", - "use_masking": "use_masking", - "use_weighted_masking": "use_weighted_masking", - "idim": "input_dim", - "odim": "num_mel_bins", - "spk_embed_dim": "speaker_embed_dim", - "langs": "num_languages", - "spks": "num_speakers", -} - - -def remap_model_yaml_config(yaml_config_path): - with Path(yaml_config_path).open("r", encoding="utf-8") as f: - args = yaml.safe_load(f) - args = argparse.Namespace(**args) - - remapped_config = {} - - model_params = args.tts_conf["text2mel_params"] - # espnet_config_key -> hf_config_key, any keys not included are ignored - for espnet_config_key, hf_config_key in CONFIG_MAPPING.items(): - if espnet_config_key in model_params: - remapped_config[hf_config_key] = model_params[espnet_config_key] - - return remapped_config, args.g2p, args.token_list - - -def convert_espnet_state_dict_to_hf(state_dict): - new_state_dict = {} - for key in state_dict: - if "tts.generator.text2mel." in key: - new_key = key.replace("tts.generator.text2mel.", "") - if "postnet" in key: - new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers") - new_key = new_key.replace(".0.weight", ".conv.weight") - new_key = new_key.replace(".1.weight", ".batch_norm.weight") - new_key = new_key.replace(".1.bias", ".batch_norm.bias") - new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean") - new_key = new_key.replace(".1.running_var", ".batch_norm.running_var") - new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked") - if "feat_out" in key: - if "weight" in key: - new_key = "speech_decoder_postnet.feat_out.weight" - if "bias" in key: - new_key = "speech_decoder_postnet.feat_out.bias" - if "encoder.embed.0.weight" in key: - new_key = new_key.replace("0.", "") - if "w_1" in key: - new_key = new_key.replace("w_1", "conv1") - if "w_2" in key: - new_key = new_key.replace("w_2", "conv2") - if "predictor.conv" in key: - new_key = new_key.replace(".conv", ".conv_layers") - pattern = r"(\d)\.(\d)" - replacement = ( - r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm" - ) - new_key = re.sub(pattern, replacement, new_key) - if "pitch_embed" in key or "energy_embed" in key: - new_key = new_key.replace("0", "conv") - if "encoders" in key: - new_key = new_key.replace("encoders", "conformer_layers") - new_key = new_key.replace("norm_final", "final_layer_norm") - new_key = new_key.replace("norm_mha", "self_attn_layer_norm") - new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm") - new_key = new_key.replace("norm_ff", "ff_layer_norm") - new_key = new_key.replace("norm_conv", "conv_layer_norm") - if "lid_emb" in key: - new_key = new_key.replace("lid_emb", "language_id_embedding") - if "sid_emb" in key: - new_key = new_key.replace("sid_emb", "speaker_id_embedding") - - new_state_dict[new_key] = state_dict[key] - - return new_state_dict - - -@torch.no_grad() -def convert_FastSpeech2ConformerModel_checkpoint( - checkpoint_path, - yaml_config_path, - pytorch_dump_folder_path, - repo_id=None, -): - model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path) - config = FastSpeech2ConformerConfig(**model_params) - - # Prepare the model - model = FastSpeech2ConformerModel(config) - - espnet_checkpoint = torch.load(checkpoint_path, weights_only=True) - hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) - - model.load_state_dict(hf_compatible_state_dict) - - model.save_pretrained(pytorch_dump_folder_path) - - # Prepare the tokenizer - with TemporaryDirectory() as tempdir: - vocab = {token: id for id, token in enumerate(vocab)} - vocab_file = Path(tempdir) / "vocab.json" - with open(vocab_file, "w") as f: - json.dump(vocab, f) - should_strip_spaces = "no_space" in tokenizer_name - tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces) - - tokenizer.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - model.push_to_hub(repo_id) - tokenizer.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument( - "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" - ) - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - convert_FastSpeech2ConformerModel_checkpoint( - args.checkpoint_path, - args.yaml_config_path, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_hifigan.py deleted file mode 100644 index 70aada84bd..0000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_hifigan.py +++ /dev/null @@ -1,134 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FastSpeech2Conformer HiFi-GAN checkpoint.""" - -import argparse -from pathlib import Path - -import torch -import yaml - -from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - - -def load_weights(checkpoint, hf_model, config): - vocoder_key_prefix = "tts.generator.vocoder." - checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k} - - hf_model.apply_weight_norm() - - hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"] - hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"] - hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"] - - for i in range(len(config.upsample_rates)): - hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"] - hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"] - hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"] - - for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)): - for j in range(len(config.resblock_dilation_sizes)): - hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"] - hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"] - hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"] - - hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"] - hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"] - hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"] - - hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"] - hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"] - hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"] - - hf_model.remove_weight_norm() - - -def remap_hifigan_yaml_config(yaml_config_path): - with Path(yaml_config_path).open("r", encoding="utf-8") as f: - args = yaml.safe_load(f) - args = argparse.Namespace(**args) - - vocoder_type = args.tts_conf["vocoder_type"] - if vocoder_type != "hifigan_generator": - raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}") - - remapped_dict = {} - vocoder_params = args.tts_conf["vocoder_params"] - - # espnet_config_key -> hf_config_key - key_mappings = { - "channels": "upsample_initial_channel", - "in_channels": "model_in_dim", - "resblock_dilations": "resblock_dilation_sizes", - "resblock_kernel_sizes": "resblock_kernel_sizes", - "upsample_kernel_sizes": "upsample_kernel_sizes", - "upsample_scales": "upsample_rates", - } - for espnet_config_key, hf_config_key in key_mappings.items(): - remapped_dict[hf_config_key] = vocoder_params[espnet_config_key] - remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"] - remapped_dict["normalize_before"] = False - remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"] - - return remapped_dict - - -@torch.no_grad() -def convert_hifigan_checkpoint( - checkpoint_path, - pytorch_dump_folder_path, - yaml_config_path=None, - repo_id=None, -): - if yaml_config_path is not None: - config_kwargs = remap_hifigan_yaml_config(yaml_config_path) - config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) - else: - config = FastSpeech2ConformerHifiGanConfig() - - model = FastSpeech2ConformerHifiGan(config) - - orig_checkpoint = torch.load(checkpoint_path, weights_only=True) - load_weights(orig_checkpoint, model, config) - - model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert") - parser.add_argument( - "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - convert_hifigan_checkpoint( - args.checkpoint_path, - args.pytorch_dump_folder_path, - args.yaml_config_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py deleted file mode 100644 index 6f840438dc..0000000000 --- a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FastSpeech2Conformer checkpoint.""" - -import argparse - -import torch - -from transformers import ( - FastSpeech2ConformerConfig, - FastSpeech2ConformerHifiGan, - FastSpeech2ConformerHifiGanConfig, - FastSpeech2ConformerModel, - FastSpeech2ConformerWithHifiGan, - FastSpeech2ConformerWithHifiGanConfig, - logging, -) - -from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import ( - convert_espnet_state_dict_to_hf, - remap_model_yaml_config, -) -from .convert_hifigan import load_weights, remap_hifigan_yaml_config - - -logging.set_verbosity_info() -logger = logging.get_logger("transformers.models.FastSpeech2Conformer") - - -def convert_FastSpeech2ConformerWithHifiGan_checkpoint( - checkpoint_path, - yaml_config_path, - pytorch_dump_folder_path, - repo_id=None, -): - # Prepare the model - model_params, *_ = remap_model_yaml_config(yaml_config_path) - model_config = FastSpeech2ConformerConfig(**model_params) - - model = FastSpeech2ConformerModel(model_config) - - espnet_checkpoint = torch.load(checkpoint_path, weights_only=True) - hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) - model.load_state_dict(hf_compatible_state_dict) - - # Prepare the vocoder - config_kwargs = remap_hifigan_yaml_config(yaml_config_path) - vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) - - vocoder = FastSpeech2ConformerHifiGan(vocoder_config) - load_weights(espnet_checkpoint, vocoder, vocoder_config) - - # Prepare the model + vocoder - config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config) - with_hifigan_model = FastSpeech2ConformerWithHifiGan(config) - with_hifigan_model.model = model - with_hifigan_model.vocoder = vocoder - - with_hifigan_model.save_pretrained(pytorch_dump_folder_path) - - if repo_id: - print("Pushing to the hub...") - with_hifigan_model.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") - parser.add_argument( - "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" - ) - parser.add_argument( - "--pytorch_dump_folder_path", - required=True, - default=None, - type=str, - help="Path to the output `FastSpeech2ConformerModel` PyTorch model.", - ) - parser.add_argument( - "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the đŸ€— hub." - ) - - args = parser.parse_args() - - convert_FastSpeech2ConformerWithHifiGan_checkpoint( - args.checkpoint_path, - args.yaml_config_path, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/flava/convert_dalle_to_flava_codebook.py b/src/transformers/models/flava/convert_dalle_to_flava_codebook.py deleted file mode 100644 index 6408d0e1df..0000000000 --- a/src/transformers/models/flava/convert_dalle_to_flava_codebook.py +++ /dev/null @@ -1,102 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch - -from transformers import FlavaImageCodebook, FlavaImageCodebookConfig - - -def rreplace(s, old, new, occurrence): - li = s.rsplit(old, occurrence) - return new.join(li) - - -def count_parameters(state_dict): - # encoder.embeddings are double copied in original FLAVA - return sum(param.float().sum() if "encoder.embeddings" not in key else 0 for key, param in state_dict.items()) - - -def upgrade_state_dict(state_dict): - upgrade = {} - - group_keys = ["group_1", "group_2", "group_3", "group_4"] - for key, value in state_dict.items(): - for group_key in group_keys: - if group_key in key: - key = key.replace(f"{group_key}.", f"{group_key}.group.") - - if "res_path" in key: - key = key.replace("res_path.", "res_path.path.") - - if key.endswith(".w"): - key = rreplace(key, ".w", ".weight", 1) - if key.endswith(".b"): - key = rreplace(key, ".b", ".bias", 1) - - upgrade[key] = value.float() - - return upgrade - - -@torch.no_grad() -def convert_dalle_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None, save_checkpoint=True): - """ - Copy/paste/tweak model's weights to transformers design. - """ - from dall_e import Encoder - - encoder = Encoder() - if os.path.exists(checkpoint_path): - ckpt = torch.load(checkpoint_path, weights_only=True) - else: - ckpt = torch.hub.load_state_dict_from_url(checkpoint_path) - - if isinstance(ckpt, Encoder): - ckpt = ckpt.state_dict() - encoder.load_state_dict(ckpt) - - if config_path is not None: - config = FlavaImageCodebookConfig.from_pretrained(config_path) - else: - config = FlavaImageCodebookConfig() - - hf_model = FlavaImageCodebook(config).eval() - state_dict = encoder.state_dict() - - hf_state_dict = upgrade_state_dict(state_dict) - hf_model.load_state_dict(hf_state_dict) - hf_state_dict = hf_model.state_dict() - hf_count = count_parameters(hf_state_dict) - state_dict_count = count_parameters(state_dict) - - assert torch.allclose(hf_count, state_dict_count, atol=1e-3) - - if save_checkpoint: - hf_model.save_pretrained(pytorch_dump_folder_path) - else: - return hf_state_dict - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to flava checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_dalle_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py b/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py deleted file mode 100644 index 8b6e536a3a..0000000000 --- a/src/transformers/models/flava/convert_flava_original_pytorch_to_hf.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding=utf-8 -# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import os - -import torch - -from transformers import FlavaConfig, FlavaForPreTraining -from transformers.models.flava.convert_dalle_to_flava_codebook import convert_dalle_checkpoint - - -def count_parameters(state_dict): - # encoder.embeddings are double copied in original FLAVA - return sum(param.float().sum() if "encoder.embeddings" not in key else 0 for key, param in state_dict.items()) - - -def upgrade_state_dict(state_dict, codebook_state_dict): - upgrade = {} - - for key, value in state_dict.items(): - if "text_encoder.embeddings" in key or "image_encoder.embeddings" in key: - continue - - key = key.replace("heads.cmd.mim_head.cls.predictions", "mmm_image_head") - key = key.replace("heads.cmd.mlm_head.cls.predictions", "mmm_text_head") - key = key.replace("heads.cmd.itm_head.cls", "itm_head") - key = key.replace("heads.cmd.itm_head.pooler", "itm_head.pooler") - key = key.replace("heads.cmd.clip_head.logit_scale", "flava.logit_scale") - key = key.replace("heads.fairseq_mlm.cls.predictions", "mlm_head") - key = key.replace("heads.imagenet.mim_head.cls.predictions", "mim_head") - key = key.replace("mm_text_projection", "flava.text_to_mm_projection") - key = key.replace("mm_image_projection", "flava.image_to_mm_projection") - key = key.replace("image_encoder.module", "flava.image_model") - key = key.replace("text_encoder.module", "flava.text_model") - key = key.replace("mm_encoder.module.encoder.cls_token", "flava.multimodal_model.cls_token") - key = key.replace("mm_encoder.module", "flava.multimodal_model") - key = key.replace("text_projection", "flava.text_projection") - key = key.replace("image_projection", "flava.image_projection") - - upgrade[key] = value.float() - - for key, value in codebook_state_dict.items(): - upgrade[f"image_codebook.{key}"] = value - - return upgrade - - -@torch.no_grad() -def convert_flava_checkpoint(checkpoint_path, codebook_path, pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = FlavaConfig.from_pretrained(config_path) - else: - config = FlavaConfig() - - hf_model = FlavaForPreTraining(config).eval() - - codebook_state_dict = convert_dalle_checkpoint(codebook_path, None, save_checkpoint=False) - - if os.path.exists(checkpoint_path): - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - else: - state_dict = torch.hub.load_state_dict_from_url(checkpoint_path, map_location="cpu") - - hf_state_dict = upgrade_state_dict(state_dict, codebook_state_dict) - hf_model.load_state_dict(hf_state_dict) - hf_state_dict = hf_model.state_dict() - hf_count = count_parameters(hf_state_dict) - state_dict_count = count_parameters(state_dict) + count_parameters(codebook_state_dict) - - assert torch.allclose(hf_count, state_dict_count, atol=1e-3) - - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to flava checkpoint") - parser.add_argument("--codebook_path", default=None, type=str, help="Path to flava codebook checkpoint") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - - convert_flava_checkpoint(args.checkpoint_path, args.codebook_path, args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py b/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py deleted file mode 100644 index 71660354db..0000000000 --- a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FNet checkpoint.""" - -import argparse - -import torch -from flax.training.checkpoints import restore_checkpoint - -from transformers import FNetConfig, FNetForPreTraining -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_flax_checkpoint_to_pytorch(flax_checkpoint_path, fnet_config_file, save_path): - # Initialise PyTorch model - config = FNetConfig.from_json_file(fnet_config_file) - print(f"Building PyTorch model from configuration: {config}") - fnet_pretraining_model = FNetForPreTraining(config) - - checkpoint_dict = restore_checkpoint(flax_checkpoint_path, None) - pretrained_model_params = checkpoint_dict["target"] - - # Embeddings - # Position IDs - state_dict = fnet_pretraining_model.state_dict() - - position_ids = state_dict["fnet.embeddings.position_ids"] - new_state_dict = {"fnet.embeddings.position_ids": position_ids} - # Embedding Layers - new_state_dict["fnet.embeddings.word_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["word"]["embedding"] - ) - new_state_dict["fnet.embeddings.position_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["position"]["embedding"][0] - ) - new_state_dict["fnet.embeddings.token_type_embeddings.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["type"]["embedding"] - ) - new_state_dict["fnet.embeddings.projection.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["hidden_mapping_in"]["kernel"] - ).T - new_state_dict["fnet.embeddings.projection.bias"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["hidden_mapping_in"]["bias"] - ) - new_state_dict["fnet.embeddings.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["layer_norm"]["scale"] - ) - new_state_dict["fnet.embeddings.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["layer_norm"]["bias"] - ) - - # Encoder Layers - for layer in range(config.num_hidden_layers): - new_state_dict[f"fnet.encoder.layer.{layer}.fourier.output.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["mixing_layer_norm"]["scale"] - ) - new_state_dict[f"fnet.encoder.layer.{layer}.fourier.output.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["mixing_layer_norm"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.intermediate.dense.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["intermediate"]["kernel"] - ).T - new_state_dict[f"fnet.encoder.layer.{layer}.intermediate.dense.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["intermediate"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.output.dense.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["output"]["kernel"] - ).T - new_state_dict[f"fnet.encoder.layer.{layer}.output.dense.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"feed_forward_{layer}"]["output"]["bias"] - ) - - new_state_dict[f"fnet.encoder.layer.{layer}.output.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["output_layer_norm"]["scale"] - ) - new_state_dict[f"fnet.encoder.layer.{layer}.output.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["encoder"][f"encoder_{layer}"]["output_layer_norm"]["bias"] - ) - - # Pooler Layers - new_state_dict["fnet.pooler.dense.weight"] = torch.tensor(pretrained_model_params["encoder"]["pooler"]["kernel"]).T - new_state_dict["fnet.pooler.dense.bias"] = torch.tensor(pretrained_model_params["encoder"]["pooler"]["bias"]) - - # Masked LM Layers - new_state_dict["cls.predictions.transform.dense.weight"] = torch.tensor( - pretrained_model_params["predictions_dense"]["kernel"] - ).T - new_state_dict["cls.predictions.transform.dense.bias"] = torch.tensor( - pretrained_model_params["predictions_dense"]["bias"] - ) - new_state_dict["cls.predictions.transform.LayerNorm.weight"] = torch.tensor( - pretrained_model_params["predictions_layer_norm"]["scale"] - ) - new_state_dict["cls.predictions.transform.LayerNorm.bias"] = torch.tensor( - pretrained_model_params["predictions_layer_norm"]["bias"] - ) - new_state_dict["cls.predictions.decoder.weight"] = torch.tensor( - pretrained_model_params["encoder"]["embedder"]["word"]["embedding"] - ) - new_state_dict["cls.predictions.decoder.bias"] = torch.tensor( - pretrained_model_params["predictions_output"]["output_bias"] - ) - new_state_dict["cls.predictions.bias"] = torch.tensor(pretrained_model_params["predictions_output"]["output_bias"]) - - # Seq Relationship Layers - new_state_dict["cls.seq_relationship.weight"] = torch.tensor( - pretrained_model_params["classification"]["output_kernel"] - ) - new_state_dict["cls.seq_relationship.bias"] = torch.tensor( - pretrained_model_params["classification"]["output_bias"] - ) - - # Load State Dict - fnet_pretraining_model.load_state_dict(new_state_dict) - - # Save PreTrained - print(f"Saving pretrained model to {save_path}") - fnet_pretraining_model.save_pretrained(save_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--flax_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--fnet_config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained FNet model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument("--save_path", default=None, type=str, required=True, help="Path to the output model.") - args = parser.parse_args() - convert_flax_checkpoint_to_pytorch(args.flax_checkpoint_path, args.fnet_config_file, args.save_path) diff --git a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py b/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py deleted file mode 100644 index 4aed159280..0000000000 --- a/src/transformers/models/focalnet/convert_focalnet_to_hf_format.py +++ /dev/null @@ -1,237 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert FocalNet checkpoints from the original repository. URL: https://github.com/microsoft/FocalNet/tree/main""" - -import argparse -import json - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, FocalNetConfig, FocalNetForImageClassification -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling - - -def get_focalnet_config(model_name): - depths = [2, 2, 6, 2] if "tiny" in model_name else [2, 2, 18, 2] - use_conv_embed = True if "large" in model_name or "huge" in model_name else False - use_post_layernorm = True if "large" in model_name or "huge" in model_name else False - use_layerscale = True if "large" in model_name or "huge" in model_name else False - - if "large" in model_name or "xlarge" in model_name or "huge" in model_name: - if "fl3" in model_name: - focal_levels = [3, 3, 3, 3] - focal_windows = [5, 5, 5, 5] - elif "fl4" in model_name: - focal_levels = [4, 4, 4, 4] - focal_windows = [3, 3, 3, 3] - - if "tiny" in model_name or "small" in model_name or "base" in model_name: - focal_windows = [3, 3, 3, 3] - if "lrf" in model_name: - focal_levels = [3, 3, 3, 3] - else: - focal_levels = [2, 2, 2, 2] - - if "tiny" in model_name: - embed_dim = 96 - elif "small" in model_name: - embed_dim = 96 - elif "base" in model_name: - embed_dim = 128 - elif "large" in model_name: - embed_dim = 192 - elif "xlarge" in model_name: - embed_dim = 256 - elif "huge" in model_name: - embed_dim = 352 - - # set label information - repo_id = "huggingface/label-files" - if "large" in model_name or "huge" in model_name: - filename = "imagenet-22k-id2label.json" - else: - filename = "imagenet-1k-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - label2id = {v: k for k, v in id2label.items()} - - config = FocalNetConfig( - embed_dim=embed_dim, - depths=depths, - focal_levels=focal_levels, - focal_windows=focal_windows, - use_conv_embed=use_conv_embed, - id2label=id2label, - label2id=label2id, - use_post_layernorm=use_post_layernorm, - use_layerscale=use_layerscale, - ) - - return config - - -def rename_key(name): - if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") - if "patch_embed.norm" in name: - name = name.replace("patch_embed.norm", "embeddings.norm") - if "layers" in name: - name = "encoder." + name - if "encoder.layers" in name: - name = name.replace("encoder.layers", "encoder.stages") - if "downsample.proj" in name: - name = name.replace("downsample.proj", "downsample.projection") - if "blocks" in name: - name = name.replace("blocks", "layers") - if "modulation.f.weight" in name or "modulation.f.bias" in name: - name = name.replace("modulation.f", "modulation.projection_in") - if "modulation.h.weight" in name or "modulation.h.bias" in name: - name = name.replace("modulation.h", "modulation.projection_context") - if "modulation.proj.weight" in name or "modulation.proj.bias" in name: - name = name.replace("modulation.proj", "modulation.projection_out") - - if name == "norm.weight": - name = "layernorm.weight" - if name == "norm.bias": - name = "layernorm.bias" - - if "head" in name: - name = name.replace("head", "classifier") - else: - name = "focalnet." + name - - return name - - -def convert_focalnet_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - # fmt: off - model_name_to_url = { - "focalnet-tiny": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_tiny_srf.pth", - "focalnet-tiny-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_tiny_lrf.pth", - "focalnet-small": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_small_srf.pth", - "focalnet-small-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_small_lrf.pth", - "focalnet-base": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_srf.pth", - "focalnet-base-lrf": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_base_lrf.pth", - "focalnet-large-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384.pth", - "focalnet-large-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_large_lrf_384_fl4.pth", - "focalnet-xlarge-lrf-fl3": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384.pth", - "focalnet-xlarge-lrf-fl4": "https://projects4jw.blob.core.windows.net/focalnet/release/classification/focalnet_xlarge_lrf_384_fl4.pth", - } - # fmt: on - - checkpoint_url = model_name_to_url[model_name] - print("Checkpoint URL: ", checkpoint_url) - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - - # rename keys - for key in state_dict.copy().keys(): - val = state_dict.pop(key) - state_dict[rename_key(key)] = val - - config = get_focalnet_config(model_name) - model = FocalNetForImageClassification(config) - model.eval() - - # load state dict - model.load_state_dict(state_dict) - - # verify conversion - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - - processor = BitImageProcessor( - do_resize=True, - size={"shortest_edge": 256}, - resample=PILImageResampling.BILINEAR, - do_center_crop=True, - crop_size=224, - do_normalize=True, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - image = Image.open(requests.get(url, stream=True).raw) - inputs = processor(images=image, return_tensors="pt") - - image_transforms = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - - original_pixel_values = image_transforms(image).unsqueeze(0) - - # verify pixel_values - assert torch.allclose(inputs.pixel_values, original_pixel_values, atol=1e-4) - - outputs = model(**inputs) - - predicted_class_idx = outputs.logits.argmax(-1).item() - print("Predicted class:", model.config.id2label[predicted_class_idx]) - - print("First values of logits:", outputs.logits[0, :3]) - - if model_name == "focalnet-tiny": - expected_slice = torch.tensor([0.2166, -0.4368, 0.2191]) - elif model_name == "focalnet-tiny-lrf": - expected_slice = torch.tensor([1.1669, 0.0125, -0.1695]) - elif model_name == "focalnet-small": - expected_slice = torch.tensor([0.4917, -0.0430, 0.1341]) - elif model_name == "focalnet-small-lrf": - expected_slice = torch.tensor([-0.2588, -0.5342, -0.2331]) - elif model_name == "focalnet-base": - expected_slice = torch.tensor([-0.1655, -0.4090, -0.1730]) - elif model_name == "focalnet-base-lrf": - expected_slice = torch.tensor([0.5306, -0.0483, -0.3928]) - assert torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor of {model_name} to the hub...") - model.push_to_hub(f"{model_name}") - processor.push_to_hub(f"{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="focalnet-tiny", - type=str, - help="Name of the FocalNet model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub.", - ) - - args = parser.parse_args() - convert_focalnet_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100755 index c86afddc79..0000000000 --- a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,280 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: if you intend to run this script make sure you look under scripts/fsmt/ -# to locate the appropriate script to do the work correctly. There is a set of scripts to: -# - download and prepare data and run the conversion script -# - perform eval to get the best hparam into the config -# - generate model_cards - useful if you have multiple models from the same paper - -import argparse -import json -import os -import re -from collections import OrderedDict -from os.path import basename, dirname - -import fairseq -import torch -from fairseq import hub_utils -from fairseq.data.dictionary import Dictionary - -from transformers import FSMTConfig, FSMTForConditionalGeneration -from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES -from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE -from transformers.utils import WEIGHTS_NAME, logging - - -logging.set_verbosity_warning() - -json_indent = 2 - -# based on the results of a search on a range of `num_beams`, `length_penalty` and `early_stopping` -# values against wmt19 test data to obtain the best BLEU scores, we will use the following defaults: -# -# * `num_beams`: 5 (higher scores better, but requires more memory/is slower, can be adjusted by users) -# * `early_stopping`: `False` consistently scored better -# * `length_penalty` varied, so will assign the best one depending on the model -best_score_hparams = { - # fairseq: - "wmt19-ru-en": {"length_penalty": 1.1}, - "wmt19-en-ru": {"length_penalty": 1.15}, - "wmt19-en-de": {"length_penalty": 1.0}, - "wmt19-de-en": {"length_penalty": 1.1}, - # allenai: - "wmt16-en-de-dist-12-1": {"length_penalty": 0.6}, - "wmt16-en-de-dist-6-1": {"length_penalty": 0.6}, - "wmt16-en-de-12-1": {"length_penalty": 0.8}, - "wmt19-de-en-6-6-base": {"length_penalty": 0.6}, - "wmt19-de-en-6-6-big": {"length_penalty": 0.6}, -} - -# this remaps the different models to their organization names -org_names = {} -for m in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]: - org_names[m] = "facebook" -for m in [ - "wmt16-en-de-dist-12-1", - "wmt16-en-de-dist-6-1", - "wmt16-en-de-12-1", - "wmt19-de-en-6-6-base", - "wmt19-de-en-6-6-big", -]: - org_names[m] = "allenai" - - -def rewrite_dict_keys(d): - # (1) remove word breaking symbol, (2) add word ending symbol where the word is not broken up, - # e.g.: d = {'le@@': 5, 'tt@@': 6, 'er': 7} => {'le': 5, 'tt': 6, 'er': 7} - d2 = dict((re.sub(r"@@$", "", k), v) if k.endswith("@@") else (re.sub(r"$", "", k), v) for k, v in d.items()) - keep_keys = " ".split() - # restore the special tokens - for k in keep_keys: - del d2[f"{k}"] - d2[k] = d[k] # restore - return d2 - - -def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path): - # prep - assert os.path.exists(fsmt_checkpoint_path) - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - print(f"Writing results to {pytorch_dump_folder_path}") - - # handle various types of models - - checkpoint_file = basename(fsmt_checkpoint_path) - fsmt_folder_path = dirname(fsmt_checkpoint_path) - - cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel - models = cls.hub_models() - kwargs = {"bpe": "fastbpe", "tokenizer": "moses"} - data_name_or_path = "." - # note: since the model dump is old, fairseq has upgraded its model some - # time later, and it does a whole lot of rewrites and splits on the saved - # weights, therefore we can't use torch.load() directly on the model file. - # see: upgrade_state_dict(state_dict) in fairseq_model.py - print(f"using checkpoint {checkpoint_file}") - chkpt = hub_utils.from_pretrained( - fsmt_folder_path, checkpoint_file, data_name_or_path, archive_map=models, **kwargs - ) - - args = vars(chkpt["args"]["model"]) - - src_lang = args["source_lang"] - tgt_lang = args["target_lang"] - - data_root = dirname(pytorch_dump_folder_path) - model_dir = basename(pytorch_dump_folder_path) - - # dicts - src_dict_file = os.path.join(fsmt_folder_path, f"dict.{src_lang}.txt") - tgt_dict_file = os.path.join(fsmt_folder_path, f"dict.{tgt_lang}.txt") - - src_dict = Dictionary.load(src_dict_file) - src_vocab = rewrite_dict_keys(src_dict.indices) - src_vocab_size = len(src_vocab) - src_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-src.json") - print(f"Generating {src_vocab_file} of {src_vocab_size} of {src_lang} records") - with open(src_vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent)) - - # detect whether this is a do_lower_case situation, which can be derived by checking whether we - # have at least one uppercase letter in the source vocab - do_lower_case = True - for k in src_vocab.keys(): - if not k.islower(): - do_lower_case = False - break - - tgt_dict = Dictionary.load(tgt_dict_file) - tgt_vocab = rewrite_dict_keys(tgt_dict.indices) - tgt_vocab_size = len(tgt_vocab) - tgt_vocab_file = os.path.join(pytorch_dump_folder_path, "vocab-tgt.json") - print(f"Generating {tgt_vocab_file} of {tgt_vocab_size} of {tgt_lang} records") - with open(tgt_vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(tgt_vocab, ensure_ascii=False, indent=json_indent)) - - # merges_file (bpecodes) - merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"]) - for fn in ["bpecodes", "code"]: # older fairseq called the merges file "code" - fsmt_merges_file = os.path.join(fsmt_folder_path, fn) - if os.path.exists(fsmt_merges_file): - break - with open(fsmt_merges_file, encoding="utf-8") as fin: - merges = fin.read() - merges = re.sub(r" \d+$", "", merges, 0, re.M) # remove frequency number - print(f"Generating {merges_file}") - with open(merges_file, "w", encoding="utf-8") as fout: - fout.write(merges) - - # model config - fsmt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json") - - # validate bpe/tokenizer config, as currently it's hardcoded to moses+fastbpe - - # may have to modify the tokenizer if a different type is used by a future model - assert args["bpe"] == "fastbpe", f"need to extend tokenizer to support bpe={args['bpe']}" - assert args["tokenizer"] == "moses", f"need to extend tokenizer to support bpe={args['tokenizer']}" - - model_conf = { - "architectures": ["FSMTForConditionalGeneration"], - "model_type": "fsmt", - "activation_dropout": args["activation_dropout"], - "activation_function": "relu", - "attention_dropout": args["attention_dropout"], - "d_model": args["decoder_embed_dim"], - "dropout": args["dropout"], - "init_std": 0.02, - "max_position_embeddings": args["max_source_positions"], - "num_hidden_layers": args["encoder_layers"], - "src_vocab_size": src_vocab_size, - "tgt_vocab_size": tgt_vocab_size, - "langs": [src_lang, tgt_lang], - "encoder_attention_heads": args["encoder_attention_heads"], - "encoder_ffn_dim": args["encoder_ffn_embed_dim"], - "encoder_layerdrop": args["encoder_layerdrop"], - "encoder_layers": args["encoder_layers"], - "decoder_attention_heads": args["decoder_attention_heads"], - "decoder_ffn_dim": args["decoder_ffn_embed_dim"], - "decoder_layerdrop": args["decoder_layerdrop"], - "decoder_layers": args["decoder_layers"], - "bos_token_id": 0, - "pad_token_id": 1, - "eos_token_id": 2, - "is_encoder_decoder": True, - "scale_embedding": not args["no_scale_embedding"], - "tie_word_embeddings": args["share_all_embeddings"], - } - - # good hparam defaults to start with - model_conf["num_beams"] = 5 - model_conf["early_stopping"] = False - if model_dir in best_score_hparams and "length_penalty" in best_score_hparams[model_dir]: - model_conf["length_penalty"] = best_score_hparams[model_dir]["length_penalty"] - else: - model_conf["length_penalty"] = 1.0 - - print(f"Generating {fsmt_model_config_file}") - with open(fsmt_model_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent)) - - # tokenizer config - fsmt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE) - - tokenizer_conf = { - "langs": [src_lang, tgt_lang], - "model_max_length": 1024, - "do_lower_case": do_lower_case, - } - - print(f"Generating {fsmt_tokenizer_config_file}") - with open(fsmt_tokenizer_config_file, "w", encoding="utf-8") as f: - f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent)) - - # model - model = chkpt["models"][0] - model_state_dict = model.state_dict() - - # rename keys to start with 'model.' - model_state_dict = OrderedDict(("model." + k, v) for k, v in model_state_dict.items()) - - # remove unneeded keys - ignore_keys = [ - "model.model", - "model.encoder.version", - "model.decoder.version", - "model.encoder_embed_tokens.weight", - "model.decoder_embed_tokens.weight", - "model.encoder.embed_positions._float_tensor", - "model.decoder.embed_positions._float_tensor", - ] - for k in ignore_keys: - model_state_dict.pop(k, None) - - config = FSMTConfig.from_pretrained(pytorch_dump_folder_path) - model_new = FSMTForConditionalGeneration(config) - - # check that it loads ok - model_new.load_state_dict(model_state_dict, strict=False) - - # save - pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) - print(f"Generating {pytorch_weights_dump_path}") - torch.save(model_state_dict, pytorch_weights_dump_path) - - print("Conversion is done!") - print("\nLast step is to upload the files to s3") - print(f"cd {data_root}") - print(f"transformers upload {model_dir}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--fsmt_checkpoint_path", - default=None, - type=str, - required=True, - help=( - "Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts," - " bpecodes, etc." - ), - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_fsmt_checkpoint_to_pytorch(args.fsmt_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 4eab188f2a..0000000000 --- a/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,64 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Funnel checkpoint.""" - -import argparse - -import torch - -from transformers import FunnelBaseModel, FunnelConfig, FunnelModel, load_tf_weights_in_funnel -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, base_model): - # Initialise PyTorch model - config = FunnelConfig.from_json_file(config_file) - print(f"Building PyTorch model from configuration: {config}") - model = FunnelBaseModel(config) if base_model else FunnelModel(config) - - # Load weights from tf checkpoint - load_tf_weights_in_funnel(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the pre-trained model. \nThis specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--base_model", action="store_true", help="Whether you want just the base model (no decoder) or not." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch( - args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.base_model - ) diff --git a/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py b/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py deleted file mode 100644 index 29ef7859c9..0000000000 --- a/src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import sys -import warnings - -import flatdict -import torch - -from transformers import FuyuConfig, FuyuForCausalLM, LlamaTokenizer - - -try: - from transformers import LlamaTokenizerFast - - tokenizer_class = LlamaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - tokenizer_class = LlamaTokenizer - -""" -Sample usage: # TODO fix clone links from persimmon to fuyu -``` -git clone https://github.com/adept-ai-labs/adept-inference -wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_base_model_release.tar -wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar -python src/transformers/models/fuyu/convert_fuyu_weights_to_hf.py --input_dir /path/to/downloaded/fuyu/weights/ --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import FuyuForCausalLM, FuyuTokenizer - -model = FuyuForCausalLM.from_pretrained("/output/path") -tokenizer = FuyuTokenizer.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - - -KEYS_TO_MODIFY_MAPPING = { - "self_attention": "self_attn", - "language_model.encoder": "language_model.model", - "word_embeddings_for_head": "language_model.lm_head", - "language_model.embedding.word_embeddings": "language_model.model.embed_tokens", - "vit_encoder.linear_encoder": "vision_embed_tokens", -} - -KEYS_TO_REMOVE = { - "rotary_emb.inv_freq", - "image_patch_projection", - "image_patch_projection.weight", - "image_patch_projection.bias", -} - - -def rename_state_dict(state_dict): - model_state_dict = {} - for key, value in state_dict.items(): - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - # if KEYS_TO_REMOVE in key: - if key in KEYS_TO_REMOVE: - continue - model_state_dict[key] = value - return model_state_dict - - -def convert_fuyu_checkpoint(pytorch_dump_folder_path, ada_lib_path, pt_model_path, safe_serialization=False): - sys.path.insert(0, ada_lib_path) - model_state_dict_base = torch.load(pt_model_path, map_location="cpu", weights_only=True) - state_dict = flatdict.FlatDict(model_state_dict_base["model"], ".") - state_dict = rename_state_dict(state_dict) - - transformers_config = FuyuConfig() - model = FuyuForCausalLM(transformers_config).to(torch.bfloat16) - model.load_state_dict(state_dict) - model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization) - transformers_config.save_pretrained(pytorch_dump_folder_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - help="Location of Fuyu weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--pt_model_path", - help="Location of Fuyu `model_optim_rng.pt`", - ) - parser.add_argument( - "--output_dir", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--ada_lib_path", - help="Location of original source code from adept to deserialize .pt checkpoint", - ) - parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") - args = parser.parse_args() - spm_path = os.path.join(args.input_dir, "adept_vocab.model") - - convert_fuyu_checkpoint( - pytorch_dump_folder_path=args.output_dir, - pt_model_path=args.pt_model_path, - safe_serialization=args.safe_serialization, - ada_lib_path=args.ada_lib_path, - ) - tokenizer = tokenizer_class(spm_path, bos_token="|ENDOFTEXT|", eos_token="|ENDOFTEXT|") - tokenizer.save_pretrained(args.output_dir) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gemma/convert_gemma_weights_to_hf.py b/src/transformers/models/gemma/convert_gemma_weights_to_hf.py deleted file mode 100644 index fd275c157f..0000000000 --- a/src/transformers/models/gemma/convert_gemma_weights_to_hf.py +++ /dev/null @@ -1,206 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import warnings - -import torch -from accelerate import init_empty_weights - -from transformers import GemmaConfig, GemmaForCausalLM, GemmaTokenizer - - -try: - from transformers import GemmaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - GemmaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/gemma/convert_gemma_weights_to_hf.py \ - --input_dir /path/to/downloaded/gemma/weights --model_size 7B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import GemmaForCausalLM, GemmaTokenizerFast - -model = GemmaForCausalLM.from_pretrained("/output/path") -tokenizer = GemmaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - -gemma_2b_config = GemmaConfig( - num_hidden_layers=18, - num_attention_heads=8, - num_key_value_heads=1, - hidden_size=2048, - intermediate_size=16384, -) - -gemma_7b_config = GemmaConfig() - -CONFIG_MAPPING = {"2B": gemma_2b_config, "7B": gemma_7b_config} -LAYER_NAME_MAPPING = {"embedder.weight": "model.embed_tokens.weight"} - - -def write_model(save_path, input_base_path, config, safe_serialization=True, push_to_hub=False, dtype=torch.float32): - num_attn_heads = config.num_attention_heads - hidden_size = config.hidden_size - num_kv_heads = config.num_key_value_heads - head_dim = config.head_dim - - print(f"Fetching all parameters from the checkpoint at '{input_base_path}'") - model_state_dict = torch.load(input_base_path, map_location="cpu", weights_only=True)["model_state_dict"] - model_state_dict.pop("freqs_cis") - - state_dict = {} - for k, v in model_state_dict.items(): - if "qkv_proj" in k: - if num_kv_heads == 1: - v = v.reshape(num_attn_heads + num_kv_heads * 2, head_dim, hidden_size) - q_proj = v[:num_attn_heads, ...] - k_proj = v[num_attn_heads : num_attn_heads + num_kv_heads, ...].repeat(num_kv_heads, 1, 1) - v_proj = v[-num_kv_heads:, ...].repeat(num_kv_heads, 1, 1) - - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj[0].clone() - else: - q_proj, k_proj, v_proj = torch.split(v, v.shape[0] // 3, 0) - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj.clone() - - elif k == "embedder.weight": - state_dict[LAYER_NAME_MAPPING[k]] = v - state_dict["lm_head.weight"] = v - else: - state_dict[k] = v - - torch.set_default_dtype(dtype) - - print("Loading the checkpoint in a Gemma model.") - with init_empty_weights(): - model = GemmaForCausalLM(config) - model.load_state_dict(state_dict, assign=True, strict=False) - - model.config.torch_dtype = torch.float32 - del model.config._name_or_path - print("Saving in the Transformers format.") - - if push_to_hub: - print(f"pushing the model to {save_path}") - model.push_to_hub(save_path, safe_serialization=safe_serialization, private=True) - else: - model.save_pretrained(save_path, safe_serialization=safe_serialization) - - -def write_tokenizer(input_tokenizer_path, save_path, push_to_hub=False): - # Initialize the tokenizer based on the `spm` model - tokenizer_class = GemmaTokenizer if GemmaTokenizerFast is None else GemmaTokenizerFast - print(f"Saving a {tokenizer_class.__name__} to {save_path}.") - tokenizer = tokenizer_class(input_tokenizer_path) - if push_to_hub: - tokenizer.push_to_hub(save_path) - else: - tokenizer.save_pretrained(save_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_checkpoint", - help="Absolute path to the target Gemma weights.", - required=True, - ) - parser.add_argument( - "--tokenizer_checkpoint", - help="Location of Gemma tokenizer model", - ) - parser.add_argument( - "--model_size", - default="7B", - choices=["2B", "7B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Gemma2 official release. For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b", - ) - parser.add_argument( - "--output_dir", - default="google/gemma-7b", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--pickle_serialization", - help="Whether or not to save using `safetensors`.", - action="store_true", - default=False, - ) - parser.add_argument( - "--convert_tokenizer", - help="Whether or not to convert the tokenizer as well.", - action="store_true", - default=False, - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--dtype", - default="float32", - help="Target dtype of the converted model", - ) - args = parser.parse_args() - - if args.convert_tokenizer: - if args.tokenizer_checkpoint is None: - raise ValueError("Path to the tokenizer is required when passing --convert_tokenizer") - - spm_path = os.path.join(args.tokenizer_checkpoint) - write_tokenizer(spm_path, args.output_dir, args.push_to_hub) - - config = CONFIG_MAPPING[args.model_size] - dtype = getattr(torch, args.dtype) - write_model( - config=config, - input_base_path=args.input_checkpoint, - save_path=args.output_dir, - safe_serialization=not args.pickle_serialization, - push_to_hub=args.push_to_hub, - dtype=dtype, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py b/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py deleted file mode 100644 index c41f9a2fdb..0000000000 --- a/src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -import warnings - -import torch -from accelerate import init_empty_weights - -from transformers import Gemma2Config, Gemma2ForCausalLM, GemmaTokenizer - - -try: - from transformers import GemmaTokenizerFast -except ImportError as e: - warnings.warn(e) - warnings.warn( - "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" - ) - GemmaTokenizerFast = None - -""" -Sample usage: - -``` -python src/transformers/models/gemma2/convert_gemma2_weights_to_hf.py \ - --input_dir /path/to/downloaded/gemma/weights --model_size 9B --output_dir /output/path -``` - -Thereafter, models can be loaded via: - -```py -from transformers import Gemma2ForCausalLM, GemmaTokenizerFast - -model = Gemma2ForCausalLM.from_pretrained("/output/path") -tokenizer = GemmaTokenizerFast.from_pretrained("/output/path") -``` - -Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). -""" - -gemma_9b_config = Gemma2Config( - num_hidden_layers=42, - num_attention_heads=16, - num_key_value_heads=8, - hidden_size=3584, - intermediate_size=14336, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, - head_dim=256, - sliding_window=4096, - query_pre_attn_scalar=224, -) - -gemma_27b_config = Gemma2Config( - num_hidden_layers=46, - num_attention_heads=32, - num_key_value_heads=16, - hidden_size=4608, - intermediate_size=36864, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, - head_dim=128, - sliding_window=4096, - query_pre_attn_scalar=144, -) - -CONFIG_MAPPING = {"9B": gemma_9b_config, "27B": gemma_27b_config} -LAYER_NAME_MAPPING = {"embedder.weight": "model.embed_tokens.weight"} - - -def write_model(save_path, input_base_path, config, safe_serialization=True, push_to_hub=False, dtype=torch.float32): - num_attn_heads = config.num_attention_heads - hidden_size = config.hidden_size - num_kv_heads = config.num_key_value_heads - head_dim = config.head_dim - - print(f"Fetching all parameters from the checkpoint at '{input_base_path}'") - - if os.path.isdir(input_base_path): - print("Model seems sharded") - - model_state_dict = {} - files = [file for file in os.listdir(input_base_path) if file.endswith(".bin")] - - for file in files: - print(file) - loaded_state_dict = torch.load(os.path.join(input_base_path, file), map_location="cpu", weights_only=True) - model_state_dict.update(loaded_state_dict) - else: - print("Model does not seem to be sharded") - model_state_dict = torch.load(input_base_path, map_location="cpu", weights_only=True)["model_state_dict"] - model_state_dict.pop("freqs_cis") - - state_dict = {} - for k, v in model_state_dict.items(): - if "qkv_proj" in k: - if num_kv_heads == 1: - v = v.reshape(num_attn_heads + num_kv_heads * 2, head_dim, hidden_size) - q_proj = v[:num_attn_heads, ...] - k_proj = v[num_attn_heads : num_attn_heads + num_kv_heads, ...].repeat(num_kv_heads, 1, 1) - v_proj = v[-num_kv_heads:, ...].repeat(num_kv_heads, 1, 1) - - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj[0].clone() - else: - q_proj, k_proj, v_proj = torch.split( - v, [num_attn_heads * head_dim, num_kv_heads * head_dim, num_kv_heads * head_dim], 0 - ) - state_dict[k.replace("qkv_proj", "q_proj")] = q_proj.reshape( - num_attn_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "k_proj")] = k_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - state_dict[k.replace("qkv_proj", "v_proj")] = v_proj.reshape( - num_kv_heads * head_dim, hidden_size - ).clone() - - elif k == "embedder.weight": - state_dict[LAYER_NAME_MAPPING[k]] = v - state_dict["lm_head.weight"] = v - else: - state_dict[k] = v - - torch.set_default_dtype(dtype) - - print("Loading the checkpoint in a Gemma2 model.") - with init_empty_weights(): - model = Gemma2ForCausalLM(config) - model.load_state_dict(state_dict, assign=True, strict=False) - - model.config.torch_dtype = torch.float32 - del model.config._name_or_path - print("Saving in the Transformers format.") - - if push_to_hub: - print(f"pushing the model to {save_path}") - model.push_to_hub(save_path, safe_serialization=safe_serialization, private=True) - else: - model.save_pretrained(save_path, safe_serialization=safe_serialization) - - -def write_tokenizer(input_tokenizer_path, save_path, push_to_hub=False): - # Initialize the tokenizer based on the `spm` model - tokenizer_class = GemmaTokenizer if GemmaTokenizerFast is None else GemmaTokenizerFast - print(f"Saving a {tokenizer_class.__name__} to {save_path}.") - tokenizer = tokenizer_class(input_tokenizer_path) - if push_to_hub: - tokenizer.push_to_hub(save_path) - else: - tokenizer.save_pretrained(save_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_checkpoint", - help="Absolute path to the target Gemma2 weights.", - required=True, - ) - parser.add_argument( - "--tokenizer_checkpoint", - help="Location of Gemma2 tokenizer model", - ) - parser.add_argument( - "--model_size", - default="9B", - choices=["9B", "27B", "tokenizer_only"], - help="'f' models correspond to the finetuned versions, and are specific to the Gemma22 official release. For more details on Gemma2, checkout the original repo: https://huggingface.co/google/gemma-7b", - ) - parser.add_argument( - "--output_dir", - default="google/gemma-9b", - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--pickle_serialization", - help="Whether or not to save using `safetensors`.", - action="store_true", - default=False, - ) - parser.add_argument( - "--convert_tokenizer", - help="Whether or not to convert the tokenizer as well.", - action="store_true", - default=False, - ) - parser.add_argument( - "--push_to_hub", - help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.", - action="store_true", - default=False, - ) - parser.add_argument( - "--dtype", - default="float32", - help="Target dtype of the converted model", - ) - args = parser.parse_args() - - if args.convert_tokenizer: - if args.tokenizer_checkpoint is None: - raise ValueError("Path to the tokenizer is required when passing --convert_tokenizer") - - spm_path = os.path.join(args.tokenizer_checkpoint) - write_tokenizer(spm_path, args.output_dir, args.push_to_hub) - if not args.model_size == "tokenizer_only": - config = CONFIG_MAPPING[args.model_size] - dtype = getattr(torch, args.dtype) - write_model( - config=config, - input_base_path=args.input_checkpoint, - save_path=args.output_dir, - safe_serialization=not args.pickle_serialization, - push_to_hub=args.push_to_hub, - dtype=dtype, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gemma3/convert_gemma3_weights_orbax_to_hf.py b/src/transformers/models/gemma3/convert_gemma3_weights_orbax_to_hf.py deleted file mode 100644 index b9b6a66b76..0000000000 --- a/src/transformers/models/gemma3/convert_gemma3_weights_orbax_to_hf.py +++ /dev/null @@ -1,594 +0,0 @@ -# coding=utf-8 -# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved. -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Utility to convert Gemma models from Orbax to HF Transformers checkpoint. - -python -m transformers.models.gemma3.convert_gemma3_weights_orbax_to_hf \ - --variant='gemma3_4b' \ - --tokenizer_path="$HOME/gemma3/tokenizer/gemma3_cleaned_262144_v2.spiece.model" \ - --checkpoint_path="$HOME/gemma3/gemma3_4b_pt_orbax/" \ - --output_path="$HOME/gemma3/gemma3_4b_pt_safetensors/" -""" - -from collections.abc import Iterator, Sequence -from typing import Any - -import accelerate -import numpy as np -import torch -import tree -from absl import app, flags, logging -from orbax import checkpoint as obc - -from transformers import ( - Gemma3Config, - Gemma3ForCausalLM, - Gemma3ForConditionalGeneration, - Gemma3ImageProcessor, - Gemma3Processor, - Gemma3TextConfig, - GemmaTokenizerFast, - GenerationConfig, - SiglipVisionConfig, -) -from transformers.image_utils import PILImageResampling - - -# ==== Internal Constants and Classes ==== - - -_CHAT_TEMPLATE = """{{ bos_token }} -{%- if messages[0]['role'] == 'system' -%} - {%- if messages[0]['content'] is string -%} - {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%} - {%- else -%} - {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%} - {%- endif -%} - {%- set loop_messages = messages[1:] -%} -{%- else -%} - {%- set first_user_prefix = "" -%} - {%- set loop_messages = messages -%} -{%- endif -%} -{%- for message in loop_messages -%} - {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} - {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }} - {%- endif -%} - {%- if (message['role'] == 'assistant') -%} - {%- set role = "model" -%} - {%- else -%} - {%- set role = message['role'] -%} - {%- endif -%} - {{ '' + role + '\n' + (first_user_prefix if loop.first else "") }} - {%- if message['content'] is string -%} - {{ message['content'] | trim }} - {%- elif message['content'] is iterable -%} - {%- for item in message['content'] -%} - {%- if item['type'] == 'image' -%} - {{ '' }} - {%- elif item['type'] == 'text' -%} - {{ item['text'] | trim }} - {%- endif -%} - {%- endfor -%} - {%- else -%} - {{ raise_exception("Invalid content type") }} - {%- endif -%} - {{ '\n' }} -{%- endfor -%} -{%- if add_generation_prompt -%} - {{'model\n'}} -{%- endif -%} -""" - -_DTYPES = {"float32", "bfloat16", "float16"} - -_SIGLIP_BASE = "SigLiPFromPatches_0/siglip_encoder" -_SIGLIP_EMBEDDING = "SigLiPFromPatches_0/siglip_encoder/embedding" -_SIGLIP_TRANSFORMER_ENCODER_BLOCK = "SigLiPFromPatches_0/siglip_encoder/Transformer/encoderblock_" -_SIGLIP_TRANSFORMER_ENCODER_BLOCK_LEN = len(_SIGLIP_TRANSFORMER_ENCODER_BLOCK) -_SIGLIP_TRANSFORMER_ENCODER_NORM = "SigLiPFromPatches_0/siglip_encoder/Transformer/encoder_norm" - -_TRANSFORMER_DECODER_BLOCK = "transformer/layer_" -_TRANSFORMER_DECODER_BLOCK_LEN = len(_TRANSFORMER_DECODER_BLOCK) -_TRANSFORMER_EMBEDDER = "transformer/embedder" -_TRANSFORMER_FINAL_NORM = "transformer/final_norm" -_TRANSFORMER_POST_TRAINING_PREFIX = "rlx_networks/policy_network/" -_TRANSFORMER_POST_TRAINING_PREFIX_LEN = len(_TRANSFORMER_POST_TRAINING_PREFIX) - -_VISION_CONFIG = { - "hidden_size": 1152, - "intermediate_size": 4304, - "num_hidden_layers": 27, - "num_attention_heads": 16, - "num_channels": 3, - "image_size": 896, - "patch_size": 14, - "hidden_act": "gelu_pytorch_tanh", - "layer_norm_eps": 1e-6, - "attention_dropout": 0.0, - "vision_use_head": False, -} - -_VARIANT_GEMMA_3_1B = "gemma3_1b" -_VARIANT_GEMMA_3_4B = "gemma3_4b" -_VARIANT_GEMMA_3_12B = "gemma3_12b" -_VARIANT_GEMMA_3_27B = "gemma3_27b" -_VARIANTS = { - _VARIANT_GEMMA_3_1B: Gemma3Config( - text_config=Gemma3TextConfig( - vocab_size=262_144, - hidden_size=1152, - intermediate_size=6 * 1152, - num_attention_heads=4, - num_hidden_layers=26, - num_key_value_heads=1, - head_dim=256, - sliding_window=512, - rope_theta=1_000_000, # used for global RoPE only - rope_local_base_freq=10_000, - attn_logit_softcapping=None, - query_pre_attn_scalar=256, - max_position_embeddings=32_768, - ), - vision_config=None, - ), - _VARIANT_GEMMA_3_4B: Gemma3Config( - text_config=Gemma3TextConfig( - vocab_size=262_208, - hidden_size=2560, - intermediate_size=2560 * 8 // 2, - num_attention_heads=8, - head_dim=256, - num_hidden_layers=34, - num_key_value_heads=4, - sliding_window=1024, - rope_scaling={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only - rope_theta=1_000_000, - rope_local_base_freq=10_000, - attn_logit_softcapping=None, - query_pre_attn_scalar=256, - ), - vision_config=_VISION_CONFIG, - ), - _VARIANT_GEMMA_3_12B: Gemma3Config( - text_config=Gemma3TextConfig( - vocab_size=262_208, - hidden_size=30 * 128, - intermediate_size=30 * 128 * 8 // 2, - num_attention_heads=16, - head_dim=256, - num_hidden_layers=48, - num_key_value_heads=8, - sliding_window=1024, - rope_scaling={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only - rope_theta=1_000_000, - rope_local_base_freq=10_000, - attn_logit_softcapping=None, - query_pre_attn_scalar=256, - ), - vision_config=_VISION_CONFIG, - ), - _VARIANT_GEMMA_3_27B: Gemma3Config( - text_config=Gemma3TextConfig( - vocab_size=262_208, - hidden_size=42 * 128, - intermediate_size=42 * 128 * 8 // 2, - num_attention_heads=32, - num_hidden_layers=62, - num_key_value_heads=16, - head_dim=128, - sliding_window=1024, - rope_scaling={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only - rope_theta=1_000_000, - rope_local_base_freq=10_000, - attn_logit_softcapping=None, - query_pre_attn_scalar=(42 * 128 // 32), # 1 / sqrt(hidden_size // num_attention_heads) - ), - vision_config=_VISION_CONFIG, - ), -} - -# ==== Flags ==== - -_CHECKPOINT_PATH = flags.DEFINE_string( - name="checkpoint_path", - default=None, - help="Path to the Orbax checkpoint.", - required=True, -) - -_INCLUDE_CHAT_TEMPLATE = flags.DEFINE_bool( - name="include_chat_template", default=False, help="If true, will save the default chat template with the tokenizer" -) - -_OUTPUT_PATH = flags.DEFINE_string( - name="output_path", - default=None, - help="Path to store the HF checkpoint.", - required=True, -) - -_TRANSFORMER_DTYPE = flags.DEFINE_enum( - name="text_dtype", - default="bfloat16", - help="The floating point precision (aka dtype) of the model.", - enum_values=_DTYPES, -) - -_TOKENIZER_PATH = flags.DEFINE_string( - name="tokenizer_path", - default=None, - help="Path to the SentencePiece model file.", - required=True, -) - -_VARIANT = flags.DEFINE_enum( - name="variant", - default=_VARIANT_GEMMA_3_4B, - help="The model variant to convert.", - enum_values=set(_VARIANTS.keys()), -) - -_VERBOSE = flags.DEFINE_bool( - name="verbose", - default=False, - help="If true, log the path, shape, and dtype of every converted layer.", -) - -_VISION_DTYPE = flags.DEFINE_enum( - name="vision_dtype", - default="float32", - help="The floating point precision (aka dtype) of the model.", - enum_values=_DTYPES, -) - - -def convert_siglip_weight( - config: SiglipVisionConfig, - paths: Sequence[str], - weights: np.ndarray, -) -> tuple[str, np.ndarray]: - path, prop = paths - normalized_path: str = "" - updated_weights: np.ndarray = None - - if path == _SIGLIP_BASE: - normalized_path = "vision_tower.vision_model.embeddings.position_embedding.weight" - updated_weights = weights.reshape(-1, config.hidden_size) - elif path == _SIGLIP_EMBEDDING: - if prop == "kernel": - normalized_path = "vision_tower.vision_model.embeddings.patch_embedding.weight" - updated_weights = weights.transpose(3, 2, 0, 1) - elif prop == "bias": - normalized_path = "vision_tower.vision_model.embeddings.patch_embedding.bias" - updated_weights = weights - else: - raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `kernel`.") - elif path.startswith(_SIGLIP_TRANSFORMER_ENCODER_BLOCK): - encoder_block_path = path[_SIGLIP_TRANSFORMER_ENCODER_BLOCK_LEN:] - next_path_seperator_idx = encoder_block_path.find("/") - layer_idx = encoder_block_path[:next_path_seperator_idx] - encoder_block_path = encoder_block_path[next_path_seperator_idx:] - normalized_path = f"vision_tower.vision_model.encoder.layers.{layer_idx}" - - if encoder_block_path.startswith("/LayerNorm"): - normalized_path += ".layer_norm1" if path.endswith("_0") else ".layer_norm2" - - if prop == "scale": - normalized_path += ".weight" - updated_weights = weights.transpose() - elif prop == "bias": - normalized_path += ".bias" - updated_weights = weights - else: - raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `scale`.") - elif encoder_block_path.startswith("/MlpBlock_0"): - normalized_path += ".mlp.fc1" if "/Dense_0" in encoder_block_path else ".mlp.fc2" - - if prop == "kernel": - normalized_path += ".weight" - updated_weights = weights.transpose() - elif prop == "bias": - normalized_path += ".bias" - updated_weights = weights - else: - raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `kernel`.") - elif encoder_block_path.startswith("/MultiHeadDotProductAttention_0"): - if encoder_block_path.endswith("/key"): - normalized_path += ".self_attn.k_proj" - elif encoder_block_path.endswith("/out"): - normalized_path += ".self_attn.out_proj" - elif encoder_block_path.endswith("/query"): - normalized_path += ".self_attn.q_proj" - elif encoder_block_path.endswith("/value"): - normalized_path += ".self_attn.v_proj" - else: - raise ValueError(f"Unexpected path `{path}` in SigLIP Transformer MultiHeadDotProductAttention_0.") - - if prop == "bias": - normalized_path += ".bias" - updated_weights = weights.reshape(-1, config.hidden_size).reshape(-1) - elif prop == "kernel": - normalized_path += ".weight" - updated_weights = weights.reshape(-1, config.hidden_size).transpose() - else: - raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `kernel`.") - else: - raise ValueError(f"Unexpected path `{path}` in SigLIP Transformer Encoder Block.") - elif path == _SIGLIP_TRANSFORMER_ENCODER_NORM: - if prop == "scale": - normalized_path = "vision_tower.vision_model.post_layernorm.weight" - updated_weights = weights.transpose() - elif prop == "bias": - normalized_path = "vision_tower.vision_model.post_layernorm.bias" - updated_weights = weights - else: - raise ValueError(f"Unexpected member, `{prop}`, for path `{path}`. Should be `bias` or `scale`.") - else: - raise ValueError(f"Unexpected path `{path}`.") - - return normalized_path, updated_weights - - -def convert_transformer_weights( - config: Gemma3TextConfig, - paths: Sequence[str], - weights: np.ndarray, -) -> Iterator[tuple[str, np.ndarray]]: - path, prop = paths - - if path.startswith(_TRANSFORMER_POST_TRAINING_PREFIX): - path = path[_TRANSFORMER_POST_TRAINING_PREFIX_LEN:] - - converted_paths: list[str] = [] - converted_weights: list[Any] = [] - - attn_head_dim = config.num_attention_heads * config.head_dim - kv_head_dim = config.num_key_value_heads * config.head_dim - - if path == _TRANSFORMER_EMBEDDER: - if prop == "input_embedding": - # Tied to language_model.lm_head.weight, assigned at the end. - converted_paths = ["language_model.model.embed_tokens.weight"] - - if _VARIANT.value != _VARIANT_GEMMA_3_1B: - # Gemma3 model doesn't have image soft token in input and output embeddings, resize to avoid bugs we had with Mllama - pre_expansion_embeddings = weights - mu = np.mean(pre_expansion_embeddings, axis=0) - sigma = np.cov(pre_expansion_embeddings, rowvar=False, bias=True) - new_embeddings = np.random.multivariate_normal(mu, 1e-5 * sigma, size=64) - weights = np.vstack([pre_expansion_embeddings, new_embeddings]) - - converted_weights = [weights] - elif _VARIANT.value == _VARIANT_GEMMA_3_1B or prop in ("mm_output_embedding", "mm_input_embedding_extra"): - return zip([], []) - else: - raise ValueError(f"Unexpected member, {prop}, in Embedder.") - elif path.startswith(f"{_TRANSFORMER_EMBEDDER}/mm"): - if _VARIANT.value == _VARIANT_GEMMA_3_1B: - return zip([], []) - - if path.endswith("/mm_input_projection"): - converted_paths = ["multi_modal_projector.mm_input_projection_weight"] - converted_weights = [weights] - elif path.endswith("/mm_soft_embedding_norm"): - converted_paths = ["multi_modal_projector.mm_soft_emb_norm.weight"] - converted_weights = [weights] - else: - raise ValueError(f"Unexpected subpath, `{path}`, in Embedder.") - elif path == _TRANSFORMER_FINAL_NORM: - converted_paths = ["language_model.model.norm.weight"] - converted_weights = [weights] - elif path.startswith(_TRANSFORMER_DECODER_BLOCK): - decoder_block_path = path[_TRANSFORMER_DECODER_BLOCK_LEN:] - next_path_seperator_idx = decoder_block_path.find("/") - layer_idx = decoder_block_path[:next_path_seperator_idx] - decoder_block_path = decoder_block_path[next_path_seperator_idx:] - - base_path = f"language_model.model.layers.{layer_idx}" - - if path.endswith("attn/attn_vec_einsum"): - converted_paths = [f"{base_path}.self_attn.o_proj.weight"] - converted_weights = [weights.transpose(2, 0, 1).reshape(config.hidden_size, attn_head_dim)] - elif path.endswith("attn/_key_norm"): - converted_paths = [f"{base_path}.self_attn.k_norm.weight"] - converted_weights = [weights] - elif path.endswith("attn/kv_einsum"): - converted_paths = [ - f"{base_path}.self_attn.k_proj.weight", - f"{base_path}.self_attn.v_proj.weight", - ] - k_proj_weights, v_proj_weights = weights - converted_weights = [ - k_proj_weights.transpose(0, 2, 1).reshape(kv_head_dim, config.hidden_size), - v_proj_weights.transpose(0, 2, 1).reshape(kv_head_dim, config.hidden_size), - ] - elif path.endswith("attn/q_einsum"): - converted_paths = [f"{base_path}.self_attn.q_proj.weight"] - converted_weights = [weights.transpose(0, 2, 1).reshape(attn_head_dim, config.hidden_size)] - elif path.endswith("attn/_query_norm"): - converted_paths = [f"{base_path}.self_attn.q_norm.weight"] - converted_weights = [weights] - elif path.endswith("mlp/gating_einsum"): - converted_paths = [ - f"{base_path}.mlp.gate_proj.weight", - f"{base_path}.mlp.up_proj.weight", - ] - gate_proj_weight, up_proj_weight = weights - converted_weights = [gate_proj_weight, up_proj_weight] - elif path.endswith("mlp/linear"): - converted_paths = [f"{base_path}.mlp.down_proj.weight"] - converted_weights = [weights.transpose()] - elif path.endswith("post_attention_norm"): - converted_paths = [f"{base_path}.post_attention_layernorm.weight"] - converted_weights = [weights] - elif path.endswith("post_ffw_norm"): - converted_paths = [f"{base_path}.post_feedforward_layernorm.weight"] - converted_weights = [weights] - elif path.endswith("pre_attention_norm"): - converted_paths = [f"{base_path}.input_layernorm.weight"] - converted_weights = [weights] - elif path.endswith("pre_ffw_norm"): - converted_paths = [f"{base_path}.pre_feedforward_layernorm.weight"] - converted_weights = [weights] - else: - raise ValueError(f"Unexpected path `{path}` in Decoder Block.") - else: - raise ValueError(f"Unexpected path `{path}`.") - - if (cpl := len(converted_paths)) != (cwl := len(converted_weights)): - raise ValueError( - "The `converted_paths` and `converted_weights` should be the same " - f"length. Got {cpl} and {cwl}, respectively, for {path}." - ) - - return zip(converted_paths, converted_weights) - - -def convert(checkpoint_path: str, config: Gemma3Config) -> dict[str, torch.Tensor]: - """Loads Orbax checkpoint from `input_path` and converts it to HF tree.""" - checkpointer = obc.PyTreeCheckpointer() - ckpt = checkpointer.restore(checkpoint_path) - hf_tree: dict[str, torch.Tensor] = {} - - def update_tree(path: str, weights: np.ndarray, target_dtype: torch.dtype) -> None: - hf_tree[path] = torch.from_numpy(weights.astype("float32")).type(target_dtype) - if _VERBOSE.value: - logging.info( - "%s converted shape=%s with dtype=%s", - path, - weights.shape, - target_dtype, - ) - - for paths, value in tree.flatten_with_path(ckpt): - if paths[0].startswith("SigLiPFromPatches_"): - if config.vision_config is None: - continue - - path, weights = convert_siglip_weight(config=config.vision_config, paths=paths, weights=value) - update_tree(path, weights, config.vision_config.torch_dtype) - else: - for path, weights in convert_transformer_weights(config=config.text_config, paths=paths, weights=value): - if config.vision_config is None: - path = path[len("language_model.") :] - - update_tree(path, weights, config.text_config.torch_dtype) - - if config.vision_config is None: - hf_tree["lm_head.weight"] = hf_tree["model.embed_tokens.weight"] - else: - hf_tree["language_model.lm_head.weight"] = hf_tree["language_model.model.embed_tokens.weight"] - - return hf_tree - - -def main(*args): - del args - - output_path = _OUTPUT_PATH.value - variant = _VARIANT.value - - config = _VARIANTS[variant] - config.text_config.torch_dtype = getattr(torch, _TRANSFORMER_DTYPE.value) - - if variant == _VARIANT_GEMMA_3_1B: - config.vision_config = None - else: - config.vision_config.torch_dtype = getattr(torch, _VISION_DTYPE.value) - - if _INCLUDE_CHAT_TEMPLATE.value: - # Chat template is included for instruction tuned models, which treat - # both "" and "" as generation stoppers. - config.eos_token_id = [1, 106] - - logging.info( - "Converting Gemma 3 (%s) @ %s (language) and %s (vision)", - variant, - _TRANSFORMER_DTYPE.value, - _VISION_DTYPE.value, - ) - state_tree = convert(_CHECKPOINT_PATH.value, config) - logging.info("Converted Gemma 3 (%s) state tree from Orbax to Hugging Face.", variant) - - with accelerate.init_empty_weights(): - if variant == _VARIANT_GEMMA_3_1B: - model = Gemma3ForCausalLM(config=config.text_config) - else: - model = Gemma3ForConditionalGeneration(config) - - model.load_state_dict(state_tree, assign=True, strict=True) - logging.info( - "Loaded Gemma 3 (%s) in Hugging Face Transformers as a %s instance.", - variant, - type(model).__name__, - ) - model.save_pretrained(output_path, safe_serialization=True) - logging.info( - "Saved Gemma 3 (%s) to SafeTensors in %s using %s", - variant, - output_path, - type(model).__name__, - ) - del model - del state_tree - - tokenizer = GemmaTokenizerFast( - _TOKENIZER_PATH.value, - add_bos_token=True, - extra_special_tokens={ - "image_token": "", # Should be ID=262_144 - "boi_token": "", # Should be ID=255_999 - "eoi_token": "", # Should be ID=256_000 - }, - chat_template=_CHAT_TEMPLATE if _INCLUDE_CHAT_TEMPLATE.value else None, - ) - tokenizer.save_pretrained(output_path) - logging.info("Saved GemmaTokenizer for %s to %s", variant, output_path) - - if variant != _VARIANT_GEMMA_3_1B: - image_processor = Gemma3ImageProcessor( - image_seq_length=256, - image_mean=(0.5,) * 3, - image_std=(0.5,) * 3, - size={"height": 896, "width": 896}, - resample=PILImageResampling.BILINEAR, - ) - processor = Gemma3Processor( - image_processor=image_processor, - tokenizer=tokenizer, - chat_template=tokenizer.chat_template, - ) - processor.save_pretrained(output_path) - logging.info("Saved Gemma3Processor for %s to %s", variant, output_path) - del processor - - del tokenizer - - generation_config = GenerationConfig( - pad_token_id=config.pad_token_id, - bos_token_id=config.bos_token_id, - eos_token_id=config.eos_token_id, - cache_implementation="hybrid", - temperature=1.0, - do_sample=True, - top_k=64, - top_p=0.95, - ) - generation_config.save_pretrained(output_path) - - -if __name__ == "__main__": - app.run(main) diff --git a/src/transformers/models/git/convert_git_to_pytorch.py b/src/transformers/models/git/convert_git_to_pytorch.py deleted file mode 100644 index 4a9d8a0159..0000000000 --- a/src/transformers/models/git/convert_git_to_pytorch.py +++ /dev/null @@ -1,448 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GIT checkpoints from the original repository. - -URL: https://github.com/microsoft/GenerativeImage2Text/tree/main""" - -import argparse -from pathlib import Path - -import av -import numpy as np -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor - -from transformers import ( - AutoTokenizer, - CLIPImageProcessor, - GitConfig, - GitForCausalLM, - GitProcessor, - GitVisionConfig, - VideoMAEImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def get_git_config(model_name): - if "base" in model_name and "vqa" in model_name: - image_size = 480 - elif "large" in model_name and "vqa" in model_name: - image_size = 420 - else: - image_size = 224 - - vision_config = GitVisionConfig(image_size=image_size) - - if "large" in model_name: - vision_config.patch_size = 14 - vision_config.hidden_size = 1024 - vision_config.intermediate_size = 4096 - vision_config.num_hidden_layers = 24 - vision_config.num_attention_heads = 16 - - is_video = "vatex" in model_name or "msrvtt" in model_name - num_image_with_embedding = 6 if is_video else None - config = GitConfig(vision_config=vision_config.to_dict(), num_image_with_embedding=num_image_with_embedding) - - return config, image_size, is_video - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config, prefix=""): - rename_keys = [] - - # image encoder - # ftm: off - rename_keys.append( - (f"{prefix}image_encoder.class_embedding", "git.image_encoder.vision_model.embeddings.class_embedding") - ) - rename_keys.append( - ( - f"{prefix}image_encoder.positional_embedding", - "git.image_encoder.vision_model.embeddings.position_embedding.weight", - ) - ) - rename_keys.append( - (f"{prefix}image_encoder.conv1.weight", "git.image_encoder.vision_model.embeddings.patch_embedding.weight") - ) - rename_keys.append((f"{prefix}image_encoder.ln_pre.weight", "git.image_encoder.vision_model.pre_layrnorm.weight")) - rename_keys.append((f"{prefix}image_encoder.ln_pre.bias", "git.image_encoder.vision_model.pre_layrnorm.bias")) - rename_keys.append( - (f"{prefix}image_encoder.ln_post.weight", "git.image_encoder.vision_model.post_layernorm.weight") - ) - rename_keys.append((f"{prefix}image_encoder.ln_post.bias", "git.image_encoder.vision_model.post_layernorm.bias")) - # fmt: on - rename_keys.append((f"{prefix}image_encoder.proj", "git.image_encoder.visual_projection.weight")) - - # fmt: off - for i in range(config.vision_config.num_hidden_layers): - # image encoder layers: output projection, 2 feedforward neural networks and 2 layernorms - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.attn.out_proj.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.out_proj.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.attn.out_proj.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.out_proj.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_1.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_1.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_fc.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_fc.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_proj.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.mlp.c_proj.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.mlp.fc2.bias")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_2.weight", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"{prefix}image_encoder.transformer.resblocks.{i}.ln_2.bias", f"git.image_encoder.vision_model.encoder.layers.{i}.layer_norm2.bias")) - # fmt: on - - # text decoder - # fmt: off - rename_keys.append((f"{prefix}textual.embedding.words.weight", "git.embeddings.word_embeddings.weight")) - rename_keys.append((f"{prefix}textual.embedding.positions.weight", "git.embeddings.position_embeddings.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.0.weight", "git.visual_projection.visual_projection.0.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.0.bias", "git.visual_projection.visual_projection.0.bias")) - rename_keys.append((f"{prefix}textual.visual_projection.1.weight", "git.visual_projection.visual_projection.1.weight")) - rename_keys.append((f"{prefix}textual.visual_projection.1.bias", "git.visual_projection.visual_projection.1.bias")) - - rename_keys.append((f"{prefix}textual.embedding.layer_norm.weight", "git.embeddings.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.embedding.layer_norm.bias", "git.embeddings.LayerNorm.bias")) - rename_keys.append((f"{prefix}textual.output.weight", "output.weight")) - rename_keys.append((f"{prefix}textual.output.bias", "output.bias")) - for i in range(config.num_hidden_layers): - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.query.weight", f"git.encoder.layer.{i}.attention.self.query.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.query.bias", f"git.encoder.layer.{i}.attention.self.query.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.key.weight", f"git.encoder.layer.{i}.attention.self.key.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.key.bias", f"git.encoder.layer.{i}.attention.self.key.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.value.weight", f"git.encoder.layer.{i}.attention.self.value.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.self.value.bias", f"git.encoder.layer.{i}.attention.self.value.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.dense.weight", f"git.encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.dense.bias", f"git.encoder.layer.{i}.attention.output.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.LayerNorm.weight", f"git.encoder.layer.{i}.attention.output.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.attention.output.LayerNorm.bias", f"git.encoder.layer.{i}.attention.output.LayerNorm.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.intermediate.dense.weight", f"git.encoder.layer.{i}.intermediate.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.intermediate.dense.bias", f"git.encoder.layer.{i}.intermediate.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.dense.weight", f"git.encoder.layer.{i}.output.dense.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.dense.bias", f"git.encoder.layer.{i}.output.dense.bias")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.LayerNorm.weight", f"git.encoder.layer.{i}.output.LayerNorm.weight")) - rename_keys.append((f"{prefix}textual.transformer.encoder.layer.{i}.output.LayerNorm.bias", f"git.encoder.layer.{i}.output.LayerNorm.bias")) - # fmt: on - - if config.num_image_with_embedding is not None: - rename_keys.append(("img_temperal_embedding.0", "git.img_temperal_embedding.0")) - rename_keys.append(("img_temperal_embedding.1", "git.img_temperal_embedding.1")) - rename_keys.append(("img_temperal_embedding.2", "git.img_temperal_embedding.2")) - rename_keys.append(("img_temperal_embedding.3", "git.img_temperal_embedding.3")) - rename_keys.append(("img_temperal_embedding.4", "git.img_temperal_embedding.4")) - rename_keys.append(("img_temperal_embedding.5", "git.img_temperal_embedding.5")) - - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val.T if "image_encoder.visual_projection" in new else val - - -# we split up the matrix of each CLIP encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config, prefix=""): - dim = config.vision_config.hidden_size - for i in range(config.vision_config.num_hidden_layers): - # read in weights + bias of input projection layer (in the original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"{prefix}image_encoder.transformer.resblocks.{i}.attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"{prefix}image_encoder.transformer.resblocks.{i}.attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[ - :dim, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:dim] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[ - dim : dim * 2, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[ - dim : dim * 2 - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[ - -dim:, : - ] - state_dict[f"git.image_encoder.vision_model.encoder.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-dim:] - - -# We will verify our results on an image -def prepare_img(model_name): - if "textvqa" in model_name: - filepath = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset") - image = Image.open(filepath).convert("RGB") - else: - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -def prepare_video(): - def read_video_pyav(container, indices): - """ - Decode the video with PyAV decoder. - - Args: - container (`av.container.input.InputContainer`): PyAV container. - indices (`List[int]`): List of frame indices to decode. - - Returns: - result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). - """ - frames = [] - container.seek(0) - start_index = indices[0] - end_index = indices[-1] - for i, frame in enumerate(container.decode(video=0)): - if i > end_index: - break - if i >= start_index and i in indices: - frames.append(frame) - return np.stack([x.to_ndarray(format="rgb24") for x in frames]) - - def sample_frame_indices(clip_len, frame_sample_rate, seg_len): - """ - Sample a given number of frame indices from the video. - - Args: - clip_len (`int`): Total number of frames to sample. - frame_sample_rate (`int`): Sample every n-th frame. - seg_len (`int`): Maximum allowed index of sample's last frame. - - Returns: - indices (`List[int]`): List of sampled frame indices - """ - converted_len = int(clip_len * frame_sample_rate) - end_idx = np.random.randint(converted_len, seg_len) - start_idx = end_idx - converted_len - indices = np.linspace(start_idx, end_idx, num=clip_len) - indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) - return indices - - # set seed for reproducibility - np.random.seed(0) - - file_path = hf_hub_download(repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset") - with av.open(file_path) as container: - # sample 6 frames - num_frames = 6 - indices = sample_frame_indices( - clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames - ) - frames = read_video_pyav(container, indices) - - return frames - - -@torch.no_grad() -def convert_git_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our GIT structure. - """ - - model_name_to_url = { - "git-base": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE/snapshot/model.pt", - "git-base-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_COCO/snapshot/model.pt", - "git-base-textcaps": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_TEXTCAPS/snapshot/model.pt", - "git-base-vqav2": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_VQAv2/snapshot/model.pt", - "git-base-textvqa": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_TEXTVQA/snapshot/model.pt", # todo - "git-base-vatex": "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_VATEX/snapshot/model.pt", - "git-base-msrvtt-qa": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_BASE_MSRVTT_QA/snapshot/model.pt" - ), - "git-large": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE/snapshot/model.pt", - "git-large-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_COCO/snapshot/model.pt", - "git-large-textcaps": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_TEXTCAPS/snapshot/model.pt" - ), - "git-large-vqav2": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_VQAv2/snapshot/model.pt", - "git-large-textvqa": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_TEXTVQA/snapshot/model.pt", - "git-large-vatex": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_VATEX/snapshot/model.pt", - "git-large-msrvtt-qa": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_MSRVTT_QA/snapshot/model.pt" - ), - "git-large-r": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R/snapshot/model.pt", - "git-large-r-coco": "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R_COCO/snapshot/model.pt", - "git-large-r-textcaps": ( - "https://publicgit.blob.core.windows.net/data/output/GIT_LARGE_R_TEXTCAPS/snapshot/model.pt" - ), - } - - model_name_to_path = { - "git-large": "/Users/nielsrogge/Documents/GIT/git_large_model.pt", - "git-large-coco": "/Users/nielsrogge/Documents/GIT/git_large_coco_model.pt", - "git-large-textcaps": "/Users/nielsrogge/Documents/GIT/git_large_textcaps_model.pt", - "git-large-vqav2": "/Users/nielsrogge/Documents/GIT/git_large_vqav2_model.pt", - "git-large-textvqa": "/Users/nielsrogge/Documents/GIT/git_large_textvqa_model.pt", - } - - # define GIT configuration based on model name - config, image_size, is_video = get_git_config(model_name) - if "large" in model_name and not is_video and "large-r" not in model_name: - # large checkpoints take way too long to download - checkpoint_path = model_name_to_path[model_name] - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - else: - checkpoint_url = model_name_to_url[model_name] - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", file_name=model_name)[ - "model" - ] - # rename keys - prefix = "module." if model_name == "git-base" else "" - rename_keys = create_rename_keys(config, prefix=prefix) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - read_in_q_k_v(state_dict, config, prefix=prefix) - - # load HuggingFace model - model = GitForCausalLM(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - model.eval() - - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - assert missing_keys == ["git.embeddings.position_ids", "git.image_encoder.vision_model.embeddings.position_ids"] - assert unexpected_keys == ["git.image_encoder.visual_projection.weight"] - - # verify results - image_processor = ( - VideoMAEImageProcessor( - size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size} - ) - if is_video - else CLIPImageProcessor( - size={"shortest_edge": image_size}, crop_size={"height": image_size, "width": image_size} - ) - ) - tokenizer = AutoTokenizer.from_pretrained( - "google-bert/bert-base-uncased", model_input_names=["input_ids", "attention_mask"] - ) - processor = GitProcessor(tokenizer=tokenizer, image_processor=image_processor) - - if is_video: - video = prepare_video() - pixel_values = processor(images=list(video), return_tensors="pt").pixel_values - else: - image = prepare_img(model_name) - image_transforms = Compose( - [ - Resize(image_size, interpolation=Image.BICUBIC), - CenterCrop(image_size), - ToTensor(), - Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), - ] - ) - original_pixel_values = image_transforms(image).unsqueeze(0) - pixel_values = processor(images=image, return_tensors="pt").pixel_values - - assert torch.allclose(pixel_values, original_pixel_values) - - input_ids = torch.tensor([[101]]) - outputs = model(input_ids, pixel_values=pixel_values) - logits = outputs.logits - print("Logits:", logits[0, -1, :3]) - - if model_name == "git-base": - expected_slice_logits = torch.tensor([-1.2832, -1.2835, -1.2840]) - elif model_name == "git-base-coco": - expected_slice_logits = torch.tensor([-0.9925, -0.9930, -0.9935]) - elif model_name == "git-base-textcaps": - expected_slice_logits = torch.tensor([-1.2980, -1.2983, -1.2985]) - elif model_name == "git-base-vqav2": - expected_slice_logits = torch.tensor([-0.8570, -0.8568, -0.8561]) - elif model_name == "git-base-textvqa": - expected_slice_logits = torch.tensor([-1.4085, -1.4083, -1.4082]) - elif model_name == "git-base-vatex": - expected_slice_logits = torch.tensor([-1.3451, -1.3447, -1.3447]) - elif model_name == "git-base-msrvtt-qa": - expected_slice_logits = torch.tensor([-0.8554, -0.8550, -0.8540]) - elif model_name == "git-large": - expected_slice_logits = torch.tensor([-1.1708, -1.1707, -1.1705]) - elif model_name == "git-large-coco": - expected_slice_logits = torch.tensor([-1.0425, -1.0423, -1.0422]) - elif model_name == "git-large-textcaps": - expected_slice_logits = torch.tensor([-1.2705, -1.2708, -1.2706]) - elif model_name == "git-large-vqav2": - expected_slice_logits = torch.tensor([-0.7042, -0.7043, -0.7043]) - elif model_name == "git-large-textvqa": - expected_slice_logits = torch.tensor([-0.8590, -0.8592, -0.8590]) - elif model_name == "git-large-vatex": - expected_slice_logits = torch.tensor([-1.0113, -1.0114, -1.0113]) - elif model_name == "git-large-msrvtt-qa": - expected_slice_logits = torch.tensor([0.0130, 0.0134, 0.0131]) - elif model_name == "git-large-r": - expected_slice_logits = torch.tensor([-1.1283, -1.1285, -1.1286]) - elif model_name == "git-large-r-coco": - expected_slice_logits = torch.tensor([-0.9641, -0.9641, -0.9641]) - elif model_name == "git-large-r-textcaps": - expected_slice_logits = torch.tensor([-1.1121, -1.1120, -1.1124]) - - assert torch.allclose(logits[0, -1, :3], expected_slice_logits, atol=1e-4) - print("Looks ok!") - - prompt = "" - if "textvqa" in model_name: - prompt = "what does the front of the bus say at the top?" - elif "msrvtt-qa" in model_name: - prompt = "what does the woman eat?" - elif "vqa" in model_name: - prompt = "what are the cats doing?" - input_ids = tokenizer(prompt, add_special_tokens=False).input_ids - input_ids = [processor.tokenizer.cls_token_id] + input_ids - input_ids = torch.tensor(input_ids).unsqueeze(0) - print("Generating caption...") - generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) - print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True)) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - print(f"Pushing model and processor of {model_name} to the hub...") - model.push_to_hub(f"microsoft/{model_name}") - processor.push_to_hub(f"microsoft/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="git-base", - type=str, - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model to the hub.", - ) - - args = parser.parse_args() - convert_git_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py deleted file mode 100644 index df1fd7537f..0000000000 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ /dev/null @@ -1,195 +0,0 @@ -import argparse -import json -import os -import re - -import torch -from safetensors.torch import load_file -from tokenizers import processors - -from transformers import GlmConfig, GlmForCausalLM, PreTrainedTokenizerFast - - -# fmt: off -# `None` means we drop the key -STATE_DICT_MAPPING = { - # CausalLM keys - r"transformer.output_layer.weight": r"lm_head.weight", - - # Model keys - r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight", - r"transformer.rotary_pos_emb.inv_freq": None, - r"transformer.encoder.final_layernorm.weight": r"model.norm.weight", - - # Layers keys - r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight", - r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight", - - # Attention keys - r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight", - # qkv_proj will later be split in q|k|v|_proj - r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2", - - # MLP keys - r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight", - r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight", -} -# fmt: on - - -def load_weights(input_dir: str): - safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] - bin_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".bin")] - - all_weights = {} - - if safetensor_files: - safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in safetensor_files: - tensors = load_file(file) - all_weights.update(tensors) - return all_weights - - elif bin_files: - bin_files = sorted(bin_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in bin_files: - tensors = torch.load(file, map_location="cpu", weights_only=True) - all_weights.update(tensors) - return all_weights - - else: - raise ValueError("No .safetensors or .bin files found in the specified directory.") - - -def map_old_key_to_new(old_key): - for pattern, replacement in STATE_DICT_MAPPING.items(): - if replacement is None: - if re.fullmatch(pattern, old_key): - return None - else: - new_key, n_replace = re.subn(pattern, replacement, old_key) - # Early exit of the loop - if n_replace > 0: - return new_key - - raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).") - - -def convert_state_dict(original_state_dict: dict, config: GlmConfig): - new_dict = {} - - head_dim = config.hidden_size // config.num_attention_heads - query_size = config.num_attention_heads * head_dim - kv_size = config.num_key_value_heads * head_dim - - for old_key, value in original_state_dict.items(): - new_key = map_old_key_to_new(old_key) - if new_key is None: - continue - - if "qkv_proj." in new_key: - q_proj, k_proj, v_proj = ( - value[:query_size, ...], - value[query_size : query_size + kv_size, ...], - value[query_size + kv_size :, ...], - ) - new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj - new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj - new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj - else: - new_dict[new_key] = value - return new_dict - - -def convert_config(original_config: dict): - key_mapping = { - "vocab_size": "padded_vocab_size", - "intermediate_size": "ffn_hidden_size", - "num_hidden_layers": "num_layers", - "max_position_embeddings": "seq_length", - "rms_norm_eps": "layernorm_epsilon", - "head_dim": "kv_channels", - "attention_bias": "add_qkv_bias", - } - similar_keys_to_keep = [ - "num_attention_heads", - "hidden_size", - "attention_dropout", - "use_cache", - "eos_token_id", - "pad_token_id", - "tie_word_embeddings", - ] - new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()} - new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep}) - new_config_kwargs["num_key_value_heads"] = ( - new_config_kwargs["num_attention_heads"] - if not original_config["multi_query_attention"] - else original_config["multi_query_group_num"] - ) - new_config_kwargs["rope_theta"] = 10000.0 * getattr(original_config, "rope_ratio", 1) - - new_config = GlmConfig(**new_config_kwargs) - return new_config - - -def convert_glm_tokenizer(input_dir, use_post_processor=False): - fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"]) - if use_post_processor: - fast_tok._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="[gMASK]:0 :0 $A:0", - pair="[gMASK]:0 :0 $A:0 $B:1", - special_tokens=[("[gMASK]", 151331), ("", 151333)], - ), - ], - ) - else: - fast_tok._tokenizer.post_processor = processors.Sequence( - [processors.ByteLevel(trim_offsets=False)], - ) - return fast_tok - - -def convert_glm_model(input_dir, output_dir, use_post_processor=False): - # Load and convert config - with open(os.path.join(input_dir, "config.json")) as f: - original_config = json.load(f) - config = convert_config(original_config) - config.save_pretrained(output_dir) - - # Load and convert weights - original_state_dict = load_weights(input_dir) - new_dict = convert_state_dict(original_state_dict, config) - with torch.device("meta"): - model = GlmForCausalLM(config) - model.load_state_dict(new_dict, strict=True, assign=True) - model.save_pretrained(output_dir) - - # Load and convert tokenizer - tokenizer = convert_glm_tokenizer(input_dir, use_post_processor) - tokenizer.save_pretrained(output_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "input_dir", - type=str, - help="Location of the local folder copied from the Hub.", - ) - parser.add_argument( - "output_dir", - type=str, - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--use_post_processor", - action="store_true", - help="Whether to apply post processor with special tokens", - ) - - args = parser.parse_args() - convert_glm_model(args.input_dir, args.output_dir, args.use_post_processor) diff --git a/src/transformers/models/glm4/convert_glm4_weights_to_hf.py b/src/transformers/models/glm4/convert_glm4_weights_to_hf.py deleted file mode 100644 index 01ad00f517..0000000000 --- a/src/transformers/models/glm4/convert_glm4_weights_to_hf.py +++ /dev/null @@ -1,199 +0,0 @@ -import argparse -import json -import os -import re - -import torch -from safetensors.torch import load_file -from tokenizers import processors - -from transformers import Glm4Config, Glm4ForCausalLM, PreTrainedTokenizerFast - - -# fmt: off -# `None` means we drop the key -STATE_DICT_MAPPING = { - # CausalLM keys - r"transformer.output_layer.weight": r"lm_head.weight", - - # Model keys - r"transformer.embedding.word_embeddings.weight": r"model.embed_tokens.weight", - r"transformer.rotary_pos_emb.inv_freq": None, - r"transformer.encoder.final_layernorm.weight": r"model.norm.weight", - - # Layers keys - r"transformer.encoder.layers.(\d+).input_layernorm.weight": r"model.layers.\1.input_layernorm.weight", - - # Sandwich keys - r"transformer.encoder.layers.(\d+).post_mlp_layernorm.weight": r"model.layers.\1.post_mlp_layernorm.weight", - r"transformer.encoder.layers.(\d+).post_self_attn_layernorm.weight": r"model.layers.\1.post_self_attn_layernorm.weight", - - r"transformer.encoder.layers.(\d+).post_attention_layernorm.weight": r"model.layers.\1.post_attention_layernorm.weight", - - # Attention keys - r"transformer.encoder.layers.(\d+).self_attention.dense.weight": r"model.layers.\1.self_attn.o_proj.weight", - # qkv_proj will later be split in q|k|v|_proj - r"transformer.encoder.layers.(\d+).self_attention.query_key_value.(weight|bias)": r"model.layers.\1.self_attn.qkv_proj.\2", - - # MLP keys - r"transformer.encoder.layers.(\d+).mlp.dense_h_to_4h.weight": r"model.layers.\1.mlp.gate_up_proj.weight", - r"transformer.encoder.layers.(\d+).mlp.dense_4h_to_h.weight": r"model.layers.\1.mlp.down_proj.weight", -} -# fmt: on - - -def load_weights(input_dir: str): - safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] - bin_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".bin")] - - all_weights = {} - - if safetensor_files: - safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in safetensor_files: - tensors = load_file(file) - all_weights.update(tensors) - return all_weights - - elif bin_files: - bin_files = sorted(bin_files, key=lambda x: int(x.rsplit("-", 3)[1])) - for file in bin_files: - tensors = torch.load(file, map_location="cpu") - all_weights.update(tensors) - return all_weights - - else: - raise ValueError("No .safetensors or .bin files found in the specified directory.") - - -def map_old_key_to_new(old_key): - for pattern, replacement in STATE_DICT_MAPPING.items(): - if replacement is None: - if re.fullmatch(pattern, old_key): - return None - else: - new_key, n_replace = re.subn(pattern, replacement, old_key) - # Early exit of the loop - if n_replace > 0: - return new_key - - raise ValueError(f"Key: {old_key} could not be mapped (check the mapping).") - - -def convert_state_dict(original_state_dict: dict, config: Glm4Config): - new_dict = {} - - head_dim = config.hidden_size // config.num_attention_heads - query_size = config.num_attention_heads * head_dim - kv_size = config.num_key_value_heads * head_dim - - for old_key, value in original_state_dict.items(): - new_key = map_old_key_to_new(old_key) - if new_key is None: - continue - - if "qkv_proj." in new_key: - q_proj, k_proj, v_proj = ( - value[:query_size, ...], - value[query_size : query_size + kv_size, ...], - value[query_size + kv_size :, ...], - ) - new_dict[new_key.replace("qkv_proj.", "q_proj.")] = q_proj - new_dict[new_key.replace("qkv_proj.", "k_proj.")] = k_proj - new_dict[new_key.replace("qkv_proj.", "v_proj.")] = v_proj - else: - new_dict[new_key] = value - return new_dict - - -def convert_config(original_config: dict): - key_mapping = { - "vocab_size": "padded_vocab_size", - "intermediate_size": "ffn_hidden_size", - "num_hidden_layers": "num_layers", - "max_position_embeddings": "seq_length", - "rms_norm_eps": "layernorm_epsilon", - "head_dim": "kv_channels", - "attention_bias": "add_qkv_bias", - } - similar_keys_to_keep = [ - "num_attention_heads", - "hidden_size", - "attention_dropout", - "use_cache", - "eos_token_id", - "pad_token_id", - "tie_word_embeddings", - ] - new_config_kwargs = {k: original_config[v] for k, v in key_mapping.items()} - new_config_kwargs.update({k: v for k, v in original_config.items() if k in similar_keys_to_keep}) - new_config_kwargs["num_key_value_heads"] = ( - new_config_kwargs["num_attention_heads"] - if not original_config["multi_query_attention"] - else original_config["multi_query_group_num"] - ) - new_config_kwargs["rope_theta"] = 10000.0 * getattr(original_config, "rope_ratio", 1) - - new_config = Glm4Config(**new_config_kwargs) - return new_config - - -def convert_glm4_tokenizer(input_dir, use_post_processor=False): - fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"]) - if use_post_processor: - fast_tok._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="[gMASK]:0 :0 $A:0", - pair="[gMASK]:0 :0 $A:0 $B:1", - special_tokens=[("[gMASK]", 151331), ("", 151333)], - ), - ], - ) - else: - fast_tok._tokenizer.post_processor = processors.Sequence( - [processors.ByteLevel(trim_offsets=False)], - ) - return fast_tok - - -def convert_glm4_model(input_dir, output_dir, use_post_processor=False): - # Load and convert config - with open(os.path.join(input_dir, "config.json")) as f: - original_config = json.load(f) - config = convert_config(original_config) - config.save_pretrained(output_dir) - - # Load and convert weights - original_state_dict = load_weights(input_dir) - new_dict = convert_state_dict(original_state_dict, config) - with torch.device("meta"): - model = Glm4ForCausalLM(config) - model.load_state_dict(new_dict, strict=True, assign=True) - model.save_pretrained(output_dir) - - # Load and convert tokenizer - tokenizer = convert_glm4_tokenizer(input_dir, use_post_processor) - tokenizer.save_pretrained(output_dir) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "input_dir", - type=str, - help="Location of the local folder copied from the Hub.", - ) - parser.add_argument( - "output_dir", - type=str, - help="Location to write HF model and tokenizer", - ) - parser.add_argument( - "--use_post_processor", - action="store_true", - help="Whether to apply post processor with special tokens", - ) - args = parser.parse_args() - convert_glm4_model(args.input_dir, args.output_dir, args.use_post_processor) diff --git a/src/transformers/models/glpn/convert_glpn_to_pytorch.py b/src/transformers/models/glpn/convert_glpn_to_pytorch.py deleted file mode 100644 index 51088fb724..0000000000 --- a/src/transformers/models/glpn/convert_glpn_to_pytorch.py +++ /dev/null @@ -1,218 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GLPN checkpoints.""" - -import argparse -from collections import OrderedDict -from pathlib import Path - -import requests -import torch -from PIL import Image - -from transformers import GLPNConfig, GLPNForDepthEstimation, GLPNImageProcessor -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def rename_keys(state_dict): - new_state_dict = OrderedDict() - for key, value in state_dict.items(): - if key.startswith("module.encoder"): - key = key.replace("module.encoder", "glpn.encoder") - if key.startswith("module.decoder"): - key = key.replace("module.decoder", "decoder.stages") - if "patch_embed" in key: - # replace for example patch_embed1 by patch_embeddings.0 - idx = key[key.find("patch_embed") + len("patch_embed")] - key = key.replace(f"patch_embed{idx}", f"patch_embeddings.{int(idx) - 1}") - if "norm" in key: - key = key.replace("norm", "layer_norm") - if "glpn.encoder.layer_norm" in key: - # replace for example layer_norm1 by layer_norm.0 - idx = key[key.find("glpn.encoder.layer_norm") + len("glpn.encoder.layer_norm")] - key = key.replace(f"layer_norm{idx}", f"layer_norm.{int(idx) - 1}") - if "layer_norm1" in key: - key = key.replace("layer_norm1", "layer_norm_1") - if "layer_norm2" in key: - key = key.replace("layer_norm2", "layer_norm_2") - if "block" in key: - # replace for example block1 by block.0 - idx = key[key.find("block") + len("block")] - key = key.replace(f"block{idx}", f"block.{int(idx) - 1}") - if "attn.q" in key: - key = key.replace("attn.q", "attention.self.query") - if "attn.proj" in key: - key = key.replace("attn.proj", "attention.output.dense") - if "attn" in key: - key = key.replace("attn", "attention.self") - if "fc1" in key: - key = key.replace("fc1", "dense1") - if "fc2" in key: - key = key.replace("fc2", "dense2") - if "linear_pred" in key: - key = key.replace("linear_pred", "classifier") - if "linear_fuse" in key: - key = key.replace("linear_fuse.conv", "linear_fuse") - key = key.replace("linear_fuse.bn", "batch_norm") - if "linear_c" in key: - # replace for example linear_c4 by linear_c.3 - idx = key[key.find("linear_c") + len("linear_c")] - key = key.replace(f"linear_c{idx}", f"linear_c.{int(idx) - 1}") - if "bot_conv" in key: - key = key.replace("bot_conv", "0.convolution") - if "skip_conv1" in key: - key = key.replace("skip_conv1", "1.convolution") - if "skip_conv2" in key: - key = key.replace("skip_conv2", "2.convolution") - if "fusion1" in key: - key = key.replace("fusion1", "1.fusion") - if "fusion2" in key: - key = key.replace("fusion2", "2.fusion") - if "fusion3" in key: - key = key.replace("fusion3", "3.fusion") - if "fusion" in key and "conv" in key: - key = key.replace("conv", "convolutional_layer") - if key.startswith("module.last_layer_depth"): - key = key.replace("module.last_layer_depth", "head.head") - new_state_dict[key] = value - - return new_state_dict - - -def read_in_k_v(state_dict, config): - # for each of the encoder blocks: - for i in range(config.num_encoder_blocks): - for j in range(config.depths[i]): - # read in weights + bias of keys and values (which is a single matrix in the original implementation) - kv_weight = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.weight") - kv_bias = state_dict.pop(f"glpn.encoder.block.{i}.{j}.attention.self.kv.bias") - # next, add keys and values (in that order) to the state dict - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.weight"] = kv_weight[ - : config.hidden_sizes[i], : - ] - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.key.bias"] = kv_bias[: config.hidden_sizes[i]] - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.weight"] = kv_weight[ - config.hidden_sizes[i] :, : - ] - state_dict[f"glpn.encoder.block.{i}.{j}.attention.self.value.bias"] = kv_bias[config.hidden_sizes[i] :] - - -# We will verify our results on a COCO image -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw) - - return image - - -@torch.no_grad() -def convert_glpn_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub=False, model_name=None): - """ - Copy/paste/tweak model's weights to our GLPN structure. - """ - - # load GLPN configuration (Segformer-B4 size) - config = GLPNConfig(hidden_sizes=[64, 128, 320, 512], decoder_hidden_size=64, depths=[3, 8, 27, 3]) - - # load image processor (only resize + rescale) - image_processor = GLPNImageProcessor() - - # prepare image - image = prepare_img() - pixel_values = image_processor(images=image, return_tensors="pt").pixel_values - - logger.info("Converting model...") - - # load original state dict - state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu"), weights_only=True) - - # rename keys - state_dict = rename_keys(state_dict) - - # key and value matrices need special treatment - read_in_k_v(state_dict, config) - - # create HuggingFace model and load state dict - model = GLPNForDepthEstimation(config) - model.load_state_dict(state_dict) - model.eval() - - # forward pass - outputs = model(pixel_values) - predicted_depth = outputs.predicted_depth - - # verify output - if model_name is not None: - if "nyu" in model_name: - expected_slice = torch.tensor( - [[4.4147, 4.0873, 4.0673], [3.7890, 3.2881, 3.1525], [3.7674, 3.5423, 3.4913]] - ) - elif "kitti" in model_name: - expected_slice = torch.tensor( - [[3.4291, 2.7865, 2.5151], [3.2841, 2.7021, 2.3502], [3.1147, 2.4625, 2.2481]] - ) - else: - raise ValueError(f"Unknown model name: {model_name}") - - expected_shape = torch.Size([1, 480, 640]) - - assert predicted_depth.shape == expected_shape - assert torch.allclose(predicted_depth[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - # finally, push to hub if required - if push_to_hub: - logger.info("Pushing model and image processor to the hub...") - model.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add model", - use_temp_dir=True, - ) - image_processor.push_to_hub( - repo_path_or_name=Path(pytorch_dump_folder_path, model_name), - organization="nielsr", - commit_message="Add image processor", - use_temp_dir=True, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint_path", - default=None, - type=str, - help="Path to the original PyTorch checkpoint (.pth file).", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether to upload the model to the HuggingFace hub." - ) - parser.add_argument( - "--model_name", - default="glpn-kitti", - type=str, - help="Name of the model in case you're pushing to the hub.", - ) - args = parser.parse_args() - convert_glpn_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub, args.model_name) diff --git a/src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py b/src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py deleted file mode 100644 index 2bf4b3ac7d..0000000000 --- a/src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import gc -import glob -import os -from typing import List, Optional - -import regex as re -import torch -from huggingface_hub import snapshot_download -from safetensors import safe_open - -from transformers import ( - GotOcr2Config, - GotOcr2ForConditionalGeneration, - GotOcr2ImageProcessor, - GotOcr2Processor, - PreTrainedTokenizerFast, - is_vision_available, -) -from transformers.convert_slow_tokenizer import TikTokenConverter -from transformers.tokenization_utils import AddedToken - - -if is_vision_available(): - from transformers.image_utils import load_image - - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # Vision encoder mapping - r"model.vision_tower_high.pos_embed": r"vision_tower.pos_embed", - r"model.vision_tower_high.patch_embed.proj": r"vision_tower.patch_embed.projection", - r"model.vision_tower_high.blocks.(\d+).norm": r"vision_tower.layers.\1.layer_norm", - r"model.vision_tower_high.blocks.(\d+).attn": r"vision_tower.layers.\1.attn", - r"model.vision_tower_high.blocks.(\d+).mlp": r"vision_tower.layers.\1.mlp", - r"model.vision_tower_high.neck.0": r"vision_tower.neck.conv1", - r"model.vision_tower_high.neck.1": r"vision_tower.neck.layer_norm1", - r"model.vision_tower_high.neck.2": r"vision_tower.neck.conv2", - r"model.vision_tower_high.neck.3": r"vision_tower.neck.layer_norm2", - r"model.vision_tower_high.net_(\d+)": lambda m: f"multi_modal_projector.conv_upsampler{int(m.group(1)) - 1}", - r"model.mm_projector_vary" : r"multi_modal_projector.multimodal_projector", - r"model.": r"language_model.model.", - r"lm_head": r"language_model.lm_head", -} -# fmt: on - -CONTEXT_LENGTH = 8000 - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - """ - This function should be applied only once, on the concatenated keys to efficiently rename using - the key mappings. - """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - new_text = re.sub(pattern, replacement, new_text) - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - return output_dict - - -def load_original_state_dict(model_id): - directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) - - original_state_dict = {} - for path in glob.glob(f"{directory_path}/*"): - if path.endswith(".safetensors"): - with safe_open(path, framework="pt", device="cpu") as f: - for key in f.keys(): - original_state_dict[key] = f.get_tensor(key) - - return original_state_dict - - -def get_got_ocr2_config(): - config = GotOcr2Config() - - return config - - -def write_model( - model_path, - input_base_path, - push_to_hub=False, -): - os.makedirs(model_path, exist_ok=True) - - config = get_got_ocr2_config() - config.architectures = ["GotOcr2ForConditionalGeneration"] - config.save_pretrained(model_path) - print("Model config saved successfully...") - - # ------------------------------------------------------------ - # Convert weights - # ------------------------------------------------------------ - - print(f"Fetching all parameters from the checkpoint at {input_base_path}...") - state_dict_old = load_original_state_dict(input_base_path) - print("Converting model...") - all_keys = list(state_dict_old.keys()) - new_keys = convert_old_keys_to_new_keys(all_keys) - state_dict = {} - for key in all_keys: - new_key = new_keys[key] - state_dict[new_key] = state_dict_old[key] - - del state_dict_old - gc.collect() - - print("Loading the checkpoint in a GotOcr2ForConditionalGeneration model.") - model = GotOcr2ForConditionalGeneration(config) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) - model = model.to(torch.bfloat16) - print("model dtype:", model.dtype) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - print("Saving the model.") - model.save_pretrained(model_path) - if push_to_hub: - model.push_to_hub("stepfun-ai/GOT-OCR-2.0-hf", use_temp_dir=True) - del state_dict, model - - # Safety check: reload the converted model - gc.collect() - print("Reloading the model to check if it's saved correctly.") - model = GotOcr2ForConditionalGeneration.from_pretrained(model_path, device_map="auto") - processor = GotOcr2Processor.from_pretrained(model_path) - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg" - ) - - inputs = processor(image, return_tensors="pt", format=True).to(model.device, dtype=model.dtype) - generate_ids = model.generate(**inputs, do_sample=False, num_beams=1, max_new_tokens=4) - decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True) - expected_output = "\\title{\nR" - print("Decoded output:", decoded_output) - assert decoded_output == expected_output - print("Model reloaded successfully.") - del model - - -class GotOcr2Converter(TikTokenConverter): - def __init__( - self, - vocab_file, - special_tokens: List[str], - pattern: str, - model_max_length: int, - chat_template: Optional[str] = None, - **kwargs, - ): - super().__init__(vocab_file, pattern=pattern) - self.additional_special_tokens = special_tokens - tokenizer = self.converted() - if chat_template is not None: - kwargs["chat_template"] = chat_template - self.tokenizer = PreTrainedTokenizerFast( - tokenizer_object=tokenizer, - model_input_names=["input_ids", "attention_mask"], - model_max_length=model_max_length, - **kwargs, - ) - - -def write_tokenizer(tokenizer_path: str, save_dir: str, push_to_hub: bool = False): - model_max_length = CONTEXT_LENGTH - pattern = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: W605 - # Special tokens - special_tokens = ( - ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] - + [f"<|extra_{i}|>" for i in range(205)] - + [ - "", - "", - "", - "", - "", - "", - "", - "", - "", - ] - ) - - pad_token = "<|endoftext|>" - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False, normalized=False, single_word=False) - - converter = GotOcr2Converter( - vocab_file=tokenizer_path, - pattern=pattern, - special_tokens=special_tokens, - model_max_length=model_max_length, - pad_token=pad_token, - bos_token="<|endoftext|>", - eos_token="<|endoftext|>", - clean_up_tokenization_spaces=True, - ) - tokenizer = converter.tokenizer - tokenizer.save_pretrained(save_dir) - - if push_to_hub: - tokenizer.push_to_hub("stepfun-ai/GOT-OCR-2.0-hf", use_temp_dir=True) - - -def write_image_processor(save_dir: str, push_to_hub: bool = False): - image_processor = GotOcr2ImageProcessor( - do_resize=True, - size={"height": 1024, "width": 1024}, - do_rescale=True, - rescale_factor=1 / 255, - do_normalize=True, - image_mean=[0.48145466, 0.4578275, 0.40821073], - image_std=[0.26862954, 0.26130258, 0.27577711], - ) - - image_processor.save_pretrained(save_dir) - if push_to_hub: - image_processor.push_to_hub("stepfun-ai/GOT-OCR-2.0-hf", use_temp_dir=True) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--input_dir", - default="stepfun-ai/GOT-OCR2_0", - help="Location of LLaMA weights, which contains tokenizer.model and model folders", - ) - parser.add_argument( - "--output_dir", - default="GotOcr2", - help="Location to write HF model and tokenizer", - ) - - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - args = parser.parse_args() - write_tokenizer( - tokenizer_path="qwen.tiktoken", - save_dir=args.output_dir, - push_to_hub=args.push_to_hub, - ) - - write_image_processor( - save_dir=args.output_dir, - push_to_hub=args.push_to_hub, - ) - write_model( - model_path=args.output_dir, - input_base_path=args.input_dir, - push_to_hub=args.push_to_hub, - ) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index 33f9dabed0..0000000000 --- a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert OpenAI GPT checkpoint.""" - -import argparse - -import torch - -from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2 -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - - -def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): - # Construct model - if gpt2_config_file == "": - config = GPT2Config() - else: - config = GPT2Config.from_json_file(gpt2_config_file) - model = GPT2Model(config) - - # Load weights from numpy - load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) - - # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - parser.add_argument( - "--gpt2_config_file", - default="", - type=str, - help=( - "An optional config json file corresponding to the pre-trained OpenAI model. \n" - "This specifies the model architecture." - ), - ) - args = parser.parse_args() - convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, args.gpt2_config_file, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py deleted file mode 100644 index 3db2285729..0000000000 --- a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Eleuther AI and HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GPT Neo checkpoint.""" - -import argparse -import json - -from transformers import GPTNeoConfig, GPTNeoForCausalLM, load_tf_weights_in_gpt_neo -from transformers.utils import logging - - -logging.set_verbosity_info() - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): - # Initialise PyTorch model - config_json = json.load(open(config_file, "r")) - config = GPTNeoConfig( - hidden_size=config_json["n_embd"], - num_layers=config_json["n_layer"], - num_heads=config_json["n_head"], - attention_types=config_json["attention_types"], - max_position_embeddings=config_json["n_positions"], - resid_dropout=config_json["res_dropout"], - embed_dropout=config_json["embed_dropout"], - attention_dropout=config_json["attn_dropout"], - ) - print(f"Building PyTorch model from configuration: {config}") - model = GPTNeoForCausalLM(config) - - # Load weights from tf checkpoint - load_tf_weights_in_gpt_neo(model, config, tf_checkpoint_path) - - # Save pytorch-model - print(f"Save PyTorch model to {pytorch_dump_path}") - model.save_pretrained(pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help=( - "The config json file corresponding to the pre-trained mesh-tf model. \n" - "This specifies the model architecture." - ), - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py b/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py deleted file mode 100644 index c4e2ff67c5..0000000000 --- a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright 2022 The HuggingFace Inc. team and the AI-Sweden team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert GPT-SW3 megatron checkpoints to pytorch""" - -import argparse -import os -from os.path import isfile - -import torch - -from transformers import GPT2Config - - -def recursive_print(name, val, spaces=0): - # Format the message. - if name is None: - msg = None - else: - fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" - msg = fmt.format(name) - - # Print and recurse (if needed). - if isinstance(val, dict): - if msg is not None: - print(msg) - for k in val.keys(): - recursive_print(k, val[k], spaces + 2) - elif isinstance(val, torch.Tensor): - print(msg, ":", val.size()) - else: - print(msg, ":", val) - - -def fix_query_key_value_ordering(param, num_splits, num_heads, hidden_size): - # Permutes layout of param tensor to [num_splits * num_heads * hidden_size, :] - # for compatibility with later versions of NVIDIA Megatron-LM. - # The inverse operation is performed inside Megatron-LM to read checkpoints: - # https://github.com/NVIDIA/Megatron-LM/blob/v2.4/megatron/checkpointing.py#L209 - # If param is the weight tensor of the self-attention block, the returned tensor - # will have to be transposed one more time to be read by HuggingFace GPT2. - input_shape = param.size() - # other versions store [num_heads * num_splits * hidden_size, :] - saved_shape = (num_heads, num_splits, hidden_size) + input_shape[1:] - param = param.view(*saved_shape) - param = param.transpose(0, 1).contiguous() - param = param.view(*input_shape) - return param - - -def convert_megatron_checkpoint(sd_megatron, config): - """ - Converts a Megatron checkpoint to a HuggingFace GPT-SW3 checkpoint. - """ - n_positions = config.n_positions - layers = config.n_layer - vocab_size = config.vocab_size - heads = config.n_head - hidden_size_per_head = config.n_embd // config.n_head - - word_embeddings = sd_megatron["model.language_model.embedding.word_embeddings.weight"][:vocab_size, :] - sd_hf = { - "transformer.wte.weight": word_embeddings, - "transformer.wpe.weight": sd_megatron["model.language_model.embedding.position_embeddings.weight"], - "transformer.ln_f.weight": sd_megatron["model.language_model.encoder.final_layernorm.weight"], - "transformer.ln_f.bias": sd_megatron["model.language_model.encoder.final_layernorm.bias"], - } - - pf = "model.language_model.encoder.layers." - for i in range(layers): - causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.bool)) - causal_mask = causal_mask.view(1, 1, n_positions, n_positions) - sd_hf[f"transformer.h.{i}.attn.bias"] = causal_mask - sd_hf[f"transformer.h.{i}.attn.masked_bias"] = torch.tensor(-1e4, dtype=torch.bfloat16) - - sd_hf[f"transformer.h.{i}.ln_1.weight"] = sd_megatron[f"{pf}{i}.input_layernorm.weight"] - sd_hf[f"transformer.h.{i}.ln_1.bias"] = sd_megatron[f"{pf}{i}.input_layernorm.bias"] - - val1 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.weight"] - val1 = fix_query_key_value_ordering(val1, 3, heads, hidden_size_per_head) - sd_hf[f"transformer.h.{i}.attn.c_attn.weight"] = val1.transpose(0, 1).contiguous() - - val2 = sd_megatron[f"{pf}{i}.self_attention.query_key_value.bias"] - val2 = fix_query_key_value_ordering(val2, 3, heads, hidden_size_per_head) - sd_hf[f"transformer.h.{i}.attn.c_attn.bias"] = val2 - - sd_hf[f"transformer.h.{i}.attn.c_proj.weight"] = sd_megatron[f"{pf}{i}.self_attention.dense.weight"].transpose( - 0, 1 - ) - sd_hf[f"transformer.h.{i}.attn.c_proj.bias"] = sd_megatron[f"{pf}{i}.self_attention.dense.bias"] - sd_hf[f"transformer.h.{i}.ln_2.weight"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.weight"] - sd_hf[f"transformer.h.{i}.ln_2.bias"] = sd_megatron[f"{pf}{i}.post_attention_layernorm.bias"] - sd_hf[f"transformer.h.{i}.mlp.c_fc.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.weight"].transpose(0, 1) - sd_hf[f"transformer.h.{i}.mlp.c_fc.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_h_to_4h.bias"] - sd_hf[f"transformer.h.{i}.mlp.c_proj.weight"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.weight"].transpose( - 0, 1 - ) - sd_hf[f"transformer.h.{i}.mlp.c_proj.bias"] = sd_megatron[f"{pf}{i}.mlp.dense_4h_to_h.bias"] - - # For LM head, transformers' wants the matrix to weight embeddings. - sd_hf["lm_head.weight"] = word_embeddings - - return sd_hf - - -def copy_config(config_hf, config_megatron): - """Copy the config from Megatron to hf.""" - config_hf.vocab_size = 64000 - config_hf.n_positions = config_megatron["encoder_seq_length"] - config_hf.n_embd = config_megatron["hidden_size"] - config_hf.n_layer = config_megatron["num_layers"] - config_hf.n_head = config_megatron["num_attention_heads"] - config_hf.n_inner = config_megatron["ffn_hidden_size"] - config_hf.activation_function = "gelu" - config_hf.resid_pdrop = 0.1 - config_hf.embd_pdrop = 0.1 - config_hf.attn_pdrop = 0.1 - config_hf.layer_norm_epsilon = config_megatron["layernorm_epsilon"] # 1e-5 - config_hf.initializer_range = config_megatron["init_method_std"] # 0.02 - config_hf.apply_query_key_layer_scaling = config_megatron["apply_query_key_layer_scaling"] # True - config_hf.normalize_attention_scores = True - config_hf.use_cache = True - - # This identifies the 6.7B (7B) model which uses a different tokenizer - if config_megatron["hidden_size"] == 4096: - config_hf.bos_token_id = 1 # <|endoftext|> - config_hf.eos_token_id = 1 # <|endoftext|> - config_hf.pad_token_id = 0 # - else: - config_hf.bos_token_id = 2 # - config_hf.eos_token_id = 3 # <|endoftext|> - config_hf.pad_token_id = 0 # - - return config_hf - - -def main(args): - print(args) - - checkpoint_path = args.checkpoint_path - save_path = args.save_path - if isfile(checkpoint_path): - raise FileNotFoundError(f"ERROR! could not find file {checkpoint_path}") - - # Load the model. - checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - - # Load the config. - config_megatron = checkpoint["hyper_parameters"]["cfg"] - config_hf = GPT2Config() - config_hf = copy_config(config_hf=config_hf, config_megatron=config_megatron) - config_hf.architectures = ["GPT2LMHeadModel"] - - sd_megatron = checkpoint["state_dict"] - - # Convert. - print("Converting") - sd_hf = convert_megatron_checkpoint(sd_megatron, config_hf) - - # Print the structure of converted state dict. - if args.print_checkpoint_structure: - recursive_print(None, sd_hf) - - config_hf.tokenizer_class = "GPTSw3Tokenizer" - - # Store the config to file. - print("Saving config") - config_hf.save_pretrained(save_path) - - # Store the state_dict to file. - output_checkpoint_file = os.path.join(save_path, "pytorch_model.bin") - print(f'Saving checkpoint to "{output_checkpoint_file}"') - torch.save(sd_hf, output_checkpoint_file) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--checkpoint_path", - type=str, - required=True, - help="e.g. megatron_gpt--val_loss=2.42-step=38000-consumed_samples=54720000", - ) - parser.add_argument("--save_path", type=str, required=True, help="e.g. /home/user/gpt-sw3/hf") - parser.add_argument("--print-checkpoint-structure", action="store_true") - _args = parser.parse_args() - main(_args) diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py deleted file mode 100644 index 84998cfdef..0000000000 --- a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py +++ /dev/null @@ -1,491 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Grounding DINO checkpoints from the original repository. - -URL: https://github.com/IDEA-Research/GroundingDINO""" - -import argparse - -import requests -import torch -from PIL import Image -from torchvision import transforms as T - -from transformers import ( - AutoTokenizer, - GroundingDinoConfig, - GroundingDinoForObjectDetection, - GroundingDinoImageProcessor, - GroundingDinoProcessor, - SwinConfig, -) - - -IMAGENET_MEAN = [0.485, 0.456, 0.406] -IMAGENET_STD = [0.229, 0.224, 0.225] - - -def get_grounding_dino_config(model_name): - if "tiny" in model_name: - window_size = 7 - embed_dim = 96 - depths = (2, 2, 6, 2) - num_heads = (3, 6, 12, 24) - image_size = 224 - elif "base" in model_name: - window_size = 12 - embed_dim = 128 - depths = (2, 2, 18, 2) - num_heads = (4, 8, 16, 32) - image_size = 384 - else: - raise ValueError("Model not supported, only supports base and large variants") - - backbone_config = SwinConfig( - window_size=window_size, - image_size=image_size, - embed_dim=embed_dim, - depths=depths, - num_heads=num_heads, - out_indices=[2, 3, 4], - ) - - config = GroundingDinoConfig(backbone_config=backbone_config) - - return config - - -def create_rename_keys(state_dict, config): - rename_keys = [] - # fmt: off - ########################################## VISION BACKBONE - START - # patch embedding layer - rename_keys.append(("backbone.0.patch_embed.proj.weight", - "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("backbone.0.patch_embed.proj.bias", - "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias")) - rename_keys.append(("backbone.0.patch_embed.norm.weight", - "model.backbone.conv_encoder.model.embeddings.norm.weight")) - rename_keys.append(("backbone.0.patch_embed.norm.bias", - "model.backbone.conv_encoder.model.embeddings.norm.bias")) - - for layer, depth in enumerate(config.backbone_config.depths): - for block in range(depth): - # layernorms - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias")) - - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias")) - # attention - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias")) - # intermediate - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) - - # output - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) - - # downsample - if layer!=len(config.backbone_config.depths)-1: - rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) - rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) - - for out_indice in config.backbone_config.out_indices: - # Grounding DINO implementation of out_indices isn't aligned with transformers - rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", - f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) - rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", - f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) - - ########################################## VISION BACKBONE - END - - ########################################## ENCODER - START - deformable_key_mappings = { - 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', - 'self_attn.sampling_offsets.bias': 'deformable_layer.self_attn.sampling_offsets.bias', - 'self_attn.attention_weights.weight': 'deformable_layer.self_attn.attention_weights.weight', - 'self_attn.attention_weights.bias': 'deformable_layer.self_attn.attention_weights.bias', - 'self_attn.value_proj.weight': 'deformable_layer.self_attn.value_proj.weight', - 'self_attn.value_proj.bias': 'deformable_layer.self_attn.value_proj.bias', - 'self_attn.output_proj.weight': 'deformable_layer.self_attn.output_proj.weight', - 'self_attn.output_proj.bias': 'deformable_layer.self_attn.output_proj.bias', - 'norm1.weight': 'deformable_layer.self_attn_layer_norm.weight', - 'norm1.bias': 'deformable_layer.self_attn_layer_norm.bias', - 'linear1.weight': 'deformable_layer.fc1.weight', - 'linear1.bias': 'deformable_layer.fc1.bias', - 'linear2.weight': 'deformable_layer.fc2.weight', - 'linear2.bias': 'deformable_layer.fc2.bias', - 'norm2.weight': 'deformable_layer.final_layer_norm.weight', - 'norm2.bias': 'deformable_layer.final_layer_norm.bias', - } - text_enhancer_key_mappings = { - 'self_attn.in_proj_weight': 'text_enhancer_layer.self_attn.in_proj_weight', - 'self_attn.in_proj_bias': 'text_enhancer_layer.self_attn.in_proj_bias', - 'self_attn.out_proj.weight': 'text_enhancer_layer.self_attn.out_proj.weight', - 'self_attn.out_proj.bias': 'text_enhancer_layer.self_attn.out_proj.bias', - 'linear1.weight': 'text_enhancer_layer.fc1.weight', - 'linear1.bias': 'text_enhancer_layer.fc1.bias', - 'linear2.weight': 'text_enhancer_layer.fc2.weight', - 'linear2.bias': 'text_enhancer_layer.fc2.bias', - 'norm1.weight': 'text_enhancer_layer.layer_norm_before.weight', - 'norm1.bias': 'text_enhancer_layer.layer_norm_before.bias', - 'norm2.weight': 'text_enhancer_layer.layer_norm_after.weight', - 'norm2.bias': 'text_enhancer_layer.layer_norm_after.bias', - } - fusion_key_mappings = { - 'gamma_v': 'fusion_layer.vision_param', - 'gamma_l': 'fusion_layer.text_param', - 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', - 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', - 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', - 'layer_norm_l.bias': 'fusion_layer.layer_norm_text.bias', - 'attn.v_proj.weight': 'fusion_layer.attn.vision_proj.weight', - 'attn.v_proj.bias': 'fusion_layer.attn.vision_proj.bias', - 'attn.l_proj.weight': 'fusion_layer.attn.text_proj.weight', - 'attn.l_proj.bias': 'fusion_layer.attn.text_proj.bias', - 'attn.values_v_proj.weight': 'fusion_layer.attn.values_vision_proj.weight', - 'attn.values_v_proj.bias': 'fusion_layer.attn.values_vision_proj.bias', - 'attn.values_l_proj.weight': 'fusion_layer.attn.values_text_proj.weight', - 'attn.values_l_proj.bias': 'fusion_layer.attn.values_text_proj.bias', - 'attn.out_v_proj.weight': 'fusion_layer.attn.out_vision_proj.weight', - 'attn.out_v_proj.bias': 'fusion_layer.attn.out_vision_proj.bias', - 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', - 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', - } - for layer in range(config.encoder_layers): - # deformable - for src, dest in deformable_key_mappings.items(): - rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - # text enhance - for src, dest in text_enhancer_key_mappings.items(): - rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - # fusion layers - for src, dest in fusion_key_mappings.items(): - rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", - f"model.encoder.layers.{layer}.{dest}")) - ########################################## ENCODER - END - - ########################################## DECODER - START - key_mappings_decoder = { - 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', - 'cross_attn.sampling_offsets.bias': 'encoder_attn.sampling_offsets.bias', - 'cross_attn.attention_weights.weight': 'encoder_attn.attention_weights.weight', - 'cross_attn.attention_weights.bias': 'encoder_attn.attention_weights.bias', - 'cross_attn.value_proj.weight': 'encoder_attn.value_proj.weight', - 'cross_attn.value_proj.bias': 'encoder_attn.value_proj.bias', - 'cross_attn.output_proj.weight': 'encoder_attn.output_proj.weight', - 'cross_attn.output_proj.bias': 'encoder_attn.output_proj.bias', - 'norm1.weight': 'encoder_attn_layer_norm.weight', - 'norm1.bias': 'encoder_attn_layer_norm.bias', - 'ca_text.in_proj_weight': 'encoder_attn_text.in_proj_weight', - 'ca_text.in_proj_bias': 'encoder_attn_text.in_proj_bias', - 'ca_text.out_proj.weight': 'encoder_attn_text.out_proj.weight', - 'ca_text.out_proj.bias': 'encoder_attn_text.out_proj.bias', - 'catext_norm.weight': 'encoder_attn_text_layer_norm.weight', - 'catext_norm.bias': 'encoder_attn_text_layer_norm.bias', - 'self_attn.in_proj_weight': 'self_attn.in_proj_weight', - 'self_attn.in_proj_bias': 'self_attn.in_proj_bias', - 'self_attn.out_proj.weight': 'self_attn.out_proj.weight', - 'self_attn.out_proj.bias': 'self_attn.out_proj.bias', - 'norm2.weight': 'self_attn_layer_norm.weight', - 'norm2.bias': 'self_attn_layer_norm.bias', - 'linear1.weight': 'fc1.weight', - 'linear1.bias': 'fc1.bias', - 'linear2.weight': 'fc2.weight', - 'linear2.bias': 'fc2.bias', - 'norm3.weight': 'final_layer_norm.weight', - 'norm3.bias': 'final_layer_norm.bias', - } - for layer_num in range(config.decoder_layers): - source_prefix_decoder = f'transformer.decoder.layers.{layer_num}.' - target_prefix_decoder = f'model.decoder.layers.{layer_num}.' - - for source_name, target_name in key_mappings_decoder.items(): - rename_keys.append((source_prefix_decoder + source_name, - target_prefix_decoder + target_name)) - ########################################## DECODER - END - - ########################################## Additional - START - for layer_name, params in state_dict.items(): - #### TEXT BACKBONE - if "bert" in layer_name: - rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE - if "input_proj" in layer_name: - rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) - #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE - if "feat_map" in layer_name: - rename_keys.append((layer_name, layer_name.replace("feat_map", "model.text_projection"))) - #### DECODER REFERENCE POINT HEAD - if "transformer.decoder.ref_point_head" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", - "model.decoder.reference_points_head"))) - #### DECODER BBOX EMBED - if "transformer.decoder.bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", - "model.decoder.bbox_embed"))) - if "transformer.enc_output" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) - - if "transformer.enc_out_bbox_embed" in layer_name: - rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", - "model.encoder_output_bbox_embed"))) - - rename_keys.append(("transformer.level_embed", "model.level_embed")) - rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) - rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) - rename_keys.append(("transformer.tgt_embed.weight", "model.query_position_embeddings.weight")) - ########################################## Additional - END - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v_encoder(state_dict, config): - ########################################## VISION BACKBONE - START - embed_dim = config.backbone_config.embed_dim - for layer, depth in enumerate(config.backbone_config.depths): - hidden_size = embed_dim * 2**layer - for block in range(depth): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight" - ] = in_proj_weight[:hidden_size, :] - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias" - ] = in_proj_bias[:hidden_size] - - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight" - ] = in_proj_weight[hidden_size : hidden_size * 2, :] - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias" - ] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight" - ] = in_proj_weight[-hidden_size:, :] - state_dict[ - f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias" - ] = in_proj_bias[-hidden_size:] - ########################################## VISION BACKBONE - END - - -def read_in_q_k_v_text_enhancer(state_dict, config): - hidden_size = config.hidden_size - for idx in range(config.encoder_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.weight"] = in_proj_weight[ - :hidden_size, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.weight"] = in_proj_weight[ - -hidden_size:, : - ] - state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.bias"] = in_proj_bias[ - -hidden_size: - ] - - -def read_in_q_k_v_decoder(state_dict, config): - hidden_size = config.hidden_size - for idx in range(config.decoder_layers): - # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{idx}.self_attn.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{idx}.self_attn.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.decoder.layers.{idx}.self_attn.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{idx}.self_attn.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] - - state_dict[f"model.decoder.layers.{idx}.self_attn.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{idx}.self_attn.value.bias"] = in_proj_bias[-hidden_size:] - - # read in weights + bias of cross-attention - in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_weight") - in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_bias") - - # next, add query, keys and values (in that order) to the state dict - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.weight"] = in_proj_weight[:hidden_size, :] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.bias"] = in_proj_bias[:hidden_size] - - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.weight"] = in_proj_weight[ - hidden_size : hidden_size * 2, : - ] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.bias"] = in_proj_bias[ - hidden_size : hidden_size * 2 - ] - - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.weight"] = in_proj_weight[-hidden_size:, :] - state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.bias"] = in_proj_bias[-hidden_size:] - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image - - -def preprocess_caption(caption: str) -> str: - result = caption.lower().strip() - if result.endswith("."): - return result - return result + "." - - -@torch.no_grad() -def convert_grounding_dino_checkpoint(args): - model_name = args.model_name - pytorch_dump_folder_path = args.pytorch_dump_folder_path - push_to_hub = args.push_to_hub - verify_logits = args.verify_logits - - checkpoint_mapping = { - "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth", - "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth", - } - # Define default GroundingDino configuration - config = get_grounding_dino_config(model_name) - - # Load original checkpoint - checkpoint_url = checkpoint_mapping[model_name] - original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} - - for name, param in original_state_dict.items(): - print(name, param.shape) - - # Rename keys - new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(original_state_dict, config) - - for src, dest in rename_keys: - rename_key(new_state_dict, src, dest) - read_in_q_k_v_encoder(new_state_dict, config) - read_in_q_k_v_text_enhancer(new_state_dict, config) - read_in_q_k_v_decoder(new_state_dict, config) - - # Load HF model - model = GroundingDinoForObjectDetection(config) - model.eval() - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - # Load and process test image - image = prepare_img() - transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) - original_pixel_values = transforms(image).unsqueeze(0) - - image_processor = GroundingDinoImageProcessor() - tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") - processor = GroundingDinoProcessor(image_processor=image_processor, tokenizer=tokenizer) - - text = "a cat" - inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt") - - assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4) - - if verify_logits: - # Running forward - with torch.no_grad(): - outputs = model(**inputs) - - print(outputs.logits[0, :3, :3]) - - expected_slice = torch.tensor( - [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]] - ) - - assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4) - print("Looks ok!") - - if pytorch_dump_folder_path is not None: - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model.push_to_hub(f"EduardoPacheco/{model_name}") - processor.push_to_hub(f"EduardoPacheco/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="grounding-dino-tiny", - type=str, - choices=["grounding-dino-tiny", "grounding-dino-base"], - help="Name of the GroundingDino model you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - parser.add_argument( - "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." - ) - - args = parser.parse_args() - convert_grounding_dino_checkpoint(args) diff --git a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py b/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py deleted file mode 100644 index 6bc2818498..0000000000 --- a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py +++ /dev/null @@ -1,217 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Convert GroupViT checkpoints from the original repository. - -URL: https://github.com/NVlabs/GroupViT -""" - -import argparse - -import requests -import torch -from PIL import Image - -from transformers import CLIPProcessor, GroupViTConfig, GroupViTModel - - -def rename_key(name): - # vision encoder - if "img_encoder.pos_embed" in name: - name = name.replace("img_encoder.pos_embed", "vision_model.embeddings.position_embeddings") - if "img_encoder.patch_embed.proj" in name: - name = name.replace("img_encoder.patch_embed.proj", "vision_model.embeddings.patch_embeddings.projection") - if "img_encoder.patch_embed.norm" in name: - name = name.replace("img_encoder.patch_embed.norm", "vision_model.embeddings.layernorm") - if "img_encoder.layers" in name: - name = name.replace("img_encoder.layers", "vision_model.encoder.stages") - if "blocks" in name and "res" not in name: - name = name.replace("blocks", "layers") - if "attn" in name and "pre_assign" not in name: - name = name.replace("attn", "self_attn") - if "proj" in name and "self_attn" in name and "text" not in name: - name = name.replace("proj", "out_proj") - if "pre_assign_attn.attn.proj" in name: - name = name.replace("pre_assign_attn.attn.proj", "pre_assign_attn.attn.out_proj") - if "norm1" in name: - name = name.replace("norm1", "layer_norm1") - if "norm2" in name and "pre_assign" not in name: - name = name.replace("norm2", "layer_norm2") - if "img_encoder.norm" in name: - name = name.replace("img_encoder.norm", "vision_model.layernorm") - # text encoder - if "text_encoder.token_embedding" in name: - name = name.replace("text_encoder.token_embedding", "text_model.embeddings.token_embedding") - if "text_encoder.positional_embedding" in name: - name = name.replace("text_encoder.positional_embedding", "text_model.embeddings.position_embedding.weight") - if "text_encoder.transformer.resblocks." in name: - name = name.replace("text_encoder.transformer.resblocks.", "text_model.encoder.layers.") - if "ln_1" in name: - name = name.replace("ln_1", "layer_norm1") - if "ln_2" in name: - name = name.replace("ln_2", "layer_norm2") - if "c_fc" in name: - name = name.replace("c_fc", "fc1") - if "c_proj" in name: - name = name.replace("c_proj", "fc2") - if "text_encoder" in name: - name = name.replace("text_encoder", "text_model") - if "ln_final" in name: - name = name.replace("ln_final", "final_layer_norm") - # projection layers - if "img_projector.linear_hidden." in name: - name = name.replace("img_projector.linear_hidden.", "visual_projection.") - if "img_projector.linear_out." in name: - name = name.replace("img_projector.linear_out.", "visual_projection.3.") - if "text_projector.linear_hidden" in name: - name = name.replace("text_projector.linear_hidden", "text_projection") - if "text_projector.linear_out" in name: - name = name.replace("text_projector.linear_out", "text_projection.3") - - return name - - -def convert_state_dict(orig_state_dict, config): - for key in orig_state_dict.copy().keys(): - val = orig_state_dict.pop(key) - - if "qkv" in key: - # weights and biases of the key, value and query projections of vision encoder's attention layers require special treatment: - # we need to split them up into separate matrices/vectors - key_split = key.split(".") - stage_num, layer_num = int(key_split[2]), int(key_split[4]) - dim = config.vision_config.hidden_size - if "weight" in key: - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.q_proj.weight" - ] = val[:dim, :] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.k_proj.weight" - ] = val[dim : dim * 2, :] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.v_proj.weight" - ] = val[-dim:, :] - else: - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.q_proj.bias" - ] = val[:dim] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.k_proj.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"vision_model.encoder.stages.{stage_num}.layers.{layer_num}.self_attn.v_proj.bias" - ] = val[-dim:] - elif "in_proj" in key: - # weights and biases of the key, value and query projections of text encoder's attention layers require special treatment: - # we need to split them up into separate matrices/vectors - key_split = key.split(".") - layer_num = int(key_split[3]) - dim = config.text_config.hidden_size - if "weight" in key: - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.weight"] = val[:dim, :] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.weight"] = val[ - dim : dim * 2, : - ] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.weight"] = val[-dim:, :] - else: - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.q_proj.bias"] = val[:dim] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.k_proj.bias"] = val[dim : dim * 2] - orig_state_dict[f"text_model.encoder.layers.{layer_num}.self_attn.v_proj.bias"] = val[-dim:] - else: - new_name = rename_key(key) - # squeeze if necessary - if ( - "text_projection.0" in new_name - or "text_projection.3" in new_name - or "visual_projection.0" in new_name - or "visual_projection.3" in new_name - ): - orig_state_dict[new_name] = val.squeeze_() - else: - orig_state_dict[new_name] = val - - return orig_state_dict - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -@torch.no_grad() -def convert_groupvit_checkpoint( - checkpoint_path, pytorch_dump_folder_path, model_name="groupvit-gcc-yfcc", push_to_hub=False -): - """ - Copy/paste/tweak model's weights to the Transformers design. - """ - config = GroupViTConfig() - model = GroupViTModel(config).eval() - - state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["model"] - new_state_dict = convert_state_dict(state_dict, config) - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - assert missing_keys == ["text_model.embeddings.position_ids"] - assert (unexpected_keys == ["multi_label_logit_scale"]) or (len(unexpected_keys) == 0) - - # verify result - processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") - image = prepare_img() - inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt") - - with torch.no_grad(): - outputs = model(**inputs) - - if model_name == "groupvit-gcc-yfcc": - expected_logits = torch.tensor([[13.3523, 6.3629]]) - elif model_name == "groupvit-gcc-redcaps": - expected_logits = torch.tensor([[16.1873, 8.6230]]) - else: - raise ValueError(f"Model name {model_name} not supported.") - assert torch.allclose(outputs.logits_per_image, expected_logits, atol=1e-3) - - processor.save_pretrained(pytorch_dump_folder_path) - model.save_pretrained(pytorch_dump_folder_path) - print("Successfully saved processor and model to", pytorch_dump_folder_path) - - if push_to_hub: - print("Pushing to the hub...") - processor.push_to_hub(model_name, organization="nielsr") - model.push_to_hub(model_name, organization="nielsr") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to dump the processor and PyTorch model." - ) - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to GroupViT checkpoint") - parser.add_argument( - "--model_name", - default="groupvit-gccy-fcc", - type=str, - help="Name of the model. Expecting either 'groupvit-gcc-yfcc' or 'groupvit-gcc-redcaps'", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the converted model and processor to the đŸ€— hub using the provided `model_name`.", - ) - args = parser.parse_args() - - convert_groupvit_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub) diff --git a/src/transformers/models/hiera/convert_hiera_to_hf.py b/src/transformers/models/hiera/convert_hiera_to_hf.py deleted file mode 100644 index eed27645b3..0000000000 --- a/src/transformers/models/hiera/convert_hiera_to_hf.py +++ /dev/null @@ -1,369 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hiera checkpoints from the original repository. - -URL: https://github.com/facebookresearch/hiera -""" - -import argparse -import json -import math -from typing import Dict, Tuple - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image -from torchvision import transforms - -from transformers import BitImageProcessor, HieraConfig, HieraForImageClassification, HieraForPreTraining, HieraModel -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config: HieraConfig, base_model: bool, mae_model: bool): - rename_keys = [] - # fmt: off - num_stages = len(config.depths) - # embedding dimensions for input and stages - dims = [config.embed_dim] + [int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(num_stages)] - - global_layer_idx = 0 - for stage_idx in range(num_stages): - dim_in = dims[stage_idx] - dim_out = dims[stage_idx + 1] - for layer_idx in range(config.depths[stage_idx]): - rename_keys.append((f"blocks.{global_layer_idx}.norm1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.norm1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_before.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.qkv.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.qkv.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.attn.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.attn.proj.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.norm2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.norm2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.layernorm_after.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc1.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.mlp.fc2.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.mlp.fc2.bias")) - - # projection layer only for the first layer of each stage boundary (except the first stage) - if dim_out != dim_in and layer_idx == 0: - rename_keys.append((f"blocks.{global_layer_idx}.proj.weight", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.weight")) - rename_keys.append((f"blocks.{global_layer_idx}.proj.bias", f"hiera.encoder.stages.{stage_idx}.layers.{layer_idx}.proj.bias")) - - global_layer_idx += 1 - - # projection layer + position embeddings - rename_keys.extend( - [ - ("patch_embed.proj.weight", "hiera.embeddings.patch_embeddings.projection.weight"), - ("patch_embed.proj.bias", "hiera.embeddings.patch_embeddings.projection.bias") - ] - ) - - rename_keys.append(("pos_embed", "hiera.embeddings.position_embeddings")) - - if base_model: - # layernorm + pooler - rename_keys.extend([("norm.weight", "pooler.layernorm.weight"), ("norm.bias", "pooler.layernorm.bias")]) - # if just the base model, we should remove "hiera" from all keys that start with "hiera" - rename_keys = [(pair[0], pair[1][6:]) if pair[1].startswith("hiera") else pair for pair in rename_keys] - elif mae_model: - rename_keys.extend( - [ - ("encoder_norm.weight", "encoder_norm.weight"), - ("encoder_norm.bias", "encoder_norm.bias"), - ("mask_token", "decoder.mask_token"), - ("decoder_pos_embed", "decoder.decoder_position_embeddings"), - ("decoder_norm.weight", "decoder.decoder_norm.weight"), - ("decoder_norm.bias", "decoder.decoder_norm.bias"), - ("decoder_pred.weight", "decoder.decoder_pred.weight"), - ("decoder_pred.bias", "decoder.decoder_pred.bias"), - ("decoder_embed.weight", "decoder.decoder_embeddings.weight"), - ("decoder_embed.bias", "decoder.decoder_embeddings.bias") - ] - ) - for i in range(config.decoder_depth): - rename_keys.extend( - [ - (f"decoder_blocks.{i}.norm1.weight", f"decoder.decoder_block.layers.{i}.layernorm_before.weight"), - (f"decoder_blocks.{i}.norm1.bias", f"decoder.decoder_block.layers.{i}.layernorm_before.bias"), - (f"decoder_blocks.{i}.attn.qkv.weight", f"decoder.decoder_block.layers.{i}.attn.qkv.weight"), - (f"decoder_blocks.{i}.attn.qkv.bias", f"decoder.decoder_block.layers.{i}.attn.qkv.bias"), - (f"decoder_blocks.{i}.attn.proj.weight", f"decoder.decoder_block.layers.{i}.attn.proj.weight"), - (f"decoder_blocks.{i}.attn.proj.bias", f"decoder.decoder_block.layers.{i}.attn.proj.bias"), - (f"decoder_blocks.{i}.norm2.weight", f"decoder.decoder_block.layers.{i}.layernorm_after.weight"), - (f"decoder_blocks.{i}.norm2.bias", f"decoder.decoder_block.layers.{i}.layernorm_after.bias"), - (f"decoder_blocks.{i}.mlp.fc1.weight", f"decoder.decoder_block.layers.{i}.mlp.fc1.weight"), - (f"decoder_blocks.{i}.mlp.fc1.bias", f"decoder.decoder_block.layers.{i}.mlp.fc1.bias"), - (f"decoder_blocks.{i}.mlp.fc2.weight", f"decoder.decoder_block.layers.{i}.mlp.fc2.weight"), - (f"decoder_blocks.{i}.mlp.fc2.bias", f"decoder.decoder_block.layers.{i}.mlp.fc2.bias"), - ] - ) - for i in range(config.num_query_pool): - rename_keys.extend( - [ - (f"multi_scale_fusion_heads.{i}.weight", f"multiscale_fusion.multi_scale_fusion_heads.{i}.weight"), - (f"multi_scale_fusion_heads.{i}.bias", f"multiscale_fusion.multi_scale_fusion_heads.{i}.bias") - ] - ) - else: - # layernorm + classification head - rename_keys.extend( - [ - ("norm.weight", "hiera.pooler.layernorm.weight"), - ("norm.bias", "hiera.pooler.layernorm.bias"), - ("head.projection.weight", "classifier.weight"), - ("head.projection.bias", "classifier.bias"), - ] - ) - # fmt: on - return rename_keys - - -def remove_classification_head_(state_dict): - ignore_keys = ["head.projection.weight", "head.projection.bias"] - for k in ignore_keys: - state_dict.pop(k, None) - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_labels_for_classifier(model_name: str) -> Tuple[Dict[int, str], Dict[str, int], int]: - repo_id = "huggingface/label-files" - - filename = "imagenet-1k-id2label.json" - - id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) - id2label = {int(k): v for k, v in id2label.items()} - label2id = {v: k for k, v in id2label.items()} - num_labels = len(id2label) - - return id2label, label2id, num_labels - - -def get_hiera_config(model_name: str, base_model: bool, mae_model: bool) -> HieraConfig: - if model_name == "hiera-tiny-224": - config = HieraConfig(depths=[1, 2, 7, 2]) - elif model_name == "hiera-small-224": - config = HieraConfig(depths=[1, 2, 11, 2]) - elif model_name == "hiera-base-224": - config = HieraConfig() - elif model_name == "hiera-base-plus-224": - config = HieraConfig(embed_dim=112, num_heads=[2, 4, 8, 16]) - elif model_name == "hiera-large-224": - config = HieraConfig(embed_dim=144, num_heads=[2, 4, 8, 16], depths=[2, 6, 36, 4]) - elif model_name == "hiera-huge-224": - config = HieraConfig(embed_dim=256, num_heads=[4, 8, 16, 32], depths=[2, 6, 36, 4]) - else: - raise ValueError(f"Unrecognized model name: {model_name}") - - if base_model: - pass - elif mae_model: - config.num_query_pool = 2 - config.decoder_hidden_size = 512 - config.decoder_depth = 8 - config.decoder_num_heads = 16 - # Table 3b from Hiera: A Hierarchical Vision Transformer without the Bells-and-Whistles - config.mask_ratio = 0.6 - else: - id2label, label2id, num_labels = get_labels_for_classifier(model_name) - config.id2label = id2label - config.label2id = label2id - config.num_labels = num_labels - - return config - - -@torch.no_grad() -def convert_hiera_checkpoint(args): - model_name = args.model_name - base_model = args.base_model - pytorch_dump_folder_path = args.pytorch_dump_folder_path - push_to_hub = args.push_to_hub - mae_model = args.mae_model - - config = get_hiera_config(model_name, base_model, mae_model) - - # Load original hiera model - original_model_name = model_name.replace("-", "_") - original_model_name = f"mae_{original_model_name}" if mae_model else original_model_name - - original_checkpoint_name = "mae_in1k_ft_in1k" if not (base_model or mae_model) else "mae_in1k" - - original_model = torch.hub.load( - "facebookresearch/hiera", - model=original_model_name, - pretrained=True, - checkpoint=original_checkpoint_name, - ) - - original_model.eval() - original_state_dict = original_model.state_dict() - # Don't need to remove head for MAE because original implementation doesn't have it on MAE - if base_model: - remove_classification_head_(original_state_dict) - - # # Rename keys - new_state_dict = original_state_dict.copy() - rename_keys = create_rename_keys(config, base_model, mae_model) - - for src, dest in rename_keys: - rename_key(new_state_dict, src, dest) - - # Load HF hiera model - if base_model: - model = HieraModel(config) - elif mae_model: - model = HieraForPreTraining(config) - else: - model = HieraForImageClassification(config) - - model.eval() - - missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) - print("Missing keys:", missing_keys) - print("Unexpected keys:", unexpected_keys) - - input_image = prepare_img() - - original_image_preprocessor = transforms.Compose( - [ - transforms.Resize(int((256 / 224) * 224), interpolation=transforms.functional.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ] - ) - - image_processor = BitImageProcessor( - image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD, size={"shortest_edge": 256} - ) - inputs = image_processor(images=input_image, return_tensors="pt") - - expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) - - input_image = prepare_img() - - inputs = image_processor(images=input_image, return_tensors="pt") - expected_pixel_values = original_image_preprocessor(input_image).unsqueeze(0) - assert torch.allclose(inputs.pixel_values, expected_pixel_values, atol=1e-4) - print("Pixel values look good!") - print(f"{inputs.pixel_values[0, :3, :3, :3]=}") - - # If is MAE we pass a noise to generate a random mask - mask_spatial_shape = [ - i // s // ms for i, s, ms in zip(config.image_size, config.patch_stride, config.masked_unit_size) - ] - num_windows = math.prod(mask_spatial_shape) - torch.manual_seed(2) - noise = torch.rand(1, num_windows) - outputs = model(**inputs) if not mae_model else model(noise=noise, **inputs) - # original implementation returns logits.softmax(dim=-1) - - if base_model: - expected_prob, expected_intermediates = original_model(expected_pixel_values, return_intermediates=True) - expected_last_hidden = expected_intermediates[-1] - batch_size, _, _, hidden_dim = expected_last_hidden.shape - expected_last_hidden = expected_last_hidden.reshape(batch_size, -1, hidden_dim) - assert torch.allclose(outputs.last_hidden_state, expected_last_hidden, atol=1e-3) - print("Base Model looks good as hidden states match original implementation!") - print(f"{outputs.last_hidden_state[0, :3, :3]=}") - elif mae_model: - # get mask from noise to be able to compare outputs - mask, _ = model.hiera.embeddings.patch_embeddings.random_masking(expected_pixel_values, noise) - expected_loss, _, _, _ = original_model(expected_pixel_values, mask=mask.bool()) - assert torch.allclose(outputs.loss, expected_loss, atol=1e-3) - print("MAE Model looks good as loss matches original implementation!") - else: - expected_prob = original_model(expected_pixel_values) - assert torch.allclose(outputs.logits.softmax(dim=-1), expected_prob, atol=1e-3) - print("Classifier looks good as probs match original implementation") - print(f"{outputs.logits[:, :5]=}") - - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor for {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - image_processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - hub_name = model_name - if base_model: - hub_name = model_name - elif mae_model: - hub_name = f"{model_name}-mae" - else: - hub_name = f"{model_name}-in1k" - repo_id = f"EduardoPacheco/{hub_name}" - print(f"Pushing model and processor for {model_name} to hub at {repo_id}") - model.push_to_hub(repo_id) - image_processor.push_to_hub(repo_id) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model-name", - default="hiera-tiny-224", - type=str, - choices=[ - "hiera-tiny-224", - "hiera-small-224", - "hiera-base-224", - "hiera-base-plus-224", - "hiera-large-224", - "hiera-huge-224", - ], - help="Name of the Hiera model you'd like to convert.", - ) - parser.add_argument( - "--pytorch-dump-folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--verify-logits", - action="store_true", - help="Whether or not to verify the logits against the original implementation.", - ) - parser.add_argument( - "--push-to-hub", action="store_true", help="Whether or not to push the converted model to the đŸ€— hub." - ) - parser.add_argument( - "--base-model", - action="store_true", - help="Whether to only convert the base model (no projection head weights).", - ) - parser.add_argument( - "--mae-model", action="store_true", help="Whether to convert to MAE checkpoint to HieraForPreTraining." - ) - - args = parser.parse_args() - convert_hiera_checkpoint(args) diff --git a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py deleted file mode 100644 index f5914f35c5..0000000000 --- a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py +++ /dev/null @@ -1,222 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hubert checkpoint.""" - -import argparse - -import torch -from s3prl.hub import distilhubert - -from transformers import HubertConfig, HubertModel, Wav2Vec2FeatureExtractor, logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -MAPPING = { - "post_extract_proj": "feature_projection.projection", - "encoder.pos_conv.0": "encoder.pos_conv_embed.conv", - "self_attn.k_proj": "encoder.layers.*.attention.k_proj", - "self_attn.v_proj": "encoder.layers.*.attention.v_proj", - "self_attn.q_proj": "encoder.layers.*.attention.q_proj", - "self_attn.out_proj": "encoder.layers.*.attention.out_proj", - "self_attn_layer_norm": "encoder.layers.*.layer_norm", - "fc1": "encoder.layers.*.feed_forward.intermediate_dense", - "fc2": "encoder.layers.*.feed_forward.output_dense", - "final_layer_norm": "encoder.layers.*.final_layer_norm", - "encoder.layer_norm": "encoder.layer_norm", - "mask_emb": "masked_spec_embed", -} - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - assert hf_shape == value.shape, ( - f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") - - -def recursively_load_weights(fairseq_model, hf_model): - unused_weights = [] - fairseq_dict = fairseq_model.state_dict() - - feature_extractor = hf_model.feature_extractor - - for name, value in fairseq_dict.items(): - is_used = False - if "conv_layers" in name: - load_conv_layer( - name, - value, - feature_extractor, - unused_weights, - hf_model.config.feat_extract_norm == "group", - ) - is_used = True - else: - for key, mapped_key in MAPPING.items(): - mapped_key = mapped_key - - if key in name: - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight" in name: - weight_type = "weight" - elif "bias" in name: - weight_type = "bias" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): - name = full_name.split("conv_layers.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - if type_id == 0: - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.bias.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.weight.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, ( - f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was" - " found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - else: - unused_weights.append(full_name) - - -def convert_config(model): - config = HubertConfig() - fs_config = model.config - - config.activation_dropout = fs_config.activation_dropout - config.apply_spec_augment = False - config.attention_dropout = fs_config.attention_dropout - config.conv_bias = False - conv_layers = eval(fs_config.extractor_conv_feature_layers) - config.conv_dim = [x[0] for x in conv_layers] - config.conv_kernel = [x[1] for x in conv_layers] - config.conv_stride = [x[2] for x in conv_layers] - config.feat_extract_activation = "gelu" - config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group" - config.feat_proj_layer_norm = False - config.feat_proj_dropout = 0.0 - config.final_dropout = 0.0 - config.hidden_act = fs_config.activation_fn - config.hidden_dropout = fs_config.dropout - config.hidden_size = fs_config.encoder_embed_dim - config.initializer_range = 0.02 - config.intermediate_size = fs_config.encoder_ffn_embed_dim - config.layer_norm_eps = 1e-5 - config.layerdrop = 0.0 - config.num_attention_heads = fs_config.encoder_attention_heads - config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups - config.num_conv_pos_embeddings = fs_config.conv_pos - config.num_feat_extract_layers = len(conv_layers) - config.num_hidden_layers = fs_config.encoder_layers - - return config - - -@torch.no_grad() -def convert_hubert_checkpoint(pytorch_dump_folder_path, config_path=None): - """ - Copy/paste/tweak model's weights to transformers design. - """ - model = distilhubert().model.model - - if config_path is not None: - config = HubertConfig.from_pretrained(config_path) - else: - config = convert_config(model) - model = model.eval() - - feature_extractor = Wav2Vec2FeatureExtractor( - feature_size=1, - sampling_rate=16000, - padding_value=0, - do_normalize=False, - return_attention_mask=False, - ) - hf_model = HubertModel(config) - - recursively_load_weights(model, hf_model) - - feature_extractor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - args = parser.parse_args() - convert_hubert_checkpoint(args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 4966340493..0000000000 --- a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,261 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hubert checkpoint.""" - -import argparse -import json -import os - -import fairseq -import torch -from fairseq.data import Dictionary - -from transformers import ( - HubertConfig, - HubertForCTC, - HubertModel, - Wav2Vec2CTCTokenizer, - Wav2Vec2FeatureExtractor, - Wav2Vec2Processor, - logging, -) - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -MAPPING = { - "post_extract_proj": "feature_projection.projection", - "encoder.pos_conv.0": "encoder.pos_conv_embed.batch_norm", - "encoder.pos_conv.1": "encoder.pos_conv_embed.conv", - "self_attn.k_proj": "encoder.layers.*.attention.k_proj", - "self_attn.v_proj": "encoder.layers.*.attention.v_proj", - "self_attn.q_proj": "encoder.layers.*.attention.q_proj", - "self_attn.out_proj": "encoder.layers.*.attention.out_proj", - "self_attn_layer_norm": "encoder.layers.*.layer_norm", - "fc1": "encoder.layers.*.feed_forward.intermediate_dense", - "fc2": "encoder.layers.*.feed_forward.output_dense", - "final_layer_norm": "encoder.layers.*.final_layer_norm", - "encoder.layer_norm": "encoder.layer_norm", - "w2v_model.layer_norm": "feature_projection.layer_norm", - "w2v_encoder.proj": "lm_head", - "mask_emb": "masked_spec_embed", -} - - -def set_recursively(hf_pointer, key, value, full_name, weight_type): - for attribute in key.split("."): - hf_pointer = getattr(hf_pointer, attribute) - - if weight_type is not None: - hf_shape = getattr(hf_pointer, weight_type).shape - else: - hf_shape = hf_pointer.shape - - assert hf_shape == value.shape, ( - f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" - f" {value.shape} for {full_name}" - ) - - if weight_type == "weight": - hf_pointer.weight.data = value - elif weight_type == "weight_g": - hf_pointer.weight_g.data = value - elif weight_type == "weight_v": - hf_pointer.weight_v.data = value - elif weight_type == "bias": - hf_pointer.bias.data = value - elif weight_type == "running_mean": - hf_pointer.running_mean.data = value - elif weight_type == "running_var": - hf_pointer.running_var.data = value - elif weight_type == "num_batches_tracked": - hf_pointer.num_batches_tracked.data = value - else: - hf_pointer.data = value - - logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") - - -def recursively_load_weights(fairseq_model, hf_model, is_finetuned): - unused_weights = [] - fairseq_dict = fairseq_model.state_dict() - - feature_extractor = hf_model.hubert.feature_extractor if is_finetuned else hf_model.feature_extractor - - for name, value in fairseq_dict.items(): - is_used = False - if "conv_layers" in name: - load_conv_layer( - name, - value, - feature_extractor, - unused_weights, - hf_model.config.feat_extract_norm == "group", - ) - is_used = True - else: - for key, mapped_key in MAPPING.items(): - mapped_key = "hubert." + mapped_key if (is_finetuned and mapped_key != "lm_head") else mapped_key - - if key in name or (key.split("w2v_model.")[-1] == name.split(".")[0] and not is_finetuned): - is_used = True - if "*" in mapped_key: - layer_index = name.split(key)[0].split(".")[-2] - mapped_key = mapped_key.replace("*", layer_index) - if "weight_g" in name: - weight_type = "weight_g" - elif "weight_v" in name: - weight_type = "weight_v" - elif "weight" in name: - weight_type = "weight" - elif "bias" in name: - weight_type = "bias" - elif "running_mean" in name: - weight_type = "running_mean" - elif "running_var" in name: - weight_type = "running_var" - elif "num_batches_tracked" in name: - weight_type = "num_batches_tracked" - else: - weight_type = None - set_recursively(hf_model, mapped_key, value, name, weight_type) - continue - if not is_used: - unused_weights.append(name) - - logger.warning(f"Unused weights: {unused_weights}") - - -def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): - name = full_name.split("conv_layers.")[-1] - items = name.split(".") - layer_id = int(items[0]) - type_id = int(items[1]) - - if type_id == 0: - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.bias.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].conv.weight.data = value - logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") - elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): - if "bias" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, ( - f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was" - " found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - elif "weight" in name: - assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, ( - f"{full_name} has size {value.shape}, but" - f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found." - ) - feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value - logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") - else: - unused_weights.append(full_name) - - -@torch.no_grad() -def convert_hubert_checkpoint( - checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True -): - """ - Copy/paste/tweak model's weights to transformers design. - """ - if config_path is not None: - config = HubertConfig.from_pretrained(config_path) - else: - config = HubertConfig() - - if is_finetuned: - if dict_path: - target_dict = Dictionary.load(dict_path) - - # important change bos & pad token id since CTC symbol is and - # not as in fairseq - config.bos_token_id = target_dict.pad_index - config.pad_token_id = target_dict.bos_index - config.eos_token_id = target_dict.eos_index - config.vocab_size = len(target_dict.symbols) - vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json") - if not os.path.isdir(pytorch_dump_folder_path): - logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path)) - return - os.makedirs(pytorch_dump_folder_path, exist_ok=True) - with open(vocab_path, "w", encoding="utf-8") as vocab_handle: - json.dump(target_dict.indices, vocab_handle) - tokenizer = Wav2Vec2CTCTokenizer( - vocab_path, - unk_token=target_dict.unk_word, - pad_token=target_dict.pad_word, - bos_token=target_dict.bos_word, - eos_token=target_dict.eos_word, - word_delimiter_token="|", - do_lower_case=False, - ) - return_attention_mask = True if config.feat_extract_norm == "layer" else False - feature_extractor = Wav2Vec2FeatureExtractor( - feature_size=1, - sampling_rate=16000, - padding_value=0, - do_normalize=True, - return_attention_mask=return_attention_mask, - ) - processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) - processor.save_pretrained(pytorch_dump_folder_path) - - hf_wav2vec = HubertForCTC(config) - else: - hf_wav2vec = HubertModel(config) - - if is_finetuned: - model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( - [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])} - ) - else: - model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path]) - - model = model[0].eval() - - recursively_load_weights(model, hf_wav2vec, is_finetuned) - - hf_wav2vec.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint") - parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model") - parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") - parser.add_argument( - "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not" - ) - args = parser.parse_args() - convert_hubert_checkpoint( - args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned - ) diff --git a/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py deleted file mode 100644 index c66c41ce36..0000000000 --- a/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert Hubert checkpoint.""" - -import argparse - -import torch - -from transformers import HubertConfig, HubertForSequenceClassification, Wav2Vec2FeatureExtractor, logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -SUPPORTED_MODELS = ["UtteranceLevel"] - - -@torch.no_grad() -def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path): - """ - Copy/paste/tweak model's weights to transformers design. - """ - checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True) - if checkpoint["Config"]["downstream_expert"]["modelrc"]["select"] not in SUPPORTED_MODELS: - raise NotImplementedError(f"The supported s3prl models are {SUPPORTED_MODELS}") - - downstream_dict = checkpoint["Downstream"] - - hf_congfig = HubertConfig.from_pretrained(config_path) - hf_model = HubertForSequenceClassification.from_pretrained(base_model_name, config=hf_congfig) - hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( - base_model_name, return_attention_mask=True, do_normalize=False - ) - - if hf_congfig.use_weighted_layer_sum: - hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"] - - hf_model.projector.weight.data = downstream_dict["projector.weight"] - hf_model.projector.bias.data = downstream_dict["projector.bias"] - hf_model.classifier.weight.data = downstream_dict["model.post_net.linear.weight"] - hf_model.classifier.bias.data = downstream_dict["model.post_net.linear.bias"] - - hf_feature_extractor.save_pretrained(model_dump_path) - hf_model.save_pretrained(model_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--base_model_name", default=None, type=str, help="Name of the huggingface pretrained base model." - ) - parser.add_argument("--config_path", default=None, type=str, help="Path to the huggingface classifier config.") - parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the s3prl checkpoint.") - parser.add_argument("--model_dump_path", default=None, type=str, help="Path to the final converted model.") - args = parser.parse_args() - convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path) diff --git a/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py b/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py deleted file mode 100644 index ea44ee11e5..0000000000 --- a/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import copy - -import torch -from accelerate import init_empty_weights - -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - Idefics2Config, - Idefics2ForConditionalGeneration, - Idefics2ImageProcessor, - Idefics2Processor, - MistralConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py --original_model_id HuggingFaceM4/idefics2-8b --output_hub_path org/idefics2 -""" - - -KEYS_TO_MODIFY_MAPPING = { - "lm_head.weight": "lm_head.linear.weight", - "model.layers": "model.text_model.layers", - "model.norm": "model.text_model.norm", - "model.perceiver_resampler": "model.connector.perceiver_resampler", - "model.modality_projection": "model.connector.modality_projection", -} - - -WEIGHTS_TO_MERGE_MAPPING = ( - # (weights to merge in merging order), (new weight name) - ( - ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"), - "model.text_model.embed_tokens.weight", - ), - (("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"), -) - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - for key, value in state_dict.items(): - if key.endswith(".inv_freq"): - continue - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - new_state_dict[key] = value - return new_state_dict - - -def merge_weights(state_dict): - new_state_dict = copy.deepcopy(state_dict) - - # Merge the weights - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - assert weight in state_dict, f"Weight {weight} is missing in the state dict" - if new_weight_name not in new_state_dict: - new_state_dict[new_weight_name] = [state_dict[weight]] - else: - new_state_dict[new_weight_name].append(state_dict[weight]) - new_state_dict[new_weight_name] = torch.cat(new_state_dict[new_weight_name], dim=0) - - # Remove the weights that were merged - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - if weight in new_state_dict and weight != new_weight_name: - new_state_dict.pop(weight) - - return new_state_dict - - -def get_config(checkpoint): - if checkpoint == "HuggingFaceM4/idefics2": - # We load the config then recreate to use the text_config - config = AutoConfig.from_pretrained(checkpoint) - text_config = MistralConfig( - vocab_size=config.vocab_size + config.additional_vocab_size, - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - num_key_value_heads=config.num_key_value_heads, - hidden_act=config.hidden_act, - max_position_embeddings=config.max_position_embeddings, - initializer_range=config.initializer_range, - rms_norm_eps=config.rms_norm_eps, - tie_word_embeddings=config.tie_word_embeddings, - rope_theta=config.rope_theta, - sliding_window=config.sliding_window, - attention_dropout=config.attention_dropout, - pad_token_id=config.pad_token_id, - bos_token_id=config.bos_token_id, - eos_token_id=config.eos_token_id, - ) - perceiver_config = config.perceiver_config.to_dict() - config = Idefics2Config( - text_config=text_config.to_dict(), - vision_config=config.vision_config, - perceiver_config=perceiver_config, - use_cache=config.use_cache, - image_token_id=config.image_token_id, - tie_word_embeddings=config.tie_word_embeddings, - ) - return config - - return AutoConfig.from_pretrained(checkpoint) - - -def convert_idefics2_hub_to_hf(original_model_id, output_hub_path, push_to_hub): - # The original model maps to AutoModelForCausalLM, converted we map to Idefics2ForConditionalGeneration - original_model = AutoModelForCausalLM.from_pretrained(original_model_id, trust_remote_code=True) - # The original model doesn't use the idefics2 processing objects - image_seq_len = original_model.config.perceiver_config.resampler_n_latents - image_processor = Idefics2ImageProcessor() - tokenizer = AutoTokenizer.from_pretrained(original_model_id) - processor = Idefics2Processor( - image_processor=image_processor, - tokenizer=tokenizer, - image_seq_len=image_seq_len, - ) - state_dict = original_model.state_dict() - state_dict = convert_state_dict_to_hf(state_dict) - - # Merge weights - state_dict = merge_weights(state_dict) - - config = get_config(original_model_id) - - with init_empty_weights(): - model = Idefics2ForConditionalGeneration(config) - - model.load_state_dict(state_dict, strict=True, assign=True) - - model.save_pretrained(output_hub_path) - processor.save_pretrained(output_hub_path) - - if push_to_hub: - model.push_to_hub(output_hub_path, private=True) - processor.push_to_hub(output_hub_path, private=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--original_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="If set, the model will be pushed to the hub after conversion.", - ) - args = parser.parse_args() - convert_idefics2_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py b/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py deleted file mode 100644 index 204104a58b..0000000000 --- a/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import json - -import torch -from accelerate import init_empty_weights -from huggingface_hub import hf_hub_download - -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - Idefics3Config, - Idefics3ForConditionalGeneration, - Idefics3ImageProcessor, - Idefics3Processor, - LlamaConfig, -) - - -EPILOG_TXT = """Example: - python transformers/src/transformers/models/idefics3/convert_idefics3_weights_to_hf.py --original_model_id HuggingFaceM4/Idefics3-8B-Llama3 --output_hub_path org/idefics3 -""" - - -KEYS_TO_MODIFY_MAPPING = { - "lm_head.weight": "lm_head.linear.weight", - "model.layers": "model.text_model.layers", - "model.norm": "model.text_model.norm", - "model.modality_projection": "model.connector.modality_projection", -} - - -WEIGHTS_TO_MERGE_MAPPING = ( - # (weights to merge in merging order), (new weight name) - ( - ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"), - "model.text_model.embed_tokens.weight", - ), - (("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"), -) - -WEIGHTS_TO_DROP = ( - # The original model had a vision head, but this is never used - "model.vision_model.head", -) - - -def convert_state_dict_to_hf(state_dict): - new_state_dict = {} - old_state_dict_keys = set(state_dict.keys()) - - # Flattened list of weights to merge. We keep these in the original state dict to merge them later - original_weights_to_merge = [w for weights in WEIGHTS_TO_MERGE_MAPPING for w in weights[0]] - - # for key, value in state_dict.items(): - for old_key in old_state_dict_keys: - if old_key.endswith(".inv_freq") or any(w in old_key for w in WEIGHTS_TO_DROP): - state_dict.pop(old_key) - continue - - key = old_key - for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in key: - key = key.replace(key_to_modify, new_key) - - weight = state_dict.pop(old_key) - if key in original_weights_to_merge: - new_state_dict[key] = weight - # Bit of a hack - we need to keep the original weights to merge them later - state_dict[key] = weight - else: - new_state_dict[key] = weight - - return new_state_dict - - -def merge_weights(state_dict, new_state_dict): - old_weight_names = set(state_dict.keys()) - - # Merge the weights - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight_to_merge in weights_to_merge: - print(weight_to_merge) - assert weight_to_merge in state_dict, f"Weight {weight_to_merge} is missing in the state dict" - - weight = state_dict.pop(weight_to_merge) - if new_weight_name not in new_state_dict: - new_state_dict[new_weight_name] = [weight] - else: - new_state_dict[new_weight_name].append(weight) - - old_weight_names.remove(weight_to_merge) - - new_state_dict[new_weight_name] = torch.cat(new_state_dict[new_weight_name], dim=0) - - # Remove the weights that were merged - for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: - for weight in weights_to_merge: - if weight in new_state_dict and weight != new_weight_name: - new_state_dict.pop(weight) - - return new_state_dict - - -def get_config(checkpoint): - # We load the config then recreate to use the text_config - - # download the config file - filepath = hf_hub_download(repo_id=checkpoint, filename="config.json") - with open(filepath, "r") as f: - config_json = json.load(f) - - # Setup the vision config - vision_config = config_json.pop("vision_config") - vision_config.pop("vision_model_name", None) - if "embed_dim" in vision_config: - vision_config["hidden_size"] = vision_config.pop("embed_dim") - - config_json["vocab_size"] = config_json.pop("vocab_size") + config_json.pop("additional_vocab_size") - - image_token_id = config_json.pop("image_token_id", config_json["vocab_size"] - 2) - use_cache = config_json.pop("use_cache", True) - tie_word_embeddings = config_json.pop("tie_word_embeddings", True) - scale_factor = config_json.pop("scale_factor", 2) - vocab_size = config_json.pop("vocab_size", 100000) - - # Remove "freeze" params from the config - config_json = {k: v for k, v in config_json.items() if not k.startswith("freeze_")} - text_config = LlamaConfig(**config_json) - - config = Idefics3Config( - text_config=text_config, - vision_config=vision_config, - use_cache=use_cache, - image_token_id=image_token_id, - tie_word_embeddings=tie_word_embeddings, - scale_factor=scale_factor, - vocab_size=vocab_size, - ) - return config - - -def convert_idefics3_hub_to_hf(original_model_id, output_hub_path, push_to_hub): - # The original model maps to AutoModelForCausalLM, converted we map to Idefics3ForConditionalGeneration - original_model = AutoModelForCausalLM.from_pretrained( - original_model_id, trust_remote_code=True, torch_dtype=torch.bfloat16 - ) - # The original model doesn't use the Idefics3 processing objects - image_processor = Idefics3ImageProcessor() - tokenizer = AutoTokenizer.from_pretrained(original_model_id) - processor = Idefics3Processor( - image_processor=image_processor, - tokenizer=tokenizer, - ) - state_dict = original_model.state_dict() - new_state_dict = convert_state_dict_to_hf(state_dict) - - # Merge weights - new_state_dict = merge_weights(state_dict, new_state_dict) - del state_dict - - config = get_config(original_model_id) - print(config) - - with init_empty_weights(): - model = Idefics3ForConditionalGeneration(config) - - model.load_state_dict(new_state_dict, strict=True, assign=True) - - model.save_pretrained(output_hub_path) - processor.save_pretrained(output_hub_path) - - if push_to_hub: - model.push_to_hub(output_hub_path, private=True) - processor.push_to_hub(output_hub_path, private=True) - - -def main(): - parser = argparse.ArgumentParser( - epilog=EPILOG_TXT, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - parser.add_argument( - "--original_model_id", - help="Hub location of the text model", - ) - parser.add_argument( - "--output_hub_path", - help="Location on the hub of the converted model", - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="If set, the model will be pushed to the hub after conversion.", - ) - args = parser.parse_args() - convert_idefics3_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/ijepa/convert_ijepa_to_hf.py b/src/transformers/models/ijepa/convert_ijepa_to_hf.py deleted file mode 100644 index 25d97df6ce..0000000000 --- a/src/transformers/models/ijepa/convert_ijepa_to_hf.py +++ /dev/null @@ -1,268 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert IJEPA checkpoints from the original repository. - -URL: https://github.com/facebookresearch/ijepa -""" - -import argparse -import gc -import re -from pathlib import Path -from typing import Optional - -import requests -import torch -from PIL import Image - -from transformers import ( - IJepaConfig, - IJepaModel, - ViTImageProcessor, -) -from transformers.utils import logging - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # Projection layer + position embeddings - r"pos_embed": r"embeddings.position_embeddings", - r"patch_embed.proj.weight": r"embeddings.patch_embeddings.projection.weight", - r"patch_embed.proj.bias": r"embeddings.patch_embeddings.projection.bias", - - # Encoder layers: Layernorms, Attention, Feedforward layers - r"blocks.(\d+).norm1.weight": r"encoder.layer.\1.layernorm_before.weight", - r"blocks.(\d+).norm1.bias": r"encoder.layer.\1.layernorm_before.bias", - r"blocks.(\d+).attn.proj.weight": r"encoder.layer.\1.attention.output.dense.weight", - r"blocks.(\d+).attn.proj.bias": r"encoder.layer.\1.attention.output.dense.bias", - r"blocks.(\d+).norm2.weight": r"encoder.layer.\1.layernorm_after.weight", - r"blocks.(\d+).norm2.bias": r"encoder.layer.\1.layernorm_after.bias", - r"blocks.(\d+).mlp.fc1.weight": r"encoder.layer.\1.intermediate.dense.weight", - r"blocks.(\d+).mlp.fc1.bias": r"encoder.layer.\1.intermediate.dense.bias", - r"blocks.(\d+).mlp.fc2.weight": r"encoder.layer.\1.output.dense.weight", - r"blocks.(\d+).mlp.fc2.bias": r"encoder.layer.\1.output.dense.bias", - - # Layernorm + pooler - r"norm.weight": r"layernorm.weight", - r"norm.bias": r"layernorm.bias", -} -# fmt: on - - -def convert_old_keys_to_new_keys(state_dict_keys: Optional[dict] = None): - """ - Converts old keys to new keys using the mapping and dynamically removes the 'ijepa.' prefix if necessary. - - Args: - state_dict_keys (dict): The keys from the state_dict to convert. - - Returns: - dict: A mapping from old keys to new keys. - """ - output_dict = {} - if state_dict_keys is not None: - old_text = "\n".join(state_dict_keys) - new_text = old_text - - # Apply regex-based mapping - for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): - if replacement is None: - new_text = re.sub(pattern, "", new_text) # Skip the key - continue - new_text = re.sub(pattern, replacement, new_text) - - output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) - - return output_dict - - -# we split up the matrix of each encoder layer into queries, keys and values -def read_in_q_k_v(state_dict, config): - for i in range(config.num_hidden_layers): - # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) - in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") - in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") - # next, add query, keys and values (in that order) to the state dict - state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] - state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] - state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ - config.hidden_size : config.hidden_size * 2, : - ] - state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ - config.hidden_size : config.hidden_size * 2 - ] - state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] - state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -# We will verify our results on an image of cute cats -def prepare_img(): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - im = Image.open(requests.get(url, stream=True).raw) - return im - - -def get_ijepa_config(model_name): - patch_size = int(model_name.split("_")[1][4:]) - config = IJepaConfig(patch_size=patch_size) - if "vith" in model_name: - config.hidden_size = 1280 - config.num_hidden_layers = 32 - config.num_attention_heads = 16 - config.layer_norm_eps = 1e-6 - config.mlp_ratio = 4 - config.intermediate_size = 5120 - if model_name == "ijepa_vith16_1k": - config.image_size = 448 - elif "vitg" in model_name: - config.hidden_size = 1408 - config.num_hidden_layers = 40 - config.num_attention_heads = 16 - config.layer_norm_eps = 1e-6 - config.mlp_ratio = 48 / 11 - config.intermediate_size = 6144 - else: - raise ValueError("Model not supported, only supports huge and giant models.") - return config - - -@torch.no_grad() -def write_model(model_name, output_dir, safe_serialization, push_to_hub, verify_logits): - """ - Copy/paste/tweak model's weights to our IJEPA structure. - """ - - # define default IJEPA configuration - config = get_ijepa_config(model_name) - - checkpoint_mapping = { - "ijepa_vith14_1k": "https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.14-300e.pth.tar", - "ijepa_vith14_22k": "https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.h.14-900e.pth.tar", - "ijepa_vith16_1k": "https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.16-448px-300e.pth.tar", - "ijepa_vitg16_22k": "https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.g.16-600e.pth.tar", - } - - # Load original checkpoint - checkpoint_url = checkpoint_mapping[model_name] - original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["encoder"] - original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} - - # Rename keys - state_dict = original_state_dict.copy() - new_keys = convert_old_keys_to_new_keys(state_dict.keys()) - for old_key, new_key in new_keys.items(): - rename_key(state_dict, old_key, new_key) - read_in_q_k_v(state_dict, config) - - # load HuggingFace model - model = IJepaModel(config, add_pooling_layer=False).eval() - model.load_state_dict(state_dict) - size = {"height": config.image_size, "width": config.image_size} - image_processor = ViTImageProcessor(size=size) - - if verify_logits: - # Check outputs on an image, prepared by ViTImageProcessor - encoding = image_processor(images=prepare_img(), return_tensors="pt") - pixel_values = encoding["pixel_values"] - with torch.no_grad(): - outputs = model(pixel_values) - - expected_slices = { - "ijepa_vith14_1k": torch.Tensor( - [[-0.0621, -0.0054, -2.7513], [-0.1952, 0.0909, -3.9536], [0.0942, -0.0331, -1.2833]] - ), - "ijepa_vith14_22k": torch.Tensor( - [[0.0358, -0.0045, -0.2154], [0.0418, -0.0246, 0.0108], [0.2529, -0.0345, -0.0246]] - ), - "ijepa_vith16_1k": torch.Tensor( - [[0.5145, -0.1259, 0.0615], [0.1132, 0.0028, -0.0496], [1.1586, -0.0056, -0.0387]] - ), - "ijepa_vitg16_22k": torch.Tensor( - [[0.0512, -0.0510, -0.0649], [0.1972, 0.0380, -0.0790], [0.1667, -0.0834, -0.1240]] - ), - } - - assert torch.allclose( - expected_slices[model_name], - outputs.last_hidden_state[0, :3, :3], - atol=1e-4, - ) - - if output_dir: - Path(output_dir).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {output_dir}") - image_processor.save_pretrained(output_dir, safe_serialization=safe_serialization) - model.save_pretrained(output_dir, safe_serialization=safe_serialization) - - if push_to_hub: - image_processor.push_to_hub(repo_id=f"jmtzt/{model_name}", safe_serialization=safe_serialization) - model.push_to_hub(repo_id=f"jmtzt/{model_name}", safe_serialization=safe_serialization) - - if output_dir: - del model, state_dict - gc.collect() - print("Reloading the model to check if it's saved correctly.") - IJepaModel.from_pretrained(output_dir, device_map="auto") - print("Model reloaded successfully.") - - -def main(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--model_name", - default="ijepa_vith14_1k", - type=str, - choices=[ - "ijepa_vith14_1k", - "ijepa_vith14_22k", - "ijepa_vith16_1k", - "ijepa_vitg16_22k", - ], - help="Name of the model you'd like to convert.", - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - help="Path to the output PyTorch model directory.", - ) - parser.add_argument( - "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." - ) - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether or not to push the model to the đŸ€— Hub.", - ) - parser.add_argument( - "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." - ) - - parser.set_defaults() - args = parser.parse_args() - write_model(args.model_name, args.output_dir, args.safe_serialization, args.push_to_hub, args.verify_logits) - - -if __name__ == "__main__": - main() diff --git a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py b/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py deleted file mode 100644 index 182d66b9af..0000000000 --- a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert OpenAI Image GPT checkpoints.""" - -import argparse - -import torch - -from transformers import ImageGPTConfig, ImageGPTForCausalLM, load_tf_weights_in_imagegpt -from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging - - -logging.set_verbosity_info() - - -def convert_imagegpt_checkpoint_to_pytorch(imagegpt_checkpoint_path, model_size, pytorch_dump_folder_path): - # Construct configuration depending on size - MODELS = {"small": (512, 8, 24), "medium": (1024, 8, 36), "large": (1536, 16, 48)} - n_embd, n_head, n_layer = MODELS[model_size] # set model hyperparameters - config = ImageGPTConfig(n_embd=n_embd, n_layer=n_layer, n_head=n_head) - model = ImageGPTForCausalLM(config) - - # Load weights from numpy - load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path) - - # Save pytorch-model - pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME - pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME - print(f"Save PyTorch model to {pytorch_weights_dump_path}") - torch.save(model.state_dict(), pytorch_weights_dump_path) - print(f"Save configuration file to {pytorch_config_dump_path}") - with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: - f.write(config.to_json_string()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--imagegpt_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the TensorFlow checkpoint path.", - ) - parser.add_argument( - "--model_size", - default=None, - type=str, - required=True, - help="Size of the model (can be either 'small', 'medium' or 'large').", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." - ) - args = parser.parse_args() - convert_imagegpt_checkpoint_to_pytorch( - args.imagegpt_checkpoint_path, args.model_size, args.pytorch_dump_folder_path - ) diff --git a/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py b/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py deleted file mode 100644 index f8b9c86cfd..0000000000 --- a/src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py +++ /dev/null @@ -1,303 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert InstructBLIP checkpoints from the original repository. - -URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblip -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) -# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml -# same for Vicuna-13b -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BlipImageProcessor, - InstructBlipConfig, - InstructBlipForConditionalGeneration, - InstructBlipProcessor, - InstructBlipQFormerConfig, - InstructBlipVisionConfig, - LlamaConfig, - LlamaTokenizerFast, - T5Config, - T5TokenizerFast, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.embeddings.layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name): - image_size = 364 if "coco" in model_name else 224 - vision_config = InstructBlipVisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? - if "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "vicuna-7b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() - elif "vicuna-13b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() - else: - raise ValueError("Model name not supported") - - # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 - qformer_config = InstructBlipQFormerConfig(vocab_size=30523).to_dict() - config = InstructBlipConfig(vision_config=vision_config, text_config=text_config, qformer_config=qformer_config) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to Transformers design. - """ - qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left") - qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - - if "t5" in model_name: - tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left") - elif "vicuna" in model_name: - # the following was used in the original implementation: - # tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left") - # tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - # tokenizer.add_special_tokens({"bos_token": ""}) - # tokenizer.add_special_tokens({"eos_token": ""}) - # tokenizer.add_special_tokens({"unk_token": ""}) - tokenizer = LlamaTokenizerFast.from_pretrained( - "huggyllama/llama-7b", truncation_side="left", bos_token="", unk_token="" - ) - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - config, image_size = get_blip2_config(model_name) - hf_model = InstructBlipForConditionalGeneration(config).eval() - - model_name_to_original = { - "instructblip-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"), - "instructblip-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"), - "instructblip-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"), - "instructblip-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu" - lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu" - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "llm_proj" in key: - key = key.replace("llm_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("llm_model"): - key = key.replace("llm_model", "language_model") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - read_in_q_v_bias(state_dict, config) - - # note: weights get loaded in torch.float32 by default - hf_model.load_state_dict(state_dict, strict=True) - - image = load_demo_image() - prompt = "What is unusual about this image?" - - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = InstructBlipProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - qformer_tokenizer=qformer_tokenizer, - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) - - # make sure processor creates exact same pixel values - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - pixel_values = inputs.pixel_values - assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - with torch.no_grad(): - if "vicuna" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits - logits = hf_model(**inputs).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} - ).logits - label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) - labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(**inputs, labels=labels).logits - - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert original_logits.shape == logits.shape - atol = 1e-4 if "vicuna" in model_name else 1e-5 - assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) - print("Looks ok!") - - print("Generating with original model...") - original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) - - # important: we need to cast the weights of the HF model to the appropriate type - print("Generating with HF model...") - outputs = hf_model.generate( - **inputs, - do_sample=False, - num_beams=5, - max_length=256, - min_length=1, - top_p=0.9, - repetition_penalty=1.5, - length_penalty=1.0, - temperature=1, - ) - if "vicuna" in model_name: - # convert output id 0 to 2 (eos_token_id) - # TODO add this in the generate method? - outputs[outputs == 0] = 2 - print("Original generation:", original_outputs) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"Salesforce/{model_name}") - hf_model.push_to_hub(f"Salesforce/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "instructblip-vicuna-7b", - "instructblip-vicuna-13b", - "instructblip-flan-t5-xl", - "instructblip-flan-t5-xxl", - ] - parser.add_argument( - "--model_name", - default="instructblip-flan-t5-xl", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - - args = parser.parse_args() - - convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py b/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py deleted file mode 100644 index 9b3d508db6..0000000000 --- a/src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py +++ /dev/null @@ -1,305 +0,0 @@ -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Convert InstructBlipVideo checkpoints from the original repository. - -URL: https://github.com/salesforce/LAVIS/tree/main/projects/instructblipvideo -""" - -import argparse - -import requests -import torch - -# pip3 install salesforce-lavis -# I'm actually installing a slightly modified version: pip3 install git+https://github.com/nielsrogge/LAVIS.git@fix_lavis_float32 (there's also the fix_lavis branch) -# also note: to convert Vicuna checkpoints, we had to include /home/niels/python_projects/checkpoints/FastChat/vicuna-7b in lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml -# same for Vicuna-13b -from lavis.models import load_model_and_preprocess -from PIL import Image - -from transformers import ( - AutoTokenizer, - BlipImageProcessor, - InstructBlipProcessor, - InstructBlipVideoConfig, - InstructBlipVideoForConditionalGeneration, - InstructBlipVideoQFormerConfig, - InstructBlipVideoVisionConfig, - LlamaConfig, - LlamaTokenizerFast, - T5Config, - T5TokenizerFast, -) -from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD - - -def load_demo_image(): - url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - return image - - -# here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): - rename_keys = [] - # fmt: off - - # vision encoder - rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) - rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) - rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) - rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) - rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) - rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) - - for i in range(config.vision_config.num_hidden_layers): - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) - rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) - rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) - - # QFormer - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.embeddings.layernorm.weight")) - rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.embeddings.layernorm.bias")) - - # fmt: on - return rename_keys - - -def rename_key(dct, old, new): - val = dct.pop(old) - dct[new] = val - - -def read_in_q_v_bias(state_dict, config): - for i in range(config.vision_config.num_hidden_layers): - # read in original q and v biases - q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") - v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") - - # next, set bias in the state dict - qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) - state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias - - -def get_blip2_config(model_name): - image_size = 364 if "coco" in model_name else 224 - vision_config = InstructBlipVideoVisionConfig(image_size=image_size).to_dict() - - # make sure the models have proper bos_token_id and eos_token_id set (important for generation) - # seems like flan-T5 models don't have bos_token_id properly set? - if "t5-xl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "t5-xxl" in model_name: - text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - elif "vicuna-7b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-7b-hf", vocab_size=32001).to_dict() - elif "vicuna-13b" in model_name: - text_config = LlamaConfig.from_pretrained("decapoda-research/llama-13b-hf", vocab_size=32001).to_dict() - else: - raise ValueError("Model name not supported") - - # the authors add one special "[DEC]" token to the vocab of Q-Former, hence vocab size = 30522 + 1 - qformer_config = InstructBlipVideoQFormerConfig(vocab_size=30523).to_dict() - config = InstructBlipVideoConfig( - vision_config=vision_config, text_config=text_config, qformer_config=qformer_config - ) - - return config, image_size - - -@torch.no_grad() -def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - """ - Copy/paste/tweak model's weights to Transformers design. - """ - qformer_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased", truncation_side="left") - qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"}) - - if "t5" in model_name: - tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-xl", truncation_side="left") - elif "vicuna" in model_name: - # the following was used in the original implementation: - # tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", use_fast=False, truncation_side="left") - # tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - # tokenizer.add_special_tokens({"bos_token": ""}) - # tokenizer.add_special_tokens({"eos_token": ""}) - # tokenizer.add_special_tokens({"unk_token": ""}) - tokenizer = LlamaTokenizerFast.from_pretrained( - "huggyllama/llama-7b", truncation_side="left", bos_token="", unk_token="" - ) - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - config, image_size = get_blip2_config(model_name) - hf_model = InstructBlipVideoForConditionalGeneration(config).eval() - - model_name_to_original = { - "instructblipvideo-vicuna-7b": ("blip2_vicuna_instruct", "vicuna7b"), - "instructblipvideo-vicuna-13b": ("blip2_vicuna_instruct", "vicuna13b"), - "instructblipvideo-flan-t5-xl": ("blip2_t5_instruct", "flant5xl"), - "instructblipvideo-flan-t5-xxl": ("blip2_t5_instruct", "flant5xxl"), - } - - name, type = model_name_to_original[model_name] - - # load original model - print("Loading original model...") - hf_model_device = "cuda:1" if torch.cuda.is_available() else "cpu" - lavis_device = "cuda:2" if torch.cuda.is_available() else "cpu" - original_model, vis_processors, _ = load_model_and_preprocess( - name=name, model_type=type, is_eval=True, device=lavis_device - ) - original_model.eval() - print("Done!") - - # update state dict keys - state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) - for src, dest in rename_keys: - rename_key(state_dict, src, dest) - - # some keys can be renamed efficiently - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if key.startswith("Qformer.bert"): - key = key.replace("Qformer.bert", "qformer") - if "attention.self" in key: - key = key.replace("self", "attention") - if "llm_proj" in key: - key = key.replace("llm_proj", "language_projection") - if "t5_proj" in key: - key = key.replace("t5_proj", "language_projection") - if key.startswith("llm_model"): - key = key.replace("llm_model", "language_model") - if key.startswith("t5"): - key = key.replace("t5", "language") - state_dict[key] = val - - # read in qv biases - read_in_q_v_bias(state_dict, config) - - # note: weights get loaded in torch.float32 by default - hf_model.load_state_dict(state_dict, strict=True) - - image = load_demo_image() - prompt = "What is unusual about this image?" - - # create processor - image_processor = BlipImageProcessor( - size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD - ) - processor = InstructBlipProcessor( - image_processor=image_processor, - tokenizer=tokenizer, - qformer_tokenizer=qformer_tokenizer, - ) - inputs = processor(images=image, text=prompt, return_tensors="pt").to(hf_model_device) - - # make sure processor creates exact same pixel values - original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - pixel_values = inputs.pixel_values - assert torch.allclose(original_pixel_values.to(pixel_values.device), pixel_values) - - original_model.to(lavis_device) - hf_model.to(hf_model_device) - with torch.no_grad(): - if "vicuna" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [prompt]}).logits - logits = hf_model(**inputs).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": [prompt], "text_output": ["\n"]} - ).logits - label_input_ids = tokenizer("\n", return_tensors="pt").input_ids.to(hf_model_device) - labels = label_input_ids.masked_fill(label_input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(**inputs, labels=labels).logits - - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - - # assert values - assert original_logits.shape == logits.shape - atol = 1e-4 if "vicuna" in model_name else 1e-5 - assert torch.allclose(original_logits.to(logits.device), logits, atol=atol) - print("Looks ok!") - - print("Generating with original model...") - original_outputs = original_model.generate({"image": original_pixel_values, "prompt": prompt}, num_beams=5) - - # important: we need to cast the weights of the HF model to the appropriate type - print("Generating with HF model...") - outputs = hf_model.generate( - **inputs, - do_sample=False, - num_beams=5, - max_length=256, - min_length=1, - top_p=0.9, - repetition_penalty=1.5, - length_penalty=1.0, - temperature=1, - ) - if "vicuna" in model_name: - # convert output id 0 to 2 (eos_token_id) - # TODO add this in the generate method? - outputs[outputs == 0] = 2 - print("Original generation:", original_outputs) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("HF generation:", output_text) - - if pytorch_dump_folder_path is not None: - processor.save_pretrained(pytorch_dump_folder_path) - hf_model.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - processor.push_to_hub(f"Salesforce/{model_name}") - hf_model.push_to_hub(f"Salesforce/{model_name}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - choices = [ - "instructblipvideo-vicuna-7b", - "instructblipvideo-vicuna-13b", - "instructblipvideo-flan-t5-xl", - "instructblipvideo-flan-t5-xxl", - ] - parser.add_argument( - "--model_name", - default="instructblipvideo-flan-t5-xl", - choices=choices, - type=str, - help="Path to hf config.json of model to convert", - ) - parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") - parser.add_argument( - "--push_to_hub", - action="store_true", - help="Whether to push the model and processor to the hub after converting", - ) - - args = parser.parse_args() - - convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py b/src/transformers/models/internvl/convert_internvl_weights_to_hf.py deleted file mode 100644 index f687a2e714..0000000000 --- a/src/transformers/models/internvl/convert_internvl_weights_to_hf.py +++ /dev/null @@ -1,421 +0,0 @@ -# coding=utf-8 -# Copyright 2025 HuggingFace Inc. team. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import gc -import os -import re -from typing import Optional - -import torch -from einops import rearrange - -from transformers import ( - AutoModel, - AutoTokenizer, - GenerationConfig, - GotOcr2ImageProcessorFast, - InternVLConfig, - InternVLForConditionalGeneration, - InternVLProcessor, - InternVLVisionConfig, - LlamaConfig, - Qwen2Config, -) - - -LM_TYPE_CORRESPONDENCE = { - "OpenGVLab/InternVL2_5-1B-MPO": "qwen2", - "OpenGVLab/InternVL2_5-2B-MPO": "llama", - "OpenGVLab/InternVL2_5-4B-MPO": "qwen2", - "OpenGVLab/InternVL2_5-8B-MPO": "llama", - "OpenGVLab/InternVL2_5-26B-MPO": "llama", - "OpenGVLab/InternVL2_5-38B-MPO": "qwen2", - "OpenGVLab/InternVL2_5-78B-MPO": "qwen2", - "OpenGVLab/InternVL3-1B": "qwen2", - "OpenGVLab/InternVL3-2B": "qwen2", - "OpenGVLab/InternVL3-8B": "qwen2", - "OpenGVLab/InternVL3-9B": "llama", - "OpenGVLab/InternVL3-14B": "qwen2", - "OpenGVLab/InternVL3-38B": "qwen2", - "OpenGVLab/InternVL3-78B": "qwen2", -} - -UNNECESSARY_CONFIG_KEYS = [ "_name_or_path", "_attn_implementation_autoset", "auto_map", "use_bfloat16", "use_flash_attn", "bias", "laux_allreduce", "moe_coeff_ratio", "moe_intermediate_size", "moe_output_scale", "noisy_gate_policy", "shared_expert_intermediate_size", "use_residual", "use_moe", "use_rts", "use_weighted_residual", "moe_config", "num_experts", "num_routed_experts", "num_shared_experts", "capacity_factor", "eval_capacity_factor", "drop_path_rate"] # fmt: skip - -# fmt: off -ORIGINAL_TO_CONVERTED_KEY_MAPPING_VISION = { - # Vision encoder mapping - r"vision_model": r"vision_tower", - r"layers": r"layer", - r"class_embedding": r"cls_token", - r"position_embedding": r"position_embeddings", - r"patch_embedding": r"patch_embeddings.projection", - r"ls(\d+)": r"lambda_\1", - r"attn.proj": r"attention.projection_layer", - r"attn.dropout": r"attention.projection_dropout", - r"attn": r"attention", - r"norm1": r"layernorm_before", - r"norm2": r"layernorm_after", - -} - -ORIGINAL_TO_CONVERTED_KEY_MAPPING_TEXT_LLAMA = { - # Vision encoder mapping - r"tok_embeddings": r"embed_tokens", - r"attention.wo": r"self_attn.o_proj", - r"feed_forward.w1": r"mlp.gate_proj", - r"feed_forward.w2": r"mlp.down_proj", - r"feed_forward.w3": r"mlp.up_proj", - r"attention_norm": r"input_layernorm", - r"ffn_norm": r"post_attention_layernorm", - r"output": r"lm_head", -} - -ORIGINAL_TO_CONVERTED_KEY_MAPPING_MULTI = { - # Vision encoder mapping - r"mlp1.0": r"multi_modal_projector.layer_norm", - r"mlp1.1": r"multi_modal_projector.linear_1", - r"mlp1.3": r"multi_modal_projector.linear_2", -} - - -chat_template = ( - "{% for message in messages %}" - "{{'<|im_start|>' + message['role'] + '\n'}}" - "{% if message['content'] is string %}" - "{{ message['content'] }}" - "{% else %}" - "{% for content in message['content'] %}" - "{% if content['type'] == 'image' %}" - "{{ '\n' }}" - "{% elif content['type'] == 'video' %}" - "{{ '