From 4975002df50c472cbb6f8ac3580e475f570606ab Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Wed, 23 Mar 2022 10:26:33 -0400
Subject: [PATCH] Reorganize file utils (#16264)

* Split file_utils in several submodules

* Fixes

* Add back more objects

* More fixes

* Who exactly decided to import that from there?

* Second suggestion to code with code review

* Revert wront move

* Fix imports

* Adapt all imports

* Adapt all imports everywhere

* Revert this import, will fix in a separate commit
---
 ISSUES.md                                     |    4 +-
 .../run_image_captioning_flax.py              |    2 +-
 .../flax/language-modeling/run_clm_flax.py    |    2 +-
 .../flax/language-modeling/run_mlm_flax.py    |    2 +-
 .../flax/language-modeling/run_t5_mlm_flax.py |    2 +-
 examples/flax/question-answering/run_qa.py    |    3 +-
 .../summarization/run_summarization_flax.py   |    2 +-
 .../flax/text-classification/run_flax_glue.py |    3 +-
 .../flax/token-classification/run_flax_ner.py |    3 +-
 .../flax/vision/run_image_classification.py   |    2 +-
 .../seq2seq/old_test_tatoeba_conversion.py    |    2 +-
 examples/legacy/seq2seq/seq2seq_trainer.py    |    2 +-
 examples/legacy/seq2seq/utils.py              |    2 +-
 .../language-modeling/run_clm_no_trainer.py   |    2 +-
 .../language-modeling/run_mlm_no_trainer.py   |    2 +-
 examples/pytorch/multiple-choice/run_swag.py  |    3 +-
 .../multiple-choice/run_swag_no_trainer.py    |    2 +-
 .../run_qa_beam_search_no_trainer.py          |    3 +-
 .../question-answering/run_qa_no_trainer.py   |    3 +-
 .../run_wav2vec2_pretraining_no_trainer.py    |    2 +-
 .../summarization/run_summarization.py        |    3 +-
 .../run_summarization_no_trainer.py           |    2 +-
 examples/pytorch/test_pytorch_examples.py     |    2 +-
 .../run_glue_no_trainer.py                    |    2 +-
 .../run_ner_no_trainer.py                     |    2 +-
 .../translation/run_translation_no_trainer.py |    2 +-
 .../tensorflow/multiple-choice/run_swag.py    |    3 +-
 .../tensorflow/question-answering/run_qa.py   |    3 +-
 .../summarization/run_summarization.py        |    3 +-
 .../run_text_classification.py                |    2 +-
 src/transformers/__init__.py                  |  147 +-
 src/transformers/benchmark/benchmark.py       |    3 +-
 src/transformers/benchmark/benchmark_args.py  |    3 +-
 .../benchmark/benchmark_args_tf.py            |    3 +-
 src/transformers/benchmark/benchmark_tf.py    |    3 +-
 src/transformers/benchmark/benchmark_utils.py |    3 +-
 .../commands/add_new_model_like.py            |    3 +-
 src/transformers/commands/env.py              |    2 +-
 src/transformers/commands/train.py            |    3 +-
 src/transformers/configuration_utils.py       |    4 +-
 src/transformers/convert_graph_to_onnx.py     |    2 +-
 .../convert_pytorch_checkpoint_to_tf2.py      |    3 +-
 src/transformers/convert_slow_tokenizer.py    |    2 +-
 src/transformers/data/data_collator.py        |    2 +-
 src/transformers/data/metrics/__init__.py     |    2 +-
 src/transformers/data/processors/glue.py      |    3 +-
 src/transformers/data/processors/squad.py     |    3 +-
 src/transformers/data/processors/utils.py     |    3 +-
 .../data/test_generation_utils.py             |    2 +-
 src/transformers/debug_utils.py               |    3 +-
 src/transformers/deepspeed.py                 |    3 +-
 src/transformers/dependency_versions_check.py |    2 +-
 src/transformers/dynamic_module_utils.py      |   10 +-
 .../feature_extraction_sequence_utils.py      |   12 +-
 src/transformers/feature_extraction_utils.py  |    8 +-
 src/transformers/file_utils.py                | 3160 +----------------
 src/transformers/generation_beam_search.py    |    2 +-
 .../generation_flax_logits_process.py         |    2 +-
 src/transformers/generation_flax_utils.py     |    3 +-
 src/transformers/generation_logits_process.py |    2 +-
 .../generation_stopping_criteria.py           |    2 +-
 .../generation_tf_logits_process.py           |    2 +-
 src/transformers/generation_tf_utils.py       |    3 +-
 src/transformers/generation_utils.py          |    3 +-
 src/transformers/image_utils.py               |    3 +-
 src/transformers/integrations.py              |    5 +-
 src/transformers/keras_callbacks.py           |    2 +-
 src/transformers/modelcard.py                 |   30 +-
 src/transformers/modeling_flax_outputs.py     |    2 +-
 src/transformers/modeling_flax_utils.py       |    8 +-
 src/transformers/modeling_outputs.py          |    2 +-
 src/transformers/modeling_tf_outputs.py       |    2 +-
 src/transformers/modeling_tf_pytorch_utils.py |    3 +-
 src/transformers/modeling_tf_utils.py         |   12 +-
 src/transformers/modeling_utils.py            |    6 +-
 src/transformers/models/albert/__init__.py    |    2 +-
 .../models/albert/modeling_albert.py          |   16 +-
 .../models/albert/modeling_flax_albert.py     |    3 +-
 .../models/albert/modeling_tf_albert.py       |   18 +-
 .../models/albert/tokenization_albert_fast.py |    3 +-
 src/transformers/models/auto/__init__.py      |    2 +-
 src/transformers/models/auto/auto_factory.py  |    3 +-
 .../models/auto/configuration_auto.py         |    3 +-
 .../models/auto/feature_extraction_auto.py    |    3 +-
 .../models/auto/processing_auto.py            |    3 +-
 .../models/auto/tokenization_auto.py          |    3 +-
 src/transformers/models/bart/__init__.py      |    2 +-
 .../models/bart/configuration_bart.py         |    3 +-
 src/transformers/models/bart/modeling_bart.py |   16 +-
 .../models/bart/modeling_flax_bart.py         |    3 +-
 .../models/bart/modeling_tf_bart.py           |   16 +-
 src/transformers/models/barthez/__init__.py   |    2 +-
 .../barthez/tokenization_barthez_fast.py      |    3 +-
 src/transformers/models/bartpho/__init__.py   |    2 +-
 src/transformers/models/beit/__init__.py      |    2 +-
 .../models/beit/feature_extraction_beit.py    |    3 +-
 src/transformers/models/beit/modeling_beit.py |   14 +-
 .../models/beit/modeling_flax_beit.py         |    2 +-
 src/transformers/models/bert/__init__.py      |    2 +-
 src/transformers/models/bert/modeling_bert.py |   16 +-
 .../models/bert/modeling_flax_bert.py         |    3 +-
 .../models/bert/modeling_tf_bert.py           |   20 +-
 .../models/bert_generation/__init__.py        |    2 +-
 .../modeling_bert_generation.py               |    8 +-
 .../models/bert_japanese/__init__.py          |    2 +-
 src/transformers/models/bertweet/__init__.py  |    2 +-
 src/transformers/models/big_bird/__init__.py  |    2 +-
 .../models/big_bird/modeling_big_bird.py      |   16 +-
 .../models/big_bird/modeling_flax_big_bird.py |    3 +-
 .../big_bird/tokenization_big_bird_fast.py    |    3 +-
 .../models/bigbird_pegasus/__init__.py        |    2 +-
 .../modeling_bigbird_pegasus.py               |   16 +-
 .../models/blenderbot/__init__.py             |    2 +-
 .../models/blenderbot/modeling_blenderbot.py  |   14 +-
 .../blenderbot/modeling_flax_blenderbot.py    |    3 +-
 .../blenderbot/modeling_tf_blenderbot.py      |   16 +-
 .../models/blenderbot_small/__init__.py       |    2 +-
 .../modeling_blenderbot_small.py              |   14 +-
 .../modeling_flax_blenderbot_small.py         |    3 +-
 .../modeling_tf_blenderbot_small.py           |   16 +-
 src/transformers/models/byt5/__init__.py      |    2 +-
 src/transformers/models/camembert/__init__.py |    2 +-
 .../models/camembert/modeling_camembert.py    |    3 +-
 .../models/camembert/modeling_tf_camembert.py |    3 +-
 .../camembert/tokenization_camembert_fast.py  |    3 +-
 src/transformers/models/canine/__init__.py    |    2 +-
 .../models/canine/modeling_canine.py          |    3 +-
 src/transformers/models/clip/__init__.py      |    2 +-
 .../models/clip/feature_extraction_clip.py    |    3 +-
 src/transformers/models/clip/modeling_clip.py |    8 +-
 .../models/clip/modeling_flax_clip.py         |    3 +-
 .../models/clip/modeling_tf_clip.py           |   14 +-
 src/transformers/models/convbert/__init__.py  |    2 +-
 .../models/convbert/modeling_convbert.py      |    3 +-
 .../models/convbert/modeling_tf_convbert.py   |   14 +-
 src/transformers/models/convnext/__init__.py  |    2 +-
 .../convnext/feature_extraction_convnext.py   |    3 +-
 .../models/convnext/modeling_convnext.py      |    6 +-
 .../models/convnext/modeling_tf_convnext.py   |    3 +-
 src/transformers/models/cpm/__init__.py       |    2 +-
 src/transformers/models/ctrl/__init__.py      |    2 +-
 src/transformers/models/ctrl/modeling_ctrl.py |    3 +-
 .../models/ctrl/modeling_tf_ctrl.py           |    3 +-
 src/transformers/models/data2vec/__init__.py  |    2 +-
 .../data2vec/modeling_data2vec_audio.py       |   10 +-
 .../models/data2vec/modeling_data2vec_text.py |   14 +-
 src/transformers/models/deberta/__init__.py   |    2 +-
 .../models/deberta/modeling_deberta.py        |    3 +-
 .../models/deberta/modeling_tf_deberta.py     |    3 +-
 .../models/deberta_v2/__init__.py             |    2 +-
 .../models/deberta_v2/modeling_deberta_v2.py  |    3 +-
 .../deberta_v2/modeling_tf_deberta_v2.py      |    3 +-
 src/transformers/models/deit/__init__.py      |    2 +-
 .../models/deit/feature_extraction_deit.py    |    3 +-
 src/transformers/models/deit/modeling_deit.py |    8 +-
 src/transformers/models/detr/__init__.py      |    2 +-
 .../models/detr/feature_extraction_detr.py    |    3 +-
 src/transformers/models/detr/modeling_detr.py |   10 +-
 ..._original_pytorch_checkpoint_to_pytorch.py |    2 +-
 .../models/distilbert/__init__.py             |    2 +-
 .../models/distilbert/modeling_distilbert.py  |   14 +-
 .../distilbert/modeling_flax_distilbert.py    |    3 +-
 .../distilbert/modeling_tf_distilbert.py      |   14 +-
 src/transformers/models/dpr/__init__.py       |    2 +-
 src/transformers/models/dpr/modeling_dpr.py   |    8 +-
 .../models/dpr/modeling_tf_dpr.py             |    8 +-
 .../models/dpr/tokenization_dpr.py            |    3 +-
 .../models/dpr/tokenization_dpr_fast.py       |    3 +-
 src/transformers/models/electra/__init__.py   |    2 +-
 .../models/electra/modeling_electra.py        |   16 +-
 .../models/electra/modeling_flax_electra.py   |    3 +-
 .../models/electra/modeling_tf_electra.py     |   20 +-
 .../models/encoder_decoder/__init__.py        |    2 +-
 .../modeling_encoder_decoder.py               |    3 +-
 .../modeling_flax_encoder_decoder.py          |    3 +-
 .../modeling_tf_encoder_decoder.py            |   10 +-
 src/transformers/models/flaubert/__init__.py  |    2 +-
 .../models/flaubert/modeling_flaubert.py      |    3 +-
 .../models/flaubert/modeling_tf_flaubert.py   |   14 +-
 src/transformers/models/fnet/__init__.py      |    2 +-
 src/transformers/models/fnet/modeling_fnet.py |   16 +-
 .../models/fnet/tokenization_fnet_fast.py     |    3 +-
 src/transformers/models/fsmt/__init__.py      |    2 +-
 ..._original_pytorch_checkpoint_to_pytorch.py |    3 +-
 src/transformers/models/fsmt/modeling_fsmt.py |   16 +-
 src/transformers/models/funnel/__init__.py    |    2 +-
 .../models/funnel/modeling_funnel.py          |   16 +-
 .../models/funnel/modeling_tf_funnel.py       |   18 +-
 src/transformers/models/glpn/__init__.py      |    2 +-
 .../models/glpn/feature_extraction_glpn.py    |    3 +-
 src/transformers/models/glpn/modeling_glpn.py |    8 +-
 src/transformers/models/gpt2/__init__.py      |    2 +-
 ..._gpt2_original_tf_checkpoint_to_pytorch.py |    3 +-
 .../models/gpt2/modeling_flax_gpt2.py         |    3 +-
 src/transformers/models/gpt2/modeling_gpt2.py |   16 +-
 .../models/gpt2/modeling_tf_gpt2.py           |   18 +-
 src/transformers/models/gpt_neo/__init__.py   |    2 +-
 .../models/gpt_neo/modeling_flax_gpt_neo.py   |    3 +-
 .../models/gpt_neo/modeling_gpt_neo.py        |    3 +-
 src/transformers/models/gptj/__init__.py      |    2 +-
 .../models/gptj/modeling_flax_gptj.py         |    3 +-
 src/transformers/models/gptj/modeling_gptj.py |    3 +-
 src/transformers/models/herbert/__init__.py   |    2 +-
 src/transformers/models/hubert/__init__.py    |    2 +-
 .../models/hubert/modeling_hubert.py          |   14 +-
 .../models/hubert/modeling_tf_hubert.py       |   14 +-
 src/transformers/models/ibert/__init__.py     |    2 +-
 .../models/ibert/modeling_ibert.py            |    3 +-
 src/transformers/models/imagegpt/__init__.py  |    2 +-
 ...onvert_imagegpt_original_tf2_to_pytorch.py |    3 +-
 .../imagegpt/feature_extraction_imagegpt.py   |    3 +-
 .../models/imagegpt/modeling_imagegpt.py      |    3 +-
 src/transformers/models/layoutlm/__init__.py  |    2 +-
 .../models/layoutlm/modeling_layoutlm.py      |    3 +-
 .../models/layoutlm/modeling_tf_layoutlm.py   |    3 +-
 .../models/layoutlmv2/__init__.py             |    2 +-
 .../layoutlmv2/configuration_layoutlmv2.py    |    3 +-
 .../feature_extraction_layoutlmv2.py          |    3 +-
 .../models/layoutlmv2/modeling_layoutlmv2.py  |   16 +-
 .../layoutlmv2/processing_layoutlmv2.py       |    2 +-
 .../layoutlmv2/tokenization_layoutlmv2.py     |    3 +-
 .../tokenization_layoutlmv2_fast.py           |    3 +-
 src/transformers/models/layoutxlm/__init__.py |    2 +-
 .../models/layoutxlm/processing_layoutxlm.py  |    2 +-
 .../layoutxlm/tokenization_layoutxlm.py       |    3 +-
 .../layoutxlm/tokenization_layoutxlm_fast.py  |    3 +-
 src/transformers/models/led/__init__.py       |    2 +-
 src/transformers/models/led/modeling_led.py   |   18 +-
 .../models/led/modeling_tf_led.py             |   16 +-
 .../models/led/tokenization_led.py            |    3 +-
 .../models/led/tokenization_led_fast.py       |    3 +-
 .../models/longformer/__init__.py             |    2 +-
 .../models/longformer/modeling_longformer.py  |   16 +-
 .../longformer/modeling_tf_longformer.py      |   16 +-
 src/transformers/models/luke/__init__.py      |    2 +-
 src/transformers/models/luke/modeling_luke.py |    8 +-
 .../models/luke/tokenization_luke.py          |    3 +-
 src/transformers/models/lxmert/__init__.py    |    2 +-
 .../models/lxmert/modeling_lxmert.py          |    6 +-
 .../models/lxmert/modeling_tf_lxmert.py       |    6 +-
 src/transformers/models/m2m_100/__init__.py   |    2 +-
 .../models/m2m_100/configuration_m2m_100.py   |    3 +-
 .../models/m2m_100/modeling_m2m_100.py        |   16 +-
 src/transformers/models/marian/__init__.py    |    2 +-
 .../models/marian/configuration_marian.py     |    3 +-
 .../models/marian/modeling_flax_marian.py     |    3 +-
 .../models/marian/modeling_marian.py          |   14 +-
 .../models/marian/modeling_tf_marian.py       |   16 +-
 .../models/maskformer/__init__.py             |    2 +-
 .../feature_extraction_maskformer.py          |    3 +-
 .../models/maskformer/modeling_maskformer.py  |    6 +-
 src/transformers/models/mbart/__init__.py     |    2 +-
 .../models/mbart/configuration_mbart.py       |    3 +-
 .../models/mbart/modeling_flax_mbart.py       |    3 +-
 .../models/mbart/modeling_mbart.py            |   16 +-
 .../models/mbart/modeling_tf_mbart.py         |   16 +-
 .../models/mbart/tokenization_mbart_fast.py   |    3 +-
 src/transformers/models/mbart50/__init__.py   |    2 +-
 .../mbart50/tokenization_mbart50_fast.py      |    3 +-
 .../models/megatron_bert/__init__.py          |    2 +-
 .../megatron_bert/modeling_megatron_bert.py   |   16 +-
 src/transformers/models/mluke/__init__.py     |    2 +-
 .../models/mluke/tokenization_mluke.py        |    3 +-
 src/transformers/models/mmbt/__init__.py      |    2 +-
 src/transformers/models/mmbt/modeling_mmbt.py |    3 +-
 .../models/mobilebert/__init__.py             |    2 +-
 .../models/mobilebert/modeling_mobilebert.py  |   16 +-
 .../mobilebert/modeling_tf_mobilebert.py      |   18 +-
 src/transformers/models/mpnet/__init__.py     |    2 +-
 .../models/mpnet/modeling_mpnet.py            |    3 +-
 .../models/mpnet/modeling_tf_mpnet.py         |   14 +-
 src/transformers/models/mt5/__init__.py       |    2 +-
 .../models/nystromformer/__init__.py          |    2 +-
 .../nystromformer/modeling_nystromformer.py   |    3 +-
 src/transformers/models/openai/__init__.py    |    2 +-
 ...penai_original_tf_checkpoint_to_pytorch.py |    3 +-
 .../models/openai/modeling_openai.py          |   16 +-
 .../models/openai/modeling_tf_openai.py       |   16 +-
 src/transformers/models/pegasus/__init__.py   |    2 +-
 .../models/pegasus/modeling_flax_pegasus.py   |    3 +-
 .../models/pegasus/modeling_pegasus.py        |   14 +-
 .../models/pegasus/modeling_tf_pegasus.py     |   16 +-
 .../pegasus/tokenization_pegasus_fast.py      |    3 +-
 src/transformers/models/perceiver/__init__.py |    2 +-
 .../perceiver/feature_extraction_perceiver.py |    3 +-
 .../models/perceiver/modeling_perceiver.py    |   14 +-
 src/transformers/models/phobert/__init__.py   |    2 +-
 src/transformers/models/plbart/__init__.py    |    2 +-
 .../models/plbart/modeling_plbart.py          |   16 +-
 .../models/poolformer/__init__.py             |    2 +-
 .../feature_extraction_poolformer.py          |    3 +-
 .../models/poolformer/modeling_poolformer.py  |    6 +-
 .../models/prophetnet/__init__.py             |    2 +-
 .../models/prophetnet/modeling_prophetnet.py  |    8 +-
 src/transformers/models/qdqbert/__init__.py   |    2 +-
 .../models/qdqbert/modeling_qdqbert.py        |   18 +-
 src/transformers/models/rag/__init__.py       |    2 +-
 .../models/rag/configuration_rag.py           |    2 +-
 src/transformers/models/rag/modeling_rag.py   |    3 +-
 .../models/rag/modeling_tf_rag.py             |    3 +-
 src/transformers/models/rag/retrieval_rag.py  |    3 +-
 src/transformers/models/realm/__init__.py     |    2 +-
 .../models/realm/modeling_realm.py            |    3 +-
 .../models/realm/tokenization_realm.py        |    3 +-
 .../models/realm/tokenization_realm_fast.py   |    3 +-
 src/transformers/models/reformer/__init__.py  |    2 +-
 .../models/reformer/modeling_reformer.py      |    8 +-
 .../reformer/tokenization_reformer_fast.py    |    3 +-
 src/transformers/models/rembert/__init__.py   |    2 +-
 .../models/rembert/modeling_rembert.py        |   14 +-
 .../models/rembert/modeling_tf_rembert.py     |   16 +-
 .../rembert/tokenization_rembert_fast.py      |    3 +-
 src/transformers/models/resnet/__init__.py    |    2 +-
 .../models/resnet/modeling_resnet.py          |    3 +-
 src/transformers/models/retribert/__init__.py |    2 +-
 .../models/retribert/modeling_retribert.py    |    3 +-
 src/transformers/models/roberta/__init__.py   |    2 +-
 .../models/roberta/modeling_flax_roberta.py   |    3 +-
 .../models/roberta/modeling_roberta.py        |   14 +-
 .../models/roberta/modeling_tf_roberta.py     |   16 +-
 src/transformers/models/roformer/__init__.py  |    2 +-
 .../models/roformer/modeling_flax_roformer.py |    3 +-
 .../models/roformer/modeling_roformer.py      |   14 +-
 .../models/roformer/modeling_tf_roformer.py   |   14 +-
 src/transformers/models/segformer/__init__.py |    2 +-
 .../segformer/feature_extraction_segformer.py |    3 +-
 .../models/segformer/modeling_segformer.py    |    8 +-
 src/transformers/models/sew/__init__.py       |    2 +-
 src/transformers/models/sew/modeling_sew.py   |    3 +-
 src/transformers/models/sew_d/__init__.py     |    2 +-
 .../models/sew_d/modeling_sew_d.py            |    3 +-
 .../models/speech_encoder_decoder/__init__.py |    2 +-
 .../modeling_flax_speech_encoder_decoder.py   |    3 +-
 .../modeling_speech_encoder_decoder.py        |    3 +-
 .../models/speech_to_text/__init__.py         |    8 +-
 .../feature_extraction_speech_to_text.py      |    3 +-
 .../speech_to_text/modeling_speech_to_text.py |    3 +-
 .../modeling_tf_speech_to_text.py             |   14 +-
 .../models/speech_to_text_2/__init__.py       |    2 +-
 .../modeling_speech_to_text_2.py              |    3 +-
 src/transformers/models/splinter/__init__.py  |    2 +-
 .../models/splinter/modeling_splinter.py      |    3 +-
 .../models/squeezebert/__init__.py            |    2 +-
 .../squeezebert/modeling_squeezebert.py       |    3 +-
 src/transformers/models/swin/__init__.py      |    2 +-
 src/transformers/models/swin/modeling_swin.py |    6 +-
 src/transformers/models/t5/__init__.py        |    2 +-
 .../models/t5/modeling_flax_t5.py             |    3 +-
 src/transformers/models/t5/modeling_t5.py     |   18 +-
 src/transformers/models/t5/modeling_tf_t5.py  |   16 +-
 .../models/t5/tokenization_t5_fast.py         |    3 +-
 src/transformers/models/tapas/__init__.py     |    2 +-
 .../models/tapas/modeling_tapas.py            |   18 +-
 .../models/tapas/modeling_tf_tapas.py         |   18 +-
 .../models/tapas/tokenization_tapas.py        |    3 +-
 .../models/transfo_xl/__init__.py             |    2 +-
 ...fo_xl_original_tf_checkpoint_to_pytorch.py |    3 +-
 .../transfo_xl/modeling_tf_transfo_xl.py      |   14 +-
 .../models/transfo_xl/modeling_transfo_xl.py  |    6 +-
 .../transfo_xl/tokenization_transfo_xl.py     |    3 +-
 src/transformers/models/trocr/__init__.py     |    2 +-
 .../models/trocr/modeling_trocr.py            |    3 +-
 src/transformers/models/unispeech/__init__.py |    2 +-
 .../models/unispeech/modeling_unispeech.py    |   10 +-
 .../models/unispeech_sat/__init__.py          |    2 +-
 .../unispeech_sat/modeling_unispeech_sat.py   |   10 +-
 src/transformers/models/van/__init__.py       |    2 +-
 src/transformers/models/van/modeling_van.py   |    6 +-
 src/transformers/models/vilt/__init__.py      |    2 +-
 .../models/vilt/feature_extraction_vilt.py    |    3 +-
 src/transformers/models/vilt/modeling_vilt.py |    3 +-
 .../models/vilt/processing_vilt.py            |    2 +-
 .../models/vision_encoder_decoder/__init__.py |    2 +-
 .../modeling_flax_vision_encoder_decoder.py   |    3 +-
 .../modeling_tf_vision_encoder_decoder.py     |   10 +-
 .../modeling_vision_encoder_decoder.py        |    3 +-
 .../vision_text_dual_encoder/__init__.py      |    2 +-
 .../modeling_flax_vision_text_dual_encoder.py |    3 +-
 .../modeling_vision_text_dual_encoder.py      |    3 +-
 .../models/visual_bert/__init__.py            |    2 +-
 .../visual_bert/modeling_visual_bert.py       |   14 +-
 src/transformers/models/vit/__init__.py       |    2 +-
 .../models/vit/feature_extraction_vit.py      |    3 +-
 .../models/vit/modeling_flax_vit.py           |    2 +-
 .../models/vit/modeling_tf_vit.py             |    3 +-
 src/transformers/models/vit/modeling_vit.py   |    8 +-
 src/transformers/models/vit_mae/__init__.py   |    2 +-
 .../models/vit_mae/modeling_vit_mae.py        |    8 +-
 src/transformers/models/wav2vec2/__init__.py  |    2 +-
 .../wav2vec2/feature_extraction_wav2vec2.py   |    3 +-
 .../models/wav2vec2/modeling_flax_wav2vec2.py |    3 +-
 .../models/wav2vec2/modeling_tf_wav2vec2.py   |   14 +-
 .../models/wav2vec2/modeling_wav2vec2.py      |   16 +-
 .../models/wav2vec2/tokenization_wav2vec2.py  |    8 +-
 .../models/wav2vec2_phoneme/__init__.py       |    2 +-
 .../tokenization_wav2vec2_phoneme.py          |    8 +-
 .../models/wav2vec2_with_lm/__init__.py       |    2 +-
 .../processing_wav2vec2_with_lm.py            |    2 +-
 src/transformers/models/wavlm/__init__.py     |    2 +-
 .../models/wavlm/modeling_wavlm.py            |   10 +-
 src/transformers/models/xglm/__init__.py      |    2 +-
 .../models/xglm/modeling_flax_xglm.py         |    3 +-
 src/transformers/models/xglm/modeling_xglm.py |    3 +-
 .../models/xglm/tokenization_xglm_fast.py     |    3 +-
 src/transformers/models/xlm/__init__.py       |    2 +-
 ..._original_pytorch_checkpoint_to_pytorch.py |    3 +-
 .../models/xlm/modeling_tf_xlm.py             |   16 +-
 src/transformers/models/xlm/modeling_xlm.py   |   16 +-
 .../models/xlm_prophetnet/__init__.py         |    2 +-
 .../models/xlm_roberta/__init__.py            |    2 +-
 .../xlm_roberta/modeling_flax_xlm_roberta.py  |    3 +-
 .../xlm_roberta/modeling_tf_xlm_roberta.py    |    3 +-
 .../xlm_roberta/modeling_xlm_roberta.py       |    3 +-
 .../tokenization_xlm_roberta_fast.py          |    3 +-
 .../models/xlm_roberta_xl/__init__.py         |    2 +-
 .../xlm_roberta_xl/modeling_xlm_roberta_xl.py |   14 +-
 src/transformers/models/xlnet/__init__.py     |    2 +-
 ...xlnet_original_tf_checkpoint_to_pytorch.py |    3 +-
 .../models/xlnet/modeling_tf_xlnet.py         |   18 +-
 .../models/xlnet/modeling_xlnet.py            |   16 +-
 .../models/xlnet/tokenization_xlnet.py        |    3 +-
 .../models/xlnet/tokenization_xlnet_fast.py   |    3 +-
 src/transformers/models/yoso/__init__.py      |    2 +-
 src/transformers/models/yoso/modeling_yoso.py |    3 +-
 src/transformers/onnx/__init__.py             |    2 +-
 src/transformers/onnx/config.py               |    5 +-
 src/transformers/onnx/convert.py              |   11 +-
 src/transformers/pipelines/__init__.py        |    3 +-
 .../pipelines/audio_classification.py         |    3 +-
 .../pipelines/automatic_speech_recognition.py |    3 +-
 src/transformers/pipelines/base.py            |    3 +-
 src/transformers/pipelines/conversational.py  |    3 +-
 src/transformers/pipelines/fill_mask.py       |    3 +-
 .../pipelines/image_classification.py         |    4 +-
 .../pipelines/image_segmentation.py           |    3 +-
 .../pipelines/object_detection.py             |    3 +-
 .../pipelines/question_answering.py           |    3 +-
 .../pipelines/table_question_answering.py     |    2 +-
 .../pipelines/text2text_generation.py         |    3 +-
 .../pipelines/text_classification.py          |    2 +-
 src/transformers/pipelines/text_generation.py |    2 +-
 .../pipelines/token_classification.py         |    2 +-
 .../pipelines/zero_shot_classification.py     |    3 +-
 .../zero_shot_image_classification.py         |    4 +-
 src/transformers/processing_utils.py          |    3 +-
 .../sagemaker/training_args_sm.py             |    3 +-
 src/transformers/testing_utils.py             |    4 +-
 src/transformers/tokenization_utils.py        |    3 +-
 src/transformers/tokenization_utils_base.py   |   10 +-
 src/transformers/tokenization_utils_fast.py   |    3 +-
 src/transformers/trainer.py                   |   24 +-
 src/transformers/trainer_pt_utils.py          |    6 +-
 src/transformers/trainer_tf.py                |    2 +-
 src/transformers/trainer_utils.py             |    2 +-
 src/transformers/training_args.py             |    6 +-
 src/transformers/training_args_seq2seq.py     |    2 +-
 src/transformers/training_args_tf.py          |    3 +-
 src/transformers/utils/__init__.py            |  134 +
 src/transformers/utils/doc.py                 |  936 +++++
 .../utils/dummy_detectron2_objects.py         |    2 +-
 src/transformers/utils/dummy_flax_objects.py  |    2 +-
 src/transformers/utils/dummy_pt_objects.py    |    2 +-
 .../utils/dummy_scatter_objects.py            |    2 +-
 .../dummy_sentencepiece_and_speech_objects.py |    2 +-
 ...my_sentencepiece_and_tokenizers_objects.py |    2 +-
 .../utils/dummy_sentencepiece_objects.py      |    2 +-
 .../utils/dummy_speech_objects.py             |    2 +-
 src/transformers/utils/dummy_tf_objects.py    |    2 +-
 .../utils/dummy_timm_and_vision_objects.py    |    2 +-
 src/transformers/utils/dummy_timm_objects.py  |    2 +-
 .../utils/dummy_tokenizers_objects.py         |    2 +-
 .../utils/dummy_vision_objects.py             |    2 +-
 src/transformers/utils/fx.py                  |    3 +-
 src/transformers/utils/generic.py             |  291 ++
 src/transformers/utils/hub.py                 | 1023 ++++++
 src/transformers/utils/import_utils.py        |  864 +++++
 .../__init__.py                               |    8 +-
 ...ax_{{cookiecutter.lowercase_modelname}}.py |    4 +-
 ...tf_{{cookiecutter.lowercase_modelname}}.py |    4 +-
 ...ng_{{cookiecutter.lowercase_modelname}}.py |    4 +-
 ...ng_{{cookiecutter.lowercase_modelname}}.py |    2 +-
 tests/auto/test_processor_auto.py             |    2 +-
 tests/bart/test_modeling_bart.py              |    2 +-
 tests/bart/test_modeling_tf_bart.py           |    2 +-
 tests/bart/test_tokenization_bart.py          |    2 +-
 tests/beit/test_feature_extraction_beit.py    |    2 +-
 tests/beit/test_modeling_beit.py              |    2 +-
 tests/beit/test_modeling_flax_beit.py         |    2 +-
 .../test_tokenization_bert_generation.py      |    2 +-
 tests/big_bird/test_tokenization_big_bird.py  |    2 +-
 tests/blenderbot/test_modeling_blenderbot.py  |    2 +-
 .../blenderbot/test_modeling_tf_blenderbot.py |    2 +-
 .../test_tokenization_blenderbot.py           |    2 +-
 .../test_modeling_blenderbot_small.py         |    2 +-
 .../test_modeling_tf_blenderbot_small.py      |    2 +-
 tests/byt5/test_tokenization_byt5.py          |    2 +-
 .../camembert/test_tokenization_camembert.py  |    2 +-
 tests/canine/test_tokenization_canine.py      |    2 +-
 tests/clip/test_feature_extraction_clip.py    |    2 +-
 tests/clip/test_modeling_clip.py              |    2 +-
 tests/clip/test_modeling_tf_clip.py           |    2 +-
 tests/clip/test_processor_clip.py             |    2 +-
 .../test_feature_extraction_convnext.py       |    2 +-
 tests/convnext/test_modeling_convnext.py      |    2 +-
 tests/convnext/test_modeling_tf_convnext.py   |    2 +-
 tests/deepspeed/test_deepspeed.py             |    2 +-
 tests/deit/test_feature_extraction_deit.py    |    2 +-
 tests/deit/test_modeling_deit.py              |    2 +-
 tests/detr/test_feature_extraction_detr.py    |    2 +-
 tests/detr/test_modeling_detr.py              |    2 +-
 tests/extended/test_trainer_ext.py            |    2 +-
 tests/fsmt/test_modeling_fsmt.py              |    2 +-
 tests/fsmt/test_tokenization_fsmt.py          |    2 +-
 tests/glpn/test_feature_extraction_glpn.py    |    2 +-
 tests/gpt_neo/test_modeling_gpt_neo.py        |    2 +-
 .../test_feature_extraction_imagegpt.py       |    2 +-
 tests/imagegpt/test_modeling_imagegpt.py      |    2 +-
 .../test_feature_extraction_layoutlmv2.py     |    2 +-
 tests/layoutlmv2/test_modeling_layoutlmv2.py  |    2 +-
 tests/layoutlmv2/test_processor_layoutlmv2.py |    2 +-
 tests/layoutxlm/test_processor_layoutxlm.py   |    2 +-
 tests/led/test_modeling_led.py                |    2 +-
 tests/m2m_100/test_modeling_m2m_100.py        |    2 +-
 tests/m2m_100/test_tokenization_m2m_100.py    |    2 +-
 tests/marian/test_modeling_flax_marian.py     |    2 +-
 tests/marian/test_modeling_marian.py          |    2 +-
 tests/marian/test_modeling_tf_marian.py       |    2 +-
 tests/marian/test_tokenization_marian.py      |    2 +-
 .../test_feature_extraction_maskformer.py     |    2 +-
 tests/maskformer/test_modeling_maskformer.py  |    2 +-
 tests/mbart/test_modeling_flax_mbart.py       |    2 +-
 tests/mbart/test_modeling_mbart.py            |    2 +-
 tests/mbart/test_modeling_tf_mbart.py         |    2 +-
 tests/onnx/test_onnx_v2.py                    |    2 +-
 tests/pegasus/test_modeling_pegasus.py        |    2 +-
 tests/pegasus/test_modeling_tf_pegasus.py     |    2 +-
 tests/pegasus/test_tokenization_pegasus.py    |    2 +-
 tests/perceiver/test_modeling_perceiver.py    |    2 +-
 .../perceiver/test_tokenization_perceiver.py  |    2 +-
 tests/plbart/test_modeling_plbart.py          |    2 +-
 .../test_feature_extraction_poolformer.py     |    2 +-
 tests/rag/test_modeling_rag.py                |    2 +-
 tests/rag/test_modeling_tf_rag.py             |    2 +-
 tests/rag/test_tokenization_rag.py            |    2 +-
 tests/reformer/test_tokenization_reformer.py  |    2 +-
 tests/resnet/test_modeling_resnet.py          |    2 +-
 .../test_feature_extraction_segformer.py      |    2 +-
 .../test_modeling_speech_to_text.py           |    2 +-
 .../test_modeling_tf_speech_to_text.py        |    2 +-
 .../test_processor_speech_to_text.py          |    2 +-
 tests/swin/test_modeling_swin.py              |    2 +-
 tests/t5/test_modeling_t5.py                  |    2 +-
 tests/t5/test_modeling_tf_t5.py               |    2 +-
 tests/t5/test_tokenization_t5.py              |    2 +-
 tests/tapas/test_modeling_tapas.py            |    2 +-
 tests/tapas/test_modeling_tf_tapas.py         |    2 +-
 tests/test_feature_extraction_common.py       |    2 +-
 tests/test_modeling_common.py                 |    2 +-
 tests/trainer/test_trainer.py                 |    2 +-
 tests/trainer/test_trainer_seq2seq.py         |    2 +-
 tests/trainer/test_trainer_utils.py           |    2 +-
 tests/utils/test_add_new_model_like.py        |   16 +-
 tests/utils/test_file_utils.py                |    4 +-
 tests/utils/test_model_output.py              |    2 +-
 tests/van/test_modeling_van.py                |    2 +-
 tests/vilt/test_feature_extraction_vilt.py    |    2 +-
 tests/vilt/test_modeling_vilt.py              |    2 +-
 .../test_modeling_vision_encoder_decoder.py   |    2 +-
 ..._modeling_flax_vision_text_dual_encoder.py |    2 +-
 .../test_modeling_vision_text_dual_encoder.py |    2 +-
 ...test_processor_vision_text_dual_encoder.py |    2 +-
 tests/vit/test_feature_extraction_vit.py      |    2 +-
 tests/vit/test_modeling_tf_vit.py             |    2 +-
 tests/vit/test_modeling_vit.py                |    2 +-
 tests/vit_mae/test_modeling_vit_mae.py        |    2 +-
 tests/wav2vec2/test_modeling_tf_wav2vec2.py   |    2 +-
 tests/wav2vec2/test_processor_wav2vec2.py     |    2 +-
 .../test_processor_wav2vec2_with_lm.py        |    2 +-
 tests/xglm/test_tokenization_xglm.py          |    2 +-
 .../test_tokenization_xlm_prophetnet.py       |    2 +-
 .../test_tokenization_xlm_roberta.py          |    2 +-
 utils/check_dummies.py                        |    2 +-
 utils/check_repo.py                           |    2 +-
 583 files changed, 4670 insertions(+), 4514 deletions(-)
 create mode 100644 src/transformers/utils/doc.py
 create mode 100644 src/transformers/utils/generic.py
 create mode 100644 src/transformers/utils/hub.py
 create mode 100644 src/transformers/utils/import_utils.py

diff --git a/ISSUES.md b/ISSUES.md
index fa0c896100..5985dad3d7 100644
--- a/ISSUES.md
+++ b/ISSUES.md
@@ -71,7 +71,7 @@ You are not required to read the following guidelines before opening an issue. H
      File "/transformers/src/transformers/__init__.py", line 34, in <module>
        from . import dependency_versions_check
      File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
-       from .file_utils import is_tokenizers_available
+       from .utils import is_tokenizers_available
      File "/transformers/src/transformers/file_utils.py", line 40, in <module>
        from tqdm.auto import tqdm
     ModuleNotFoundError: No module named 'tqdm.auto'
@@ -124,7 +124,7 @@ You are not required to read the following guidelines before opening an issue. H
      File "/transformers/src/transformers/__init__.py", line 34, in <module>
        from . import dependency_versions_check
      File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
-       from .file_utils import is_tokenizers_available
+       from .utils import is_tokenizers_available
      File "/transformers/src/transformers/file_utils.py", line 40, in <module>
        from tqdm.auto import tqdm
    ModuleNotFoundError: No module named 'tqdm.auto'
diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py
index c2772db03a..4e31da6b9e 100644
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -52,7 +52,7 @@ from transformers import (
     HfArgumentParser,
     is_tensorboard_available,
 )
-from transformers.file_utils import get_full_repo_name, is_offline_mode
+from transformers.utils import get_full_repo_name, is_offline_mode
 
 
 logger = logging.getLogger(__name__)
diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py
index 7746400d09..bbcbc8268b 100755
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -57,8 +57,8 @@ from transformers import (
     is_tensorboard_available,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
 from transformers.testing_utils import CaptureLogger
+from transformers.utils import get_full_repo_name
 
 
 logger = logging.getLogger(__name__)
diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py
index 2a65b6f692..daa247ecaa 100755
--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -58,7 +58,7 @@ from transformers import (
     is_tensorboard_available,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
+from transformers.utils import get_full_repo_name
 
 
 MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py
index 83ef2dbc30..6fd5636ce1 100755
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -56,8 +56,8 @@ from transformers import (
     is_tensorboard_available,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
 from transformers.models.t5.modeling_flax_t5 import shift_tokens_right
+from transformers.utils import get_full_repo_name
 
 
 MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py
index 24dd62e9d5..5d21e882e5 100644
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@@ -53,8 +53,7 @@ from transformers import (
     PreTrainedTokenizerFast,
     is_tensorboard_available,
 )
-from transformers.file_utils import get_full_repo_name
-from transformers.utils import check_min_version
+from transformers.utils import check_min_version, get_full_repo_name
 from utils_qa import postprocess_qa_predictions
 
 
diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py
index 9bb43b89a4..cfcf2e63df 100644
--- a/examples/flax/summarization/run_summarization_flax.py
+++ b/examples/flax/summarization/run_summarization_flax.py
@@ -54,7 +54,7 @@ from transformers import (
     HfArgumentParser,
     is_tensorboard_available,
 )
-from transformers.file_utils import get_full_repo_name, is_offline_mode
+from transformers.utils import get_full_repo_name, is_offline_mode
 
 
 logger = logging.getLogger(__name__)
diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py
index e225c45574..d56d23d273 100755
--- a/examples/flax/text-classification/run_flax_glue.py
+++ b/examples/flax/text-classification/run_flax_glue.py
@@ -48,8 +48,7 @@ from transformers import (
     TrainingArguments,
     is_tensorboard_available,
 )
-from transformers.file_utils import get_full_repo_name
-from transformers.utils import check_min_version
+from transformers.utils import check_min_version, get_full_repo_name
 
 
 logger = logging.getLogger(__name__)
diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py
index 8f6fb0c51e..abf1b8d0c1 100644
--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@@ -47,8 +47,7 @@ from transformers import (
     HfArgumentParser,
     is_tensorboard_available,
 )
-from transformers.file_utils import get_full_repo_name
-from transformers.utils import check_min_version
+from transformers.utils import check_min_version, get_full_repo_name
 from transformers.utils.versions import require_version
 
 
diff --git a/examples/flax/vision/run_image_classification.py b/examples/flax/vision/run_image_classification.py
index 5c6d00200a..7459d24c63 100644
--- a/examples/flax/vision/run_image_classification.py
+++ b/examples/flax/vision/run_image_classification.py
@@ -53,7 +53,7 @@ from transformers import (
     is_tensorboard_available,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
+from transformers.utils import get_full_repo_name
 
 
 logger = logging.getLogger(__name__)
diff --git a/examples/legacy/seq2seq/old_test_tatoeba_conversion.py b/examples/legacy/seq2seq/old_test_tatoeba_conversion.py
index b5b7e56f61..b9733daf85 100644
--- a/examples/legacy/seq2seq/old_test_tatoeba_conversion.py
+++ b/examples/legacy/seq2seq/old_test_tatoeba_conversion.py
@@ -16,9 +16,9 @@ import os
 import tempfile
 import unittest
 
-from transformers.file_utils import cached_property
 from transformers.models.marian.convert_marian_tatoeba_to_pytorch import DEFAULT_REPO, TatoebaConverter
 from transformers.testing_utils import slow
+from transformers.utils import cached_property
 
 
 @unittest.skipUnless(os.path.exists(DEFAULT_REPO), "Tatoeba directory does not exist.")
diff --git a/examples/legacy/seq2seq/seq2seq_trainer.py b/examples/legacy/seq2seq/seq2seq_trainer.py
index 3ddc798b68..1c2d7924a4 100644
--- a/examples/legacy/seq2seq/seq2seq_trainer.py
+++ b/examples/legacy/seq2seq/seq2seq_trainer.py
@@ -19,7 +19,6 @@ from torch import nn
 from torch.utils.data import DistributedSampler, RandomSampler
 
 from transformers import PreTrainedModel, Trainer, logging
-from transformers.file_utils import is_torch_tpu_available
 from transformers.integrations import is_fairscale_available
 from transformers.models.fsmt.configuration_fsmt import FSMTConfig
 from transformers.optimization import (
@@ -34,6 +33,7 @@ from transformers.optimization import (
 )
 from transformers.trainer_pt_utils import get_tpu_sampler
 from transformers.training_args import ParallelMode
+from transformers.utils import is_torch_tpu_available
 
 
 if is_fairscale_available():
diff --git a/examples/legacy/seq2seq/utils.py b/examples/legacy/seq2seq/utils.py
index 2b4700e9f7..2e0586a269 100644
--- a/examples/legacy/seq2seq/utils.py
+++ b/examples/legacy/seq2seq/utils.py
@@ -34,8 +34,8 @@ from torch.utils.data import Dataset, Sampler
 
 from sentence_splitter import add_newline_to_end_of_each_sentence
 from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer
-from transformers.file_utils import cached_property
 from transformers.models.bart.modeling_bart import shift_tokens_right
+from transformers.utils import cached_property
 
 
 try:
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 9d8820a56d..f2f678dd8a 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -51,7 +51,7 @@ from transformers import (
     get_scheduler,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
+from transformers.utils import get_full_repo_name
 from transformers.utils.versions import require_version
 
 
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index adadb4d2fd..5d2ea0d4ea 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -51,7 +51,7 @@ from transformers import (
     get_scheduler,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
+from transformers.utils import get_full_repo_name
 from transformers.utils.versions import require_version
 
 
diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py
index cbd7e90d88..a72ffde3f7 100755
--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@@ -41,10 +41,9 @@ from transformers import (
     default_data_collator,
     set_seed,
 )
-from transformers.file_utils import PaddingStrategy
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import check_min_version
+from transformers.utils import PaddingStrategy, check_min_version
 
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index 7daad8f385..4c60e72f3c 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -50,7 +50,7 @@ from transformers import (
     get_scheduler,
     set_seed,
 )
-from transformers.file_utils import PaddingStrategy, get_full_repo_name
+from transformers.utils import PaddingStrategy, get_full_repo_name
 
 
 logger = logging.getLogger(__name__)
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index 61f5d12fbf..939724bd60 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -47,8 +47,7 @@ from transformers import (
     get_scheduler,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
-from transformers.utils import check_min_version
+from transformers.utils import check_min_version, get_full_repo_name
 from transformers.utils.versions import require_version
 from utils_qa import postprocess_qa_predictions_with_beam_search
 
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 336327abf3..2129ed9767 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -49,8 +49,7 @@ from transformers import (
     get_scheduler,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
-from transformers.utils import check_min_version
+from transformers.utils import check_min_version, get_full_repo_name
 from transformers.utils.versions import require_version
 from utils_qa import postprocess_qa_predictions
 
diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
index fdce3a22f4..51ac519118 100755
--- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
@@ -42,8 +42,8 @@ from transformers import (
     is_wandb_available,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
 from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices
+from transformers.utils import get_full_repo_name
 
 
 logger = logging.getLogger(__name__)
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index a60868104a..8d8e355e4f 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -45,9 +45,8 @@ from transformers import (
     Seq2SeqTrainingArguments,
     set_seed,
 )
-from transformers.file_utils import is_offline_mode
 from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import check_min_version
+from transformers.utils import check_min_version, is_offline_mode
 from transformers.utils.versions import require_version
 
 
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 2036d57e55..d9b8657d26 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -49,7 +49,7 @@ from transformers import (
     get_scheduler,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name, is_offline_mode
+from transformers.utils import get_full_repo_name, is_offline_mode
 from transformers.utils.versions import require_version
 
 
diff --git a/examples/pytorch/test_pytorch_examples.py b/examples/pytorch/test_pytorch_examples.py
index 8039bebf2f..15dcfd011f 100644
--- a/examples/pytorch/test_pytorch_examples.py
+++ b/examples/pytorch/test_pytorch_examples.py
@@ -25,8 +25,8 @@ from unittest.mock import patch
 import torch
 
 from transformers import ViTMAEForPreTraining, Wav2Vec2ForPreTraining
-from transformers.file_utils import is_apex_available
 from transformers.testing_utils import CaptureLogger, TestCasePlus, get_gpu_count, slow, torch_device
+from transformers.utils import is_apex_available
 
 
 SRC_DIRS = [
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index 43de2c4060..4dac18206b 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -40,7 +40,7 @@ from transformers import (
     get_scheduler,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
+from transformers.utils import get_full_repo_name
 from transformers.utils.versions import require_version
 
 
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index e292331ea4..90b01d6deb 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -48,7 +48,7 @@ from transformers import (
     get_scheduler,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
+from transformers.utils import get_full_repo_name
 from transformers.utils.versions import require_version
 
 
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index 838957a8f1..34c1bcd0e7 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -50,7 +50,7 @@ from transformers import (
     get_scheduler,
     set_seed,
 )
-from transformers.file_utils import get_full_repo_name
+from transformers.utils import get_full_repo_name
 from transformers.utils.versions import require_version
 
 
diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py
index 69c75b2123..c5ebc31068 100644
--- a/examples/tensorflow/multiple-choice/run_swag.py
+++ b/examples/tensorflow/multiple-choice/run_swag.py
@@ -43,9 +43,8 @@ from transformers import (
     create_optimizer,
     set_seed,
 )
-from transformers.file_utils import PaddingStrategy
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-from transformers.utils import check_min_version
+from transformers.utils import PaddingStrategy, check_min_version
 
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py
index cd134a2b47..39437da467 100755
--- a/examples/tensorflow/question-answering/run_qa.py
+++ b/examples/tensorflow/question-answering/run_qa.py
@@ -41,8 +41,7 @@ from transformers import (
     TFTrainingArguments,
     set_seed,
 )
-from transformers.file_utils import CONFIG_NAME, TF2_WEIGHTS_NAME
-from transformers.utils import check_min_version
+from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version
 from utils_qa import postprocess_qa_predictions
 
 
diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py
index 4a754e62de..004f25157f 100644
--- a/examples/tensorflow/summarization/run_summarization.py
+++ b/examples/tensorflow/summarization/run_summarization.py
@@ -43,9 +43,8 @@ from transformers import (
     create_optimizer,
     set_seed,
 )
-from transformers.file_utils import is_offline_mode
 from transformers.trainer_utils import get_last_checkpoint
-from transformers.utils import check_min_version
+from transformers.utils import check_min_version, is_offline_mode
 from transformers.utils.versions import require_version
 
 
diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py
index f2693bb1b8..114caacdbf 100644
--- a/examples/tensorflow/text-classification/run_text_classification.py
+++ b/examples/tensorflow/text-classification/run_text_classification.py
@@ -37,7 +37,7 @@ from transformers import (
     TFTrainingArguments,
     set_seed,
 )
-from transformers.file_utils import CONFIG_NAME, TF2_WEIGHTS_NAME
+from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME
 
 
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"  # Reduce the amount of console output from TF
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index c4f2dffe7c..956db3bfa3 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -28,7 +28,7 @@ from typing import TYPE_CHECKING
 
 # Check the dependencies satisfy the minimal versions required.
 from . import dependency_versions_check
-from .file_utils import (
+from .utils import (
     _LazyModule,
     is_flax_available,
     is_scatter_available,
@@ -39,8 +39,8 @@ from .file_utils import (
     is_tokenizers_available,
     is_torch_available,
     is_vision_available,
+    logging,
 )
-from .utils import logging
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -94,39 +94,7 @@ _import_structure = {
     "dynamic_module_utils": [],
     "feature_extraction_sequence_utils": ["SequenceFeatureExtractor"],
     "feature_extraction_utils": ["BatchFeature", "FeatureExtractionMixin"],
-    "file_utils": [
-        "CONFIG_NAME",
-        "MODEL_CARD_NAME",
-        "PYTORCH_PRETRAINED_BERT_CACHE",
-        "PYTORCH_TRANSFORMERS_CACHE",
-        "SPIECE_UNDERLINE",
-        "TF2_WEIGHTS_NAME",
-        "TF_WEIGHTS_NAME",
-        "TRANSFORMERS_CACHE",
-        "WEIGHTS_NAME",
-        "TensorType",
-        "add_end_docstrings",
-        "add_start_docstrings",
-        "cached_path",
-        "is_apex_available",
-        "is_datasets_available",
-        "is_faiss_available",
-        "is_flax_available",
-        "is_phonemizer_available",
-        "is_psutil_available",
-        "is_py3nvml_available",
-        "is_pyctcdecode_available",
-        "is_scipy_available",
-        "is_sentencepiece_available",
-        "is_sklearn_available",
-        "is_speech_available",
-        "is_tf_available",
-        "is_timm_available",
-        "is_tokenizers_available",
-        "is_torch_available",
-        "is_torch_tpu_available",
-        "is_vision_available",
-    ],
+    "file_utils": [],
     "hf_argparser": ["HfArgumentParser"],
     "integrations": [
         "is_comet_available",
@@ -147,8 +115,8 @@ _import_structure = {
         "load_tf2_model_in_pytorch_model",
         "load_tf2_weights_in_pytorch_model",
     ],
-    # Models
     "models": [],
+    # Models
     "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"],
     "models.auto": [
         "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -396,7 +364,40 @@ _import_structure = {
     "training_args": ["TrainingArguments"],
     "training_args_seq2seq": ["Seq2SeqTrainingArguments"],
     "training_args_tf": ["TFTrainingArguments"],
-    "utils": ["logging"],
+    "utils": [
+        "CONFIG_NAME",
+        "MODEL_CARD_NAME",
+        "PYTORCH_PRETRAINED_BERT_CACHE",
+        "PYTORCH_TRANSFORMERS_CACHE",
+        "SPIECE_UNDERLINE",
+        "TF2_WEIGHTS_NAME",
+        "TF_WEIGHTS_NAME",
+        "TRANSFORMERS_CACHE",
+        "WEIGHTS_NAME",
+        "TensorType",
+        "add_end_docstrings",
+        "add_start_docstrings",
+        "cached_path",
+        "is_apex_available",
+        "is_datasets_available",
+        "is_faiss_available",
+        "is_flax_available",
+        "is_phonemizer_available",
+        "is_psutil_available",
+        "is_py3nvml_available",
+        "is_pyctcdecode_available",
+        "is_scipy_available",
+        "is_sentencepiece_available",
+        "is_sklearn_available",
+        "is_speech_available",
+        "is_tf_available",
+        "is_timm_available",
+        "is_tokenizers_available",
+        "is_torch_available",
+        "is_torch_tpu_available",
+        "is_vision_available",
+        "logging",
+    ],
 }
 
 # sentencepiece-backed objects
@@ -2432,41 +2433,6 @@ if TYPE_CHECKING:
 
     # Feature Extractor
     from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-
-    # Files and general utilities
-    from .file_utils import (
-        CONFIG_NAME,
-        MODEL_CARD_NAME,
-        PYTORCH_PRETRAINED_BERT_CACHE,
-        PYTORCH_TRANSFORMERS_CACHE,
-        SPIECE_UNDERLINE,
-        TF2_WEIGHTS_NAME,
-        TF_WEIGHTS_NAME,
-        TRANSFORMERS_CACHE,
-        WEIGHTS_NAME,
-        TensorType,
-        add_end_docstrings,
-        add_start_docstrings,
-        cached_path,
-        is_apex_available,
-        is_datasets_available,
-        is_faiss_available,
-        is_flax_available,
-        is_phonemizer_available,
-        is_psutil_available,
-        is_py3nvml_available,
-        is_pyctcdecode_available,
-        is_scipy_available,
-        is_sentencepiece_available,
-        is_sklearn_available,
-        is_speech_available,
-        is_tf_available,
-        is_timm_available,
-        is_tokenizers_available,
-        is_torch_available,
-        is_torch_tpu_available,
-        is_vision_available,
-    )
     from .hf_argparser import HfArgumentParser
 
     # Integrations
@@ -2714,7 +2680,42 @@ if TYPE_CHECKING:
     from .training_args import TrainingArguments
     from .training_args_seq2seq import Seq2SeqTrainingArguments
     from .training_args_tf import TFTrainingArguments
-    from .utils import logging
+
+    # Files and general utilities
+    from .utils import (
+        CONFIG_NAME,
+        MODEL_CARD_NAME,
+        PYTORCH_PRETRAINED_BERT_CACHE,
+        PYTORCH_TRANSFORMERS_CACHE,
+        SPIECE_UNDERLINE,
+        TF2_WEIGHTS_NAME,
+        TF_WEIGHTS_NAME,
+        TRANSFORMERS_CACHE,
+        WEIGHTS_NAME,
+        TensorType,
+        add_end_docstrings,
+        add_start_docstrings,
+        cached_path,
+        is_apex_available,
+        is_datasets_available,
+        is_faiss_available,
+        is_flax_available,
+        is_phonemizer_available,
+        is_psutil_available,
+        is_py3nvml_available,
+        is_pyctcdecode_available,
+        is_scipy_available,
+        is_sentencepiece_available,
+        is_sklearn_available,
+        is_speech_available,
+        is_tf_available,
+        is_timm_available,
+        is_tokenizers_available,
+        is_torch_available,
+        is_torch_tpu_available,
+        is_vision_available,
+        logging,
+    )
 
     if is_sentencepiece_available():
         from .models.albert import AlbertTokenizer
diff --git a/src/transformers/benchmark/benchmark.py b/src/transformers/benchmark/benchmark.py
index a07e0299ef..8569c6e324 100644
--- a/src/transformers/benchmark/benchmark.py
+++ b/src/transformers/benchmark/benchmark.py
@@ -22,9 +22,8 @@ import timeit
 from typing import Callable, Optional
 
 from ..configuration_utils import PretrainedConfig
-from ..file_utils import is_py3nvml_available, is_torch_available
 from ..models.auto.modeling_auto import MODEL_MAPPING, MODEL_WITH_LM_HEAD_MAPPING
-from ..utils import logging
+from ..utils import is_py3nvml_available, is_torch_available, logging
 from .benchmark_utils import (
     Benchmark,
     Memory,
diff --git a/src/transformers/benchmark/benchmark_args.py b/src/transformers/benchmark/benchmark_args.py
index 40bf47da83..dbdf9d8a36 100644
--- a/src/transformers/benchmark/benchmark_args.py
+++ b/src/transformers/benchmark/benchmark_args.py
@@ -17,8 +17,7 @@
 from dataclasses import dataclass, field
 from typing import Tuple
 
-from ..file_utils import cached_property, is_torch_available, is_torch_tpu_available, torch_required
-from ..utils import logging
+from ..utils import cached_property, is_torch_available, is_torch_tpu_available, logging, torch_required
 from .benchmark_args_utils import BenchmarkArguments
 
 
diff --git a/src/transformers/benchmark/benchmark_args_tf.py b/src/transformers/benchmark/benchmark_args_tf.py
index 06217882ec..7ec5054cb3 100644
--- a/src/transformers/benchmark/benchmark_args_tf.py
+++ b/src/transformers/benchmark/benchmark_args_tf.py
@@ -17,8 +17,7 @@
 from dataclasses import dataclass, field
 from typing import Tuple
 
-from ..file_utils import cached_property, is_tf_available, tf_required
-from ..utils import logging
+from ..utils import cached_property, is_tf_available, logging, tf_required
 from .benchmark_args_utils import BenchmarkArguments
 
 
diff --git a/src/transformers/benchmark/benchmark_tf.py b/src/transformers/benchmark/benchmark_tf.py
index b5b2c2109e..0eb0db64a8 100644
--- a/src/transformers/benchmark/benchmark_tf.py
+++ b/src/transformers/benchmark/benchmark_tf.py
@@ -24,9 +24,8 @@ from functools import wraps
 from typing import Callable, Optional
 
 from ..configuration_utils import PretrainedConfig
-from ..file_utils import is_py3nvml_available, is_tf_available
 from ..models.auto.modeling_tf_auto import TF_MODEL_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING
-from ..utils import logging
+from ..utils import is_py3nvml_available, is_tf_available, logging
 from .benchmark_utils import (
     Benchmark,
     Memory,
diff --git a/src/transformers/benchmark/benchmark_utils.py b/src/transformers/benchmark/benchmark_utils.py
index dfddd41c52..7e738bb601 100644
--- a/src/transformers/benchmark/benchmark_utils.py
+++ b/src/transformers/benchmark/benchmark_utils.py
@@ -33,8 +33,7 @@ from typing import Callable, Iterable, List, NamedTuple, Optional, Union
 
 from .. import AutoConfig, PretrainedConfig
 from .. import __version__ as version
-from ..file_utils import is_psutil_available, is_py3nvml_available, is_tf_available, is_torch_available
-from ..utils import logging
+from ..utils import is_psutil_available, is_py3nvml_available, is_tf_available, is_torch_available, logging
 from .benchmark_args_utils import BenchmarkArguments
 
 
diff --git a/src/transformers/commands/add_new_model_like.py b/src/transformers/commands/add_new_model_like.py
index c7eafd7d87..d8eece141b 100644
--- a/src/transformers/commands/add_new_model_like.py
+++ b/src/transformers/commands/add_new_model_like.py
@@ -25,8 +25,7 @@ from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, Union
 import transformers.models.auto as auto_module
 from transformers.models.auto.configuration_auto import model_type_to_module_name
 
-from ..file_utils import is_flax_available, is_tf_available, is_torch_available
-from ..utils import logging
+from ..utils import is_flax_available, is_tf_available, is_torch_available, logging
 from . import BaseTransformersCLICommand
 
 
diff --git a/src/transformers/commands/env.py b/src/transformers/commands/env.py
index 5eb681de08..fac0204f8c 100644
--- a/src/transformers/commands/env.py
+++ b/src/transformers/commands/env.py
@@ -18,7 +18,7 @@ from argparse import ArgumentParser
 import huggingface_hub
 
 from .. import __version__ as version
-from ..file_utils import is_flax_available, is_tf_available, is_torch_available
+from ..utils import is_flax_available, is_tf_available, is_torch_available
 from . import BaseTransformersCLICommand
 
 
diff --git a/src/transformers/commands/train.py b/src/transformers/commands/train.py
index 03c8547ed1..e0071608c0 100644
--- a/src/transformers/commands/train.py
+++ b/src/transformers/commands/train.py
@@ -16,9 +16,8 @@ import os
 from argparse import ArgumentParser, Namespace
 
 from ..data import SingleSentenceClassificationProcessor as Processor
-from ..file_utils import is_tf_available, is_torch_available
 from ..pipelines import TextClassificationPipeline
-from ..utils import logging
+from ..utils import is_tf_available, is_torch_available, logging
 from . import BaseTransformersCLICommand
 
 
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 924ddf04c9..76d01e0a9e 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -29,7 +29,7 @@ from requests import HTTPError
 
 from . import __version__
 from .dynamic_module_utils import custom_object_save
-from .file_utils import (
+from .utils import (
     CONFIG_NAME,
     EntryNotFoundError,
     PushToHubMixin,
@@ -41,8 +41,8 @@ from .file_utils import (
     is_offline_mode,
     is_remote_url,
     is_torch_available,
+    logging,
 )
-from .utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py
index bcc850b26a..2647cbd869 100644
--- a/src/transformers/convert_graph_to_onnx.py
+++ b/src/transformers/convert_graph_to_onnx.py
@@ -20,9 +20,9 @@ from typing import Dict, List, Optional, Tuple
 
 from packaging.version import Version, parse
 
-from transformers.file_utils import ModelOutput, is_tf_available, is_torch_available
 from transformers.pipelines import Pipeline, pipeline
 from transformers.tokenization_utils import BatchEncoding
+from transformers.utils import ModelOutput, is_tf_available, is_torch_available
 
 
 # This is the minimal required version to
diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
index 4dc8daf283..e083a905d7 100755
--- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -95,8 +95,7 @@ from . import (
     is_torch_available,
     load_pytorch_checkpoint_in_tf2_model,
 )
-from .file_utils import hf_bucket_url
-from .utils import logging
+from .utils import hf_bucket_url, logging
 
 
 if is_torch_available():
diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index 92f25bd0c8..3428299b46 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -24,7 +24,7 @@ from typing import Dict, List, Tuple
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import BPE, Unigram, WordPiece
 
-from .file_utils import requires_backends
+from .utils import requires_backends
 
 
 class SentencePieceExtractor:
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index 7a3424ac21..c23c8b221c 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -17,9 +17,9 @@ import warnings
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
 
-from ..file_utils import PaddingStrategy
 from ..models.bert import BertTokenizer, BertTokenizerFast
 from ..tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
+from ..utils import PaddingStrategy
 
 
 InputDataClass = NewType("InputDataClass", Any)
diff --git a/src/transformers/data/metrics/__init__.py b/src/transformers/data/metrics/__init__.py
index 42a6227cd6..c9d3ae5477 100644
--- a/src/transformers/data/metrics/__init__.py
+++ b/src/transformers/data/metrics/__init__.py
@@ -16,7 +16,7 @@
 
 import warnings
 
-from ...file_utils import is_sklearn_available, requires_backends
+from ...utils import is_sklearn_available, requires_backends
 
 
 if is_sklearn_available():
diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py
index 4f4075f932..749f15cb0c 100644
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
@@ -21,9 +21,8 @@ from dataclasses import asdict
 from enum import Enum
 from typing import List, Optional, Union
 
-from ...file_utils import is_tf_available
 from ...tokenization_utils import PreTrainedTokenizer
-from ...utils import logging
+from ...utils import is_tf_available, logging
 from .utils import DataProcessor, InputExample, InputFeatures
 
 
diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index f5e710f5ed..bf8ef6aecf 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -20,10 +20,9 @@ from multiprocessing import Pool, cpu_count
 import numpy as np
 from tqdm import tqdm
 
-from ...file_utils import is_tf_available, is_torch_available
 from ...models.bert.tokenization_bert import whitespace_tokenize
 from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
-from ...utils import logging
+from ...utils import is_tf_available, is_torch_available, logging
 from .utils import DataProcessor
 
 
diff --git a/src/transformers/data/processors/utils.py b/src/transformers/data/processors/utils.py
index 8bc39d0a73..b403894d4c 100644
--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py
@@ -20,8 +20,7 @@ import json
 from dataclasses import dataclass
 from typing import List, Optional, Union
 
-from ...file_utils import is_tf_available, is_torch_available
-from ...utils import logging
+from ...utils import is_tf_available, is_torch_available, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/data/test_generation_utils.py b/src/transformers/data/test_generation_utils.py
index ae2f7ccc92..b08dd88026 100644
--- a/src/transformers/data/test_generation_utils.py
+++ b/src/transformers/data/test_generation_utils.py
@@ -17,8 +17,8 @@ import unittest
 
 import timeout_decorator
 
-from ..file_utils import cached_property, is_torch_available
 from ..testing_utils import require_torch
+from ..utils import cached_property, is_torch_available
 
 
 if is_torch_available():
diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py
index 39780226d2..da266ac571 100644
--- a/src/transformers/debug_utils.py
+++ b/src/transformers/debug_utils.py
@@ -14,8 +14,7 @@
 
 import collections
 
-from .file_utils import ExplicitEnum, is_torch_available
-from .utils import logging
+from .utils import ExplicitEnum, is_torch_available, logging
 
 
 if is_torch_available():
diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py
index 993cf5d399..8fc0d6a0b6 100644
--- a/src/transformers/deepspeed.py
+++ b/src/transformers/deepspeed.py
@@ -23,8 +23,7 @@ from copy import deepcopy
 from functools import partialmethod
 
 from .dependency_versions_check import dep_version_check
-from .file_utils import is_torch_available
-from .utils import logging
+from .utils import is_torch_available, logging
 
 
 if is_torch_available():
diff --git a/src/transformers/dependency_versions_check.py b/src/transformers/dependency_versions_check.py
index e6e676481d..bd3b65f724 100644
--- a/src/transformers/dependency_versions_check.py
+++ b/src/transformers/dependency_versions_check.py
@@ -33,7 +33,7 @@ for pkg in pkgs_to_check_at_runtime:
     if pkg in deps:
         if pkg == "tokenizers":
             # must be loaded here, or else tqdm check may fail
-            from .file_utils import is_tokenizers_available
+            from .utils import is_tokenizers_available
 
             if not is_tokenizers_available():
                 continue  # not required, check version only if installed
diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py
index f995f42d03..7670c4b668 100644
--- a/src/transformers/dynamic_module_utils.py
+++ b/src/transformers/dynamic_module_utils.py
@@ -24,8 +24,14 @@ from typing import Dict, Optional, Union
 
 from huggingface_hub import HfFolder, model_info
 
-from .file_utils import HF_MODULES_CACHE, TRANSFORMERS_DYNAMIC_MODULE_NAME, cached_path, hf_bucket_url, is_offline_mode
-from .utils import logging
+from .utils import (
+    HF_MODULES_CACHE,
+    TRANSFORMERS_DYNAMIC_MODULE_NAME,
+    cached_path,
+    hf_bucket_url,
+    is_offline_mode,
+    logging,
+)
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py
index ffd04eae59..289f6c558a 100644
--- a/src/transformers/feature_extraction_sequence_utils.py
+++ b/src/transformers/feature_extraction_sequence_utils.py
@@ -20,16 +20,8 @@ from typing import Dict, List, Optional, Union
 import numpy as np
 
 from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from .file_utils import (
-    PaddingStrategy,
-    TensorType,
-    _is_tensorflow,
-    _is_torch,
-    is_tf_available,
-    is_torch_available,
-    to_numpy,
-)
-from .utils import logging
+from .utils import PaddingStrategy, TensorType, is_tf_available, is_torch_available, logging, to_numpy
+from .utils.generic import _is_tensorflow, _is_torch
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index 49b633a02b..4ce20671bf 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -27,16 +27,13 @@ import numpy as np
 from requests import HTTPError
 
 from .dynamic_module_utils import custom_object_save
-from .file_utils import (
+from .utils import (
     FEATURE_EXTRACTOR_NAME,
     EntryNotFoundError,
     PushToHubMixin,
     RepositoryNotFoundError,
     RevisionNotFoundError,
     TensorType,
-    _is_jax,
-    _is_numpy,
-    _is_torch_device,
     cached_path,
     copy_func,
     hf_bucket_url,
@@ -45,9 +42,10 @@ from .file_utils import (
     is_remote_url,
     is_tf_available,
     is_torch_available,
+    logging,
     torch_required,
 )
-from .utils import logging
+from .utils.generic import _is_jax, _is_numpy, _is_torch_device
 
 
 if TYPE_CHECKING:
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 3065cb875a..c71a9a9b85 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -1,4 +1,8 @@
-# Copyright 2020 The HuggingFace Team, the AllenNLP library authors. All rights reserved.
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2020 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,3042 +16,128 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Utilities for working with the local dataset cache. Parts of this file is adapted from the AllenNLP library at
-https://github.com/allenai/allennlp.
+File utilities: utilities related to download and cache models
+
+This module should not be update anymore and is only left for backward compatibility.
 """
-import copy
-import fnmatch
-import functools
-import importlib.util
-import io
-import json
-import os
-import re
-import shutil
-import subprocess
-import sys
-import tarfile
-import tempfile
-import types
-import warnings
-from collections import OrderedDict, UserDict
-from contextlib import ExitStack, contextmanager
-from dataclasses import fields
-from enum import Enum
-from functools import partial, wraps
-from hashlib import sha256
-from itertools import chain
-from pathlib import Path
-from types import ModuleType
-from typing import Any, BinaryIO, ContextManager, Dict, List, Optional, Tuple, Union
-from urllib.parse import urlparse
-from uuid import uuid4
-from zipfile import ZipFile, is_zipfile
-
-import numpy as np
-from packaging import version
-
-import requests
-from filelock import FileLock
-from huggingface_hub import HfFolder, Repository, create_repo, list_repo_files, whoami
-from requests.exceptions import HTTPError
-from transformers.utils.logging import tqdm
-from transformers.utils.versions import importlib_metadata
 
 from . import __version__
-from .utils import logging
 
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
-ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
-
-USE_TF = os.environ.get("USE_TF", "AUTO").upper()
-USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
-USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper()
-
-if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
-    _torch_available = importlib.util.find_spec("torch") is not None
-    if _torch_available:
-        try:
-            _torch_version = importlib_metadata.version("torch")
-            logger.info(f"PyTorch version {_torch_version} available.")
-        except importlib_metadata.PackageNotFoundError:
-            _torch_available = False
-else:
-    logger.info("Disabling PyTorch because USE_TF is set")
-    _torch_available = False
-
-
-if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
-    _tf_available = importlib.util.find_spec("tensorflow") is not None
-    if _tf_available:
-        candidates = (
-            "tensorflow",
-            "tensorflow-cpu",
-            "tensorflow-gpu",
-            "tf-nightly",
-            "tf-nightly-cpu",
-            "tf-nightly-gpu",
-            "intel-tensorflow",
-            "intel-tensorflow-avx512",
-            "tensorflow-rocm",
-            "tensorflow-macos",
-        )
-        _tf_version = None
-        # For the metadata, we have to look for both tensorflow and tensorflow-cpu
-        for pkg in candidates:
-            try:
-                _tf_version = importlib_metadata.version(pkg)
-                break
-            except importlib_metadata.PackageNotFoundError:
-                pass
-        _tf_available = _tf_version is not None
-    if _tf_available:
-        if version.parse(_tf_version) < version.parse("2"):
-            logger.info(f"TensorFlow found but with version {_tf_version}. Transformers requires version 2 minimum.")
-            _tf_available = False
-        else:
-            logger.info(f"TensorFlow version {_tf_version} available.")
-else:
-    logger.info("Disabling Tensorflow because USE_TORCH is set")
-    _tf_available = False
-
-
-if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
-    _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
-    if _flax_available:
-        try:
-            _jax_version = importlib_metadata.version("jax")
-            _flax_version = importlib_metadata.version("flax")
-            logger.info(f"JAX version {_jax_version}, Flax version {_flax_version} available.")
-        except importlib_metadata.PackageNotFoundError:
-            _flax_available = False
-else:
-    _flax_available = False
-
-
-_datasets_available = importlib.util.find_spec("datasets") is not None
-try:
-    # Check we're not importing a "datasets" directory somewhere but the actual library by trying to grab the version
-    # AND checking it has an author field in the metadata that is HuggingFace.
-    _ = importlib_metadata.version("datasets")
-    _datasets_metadata = importlib_metadata.metadata("datasets")
-    if _datasets_metadata.get("author", "") != "HuggingFace Inc.":
-        _datasets_available = False
-except importlib_metadata.PackageNotFoundError:
-    _datasets_available = False
-
-
-_detectron2_available = importlib.util.find_spec("detectron2") is not None
-try:
-    _detectron2_version = importlib_metadata.version("detectron2")
-    logger.debug(f"Successfully imported detectron2 version {_detectron2_version}")
-except importlib_metadata.PackageNotFoundError:
-    _detectron2_available = False
-
-
-_faiss_available = importlib.util.find_spec("faiss") is not None
-try:
-    _faiss_version = importlib_metadata.version("faiss")
-    logger.debug(f"Successfully imported faiss version {_faiss_version}")
-except importlib_metadata.PackageNotFoundError:
-    try:
-        _faiss_version = importlib_metadata.version("faiss-cpu")
-        logger.debug(f"Successfully imported faiss version {_faiss_version}")
-    except importlib_metadata.PackageNotFoundError:
-        _faiss_available = False
-
-_ftfy_available = importlib.util.find_spec("ftfy") is not None
-try:
-    _ftfy_version = importlib_metadata.version("ftfy")
-    logger.debug(f"Successfully imported ftfy version {_ftfy_version}")
-except importlib_metadata.PackageNotFoundError:
-    _ftfy_available = False
-
-
-coloredlogs = importlib.util.find_spec("coloredlogs") is not None
-try:
-    _coloredlogs_available = importlib_metadata.version("coloredlogs")
-    logger.debug(f"Successfully imported sympy version {_coloredlogs_available}")
-except importlib_metadata.PackageNotFoundError:
-    _coloredlogs_available = False
-
-
-sympy_available = importlib.util.find_spec("sympy") is not None
-try:
-    _sympy_available = importlib_metadata.version("sympy")
-    logger.debug(f"Successfully imported sympy version {_sympy_available}")
-except importlib_metadata.PackageNotFoundError:
-    _sympy_available = False
-
-
-_tf2onnx_available = importlib.util.find_spec("tf2onnx") is not None
-try:
-    _tf2onnx_version = importlib_metadata.version("tf2onnx")
-    logger.debug(f"Successfully imported tf2onnx version {_tf2onnx_version}")
-except importlib_metadata.PackageNotFoundError:
-    _tf2onnx_available = False
-
-_onnx_available = importlib.util.find_spec("onnxruntime") is not None
-try:
-    _onxx_version = importlib_metadata.version("onnx")
-    logger.debug(f"Successfully imported onnx version {_onxx_version}")
-except importlib_metadata.PackageNotFoundError:
-    _onnx_available = False
-
-
-_scatter_available = importlib.util.find_spec("torch_scatter") is not None
-try:
-    _scatter_version = importlib_metadata.version("torch_scatter")
-    logger.debug(f"Successfully imported torch-scatter version {_scatter_version}")
-except importlib_metadata.PackageNotFoundError:
-    _scatter_available = False
-
-
-_pytorch_quantization_available = importlib.util.find_spec("pytorch_quantization") is not None
-try:
-    _pytorch_quantization_version = importlib_metadata.version("pytorch_quantization")
-    logger.debug(f"Successfully imported pytorch-quantization version {_pytorch_quantization_version}")
-except importlib_metadata.PackageNotFoundError:
-    _pytorch_quantization_available = False
-
-
-_soundfile_available = importlib.util.find_spec("soundfile") is not None
-try:
-    _soundfile_version = importlib_metadata.version("soundfile")
-    logger.debug(f"Successfully imported soundfile version {_soundfile_version}")
-except importlib_metadata.PackageNotFoundError:
-    _soundfile_available = False
-
-
-_tensorflow_probability_available = importlib.util.find_spec("tensorflow_probability") is not None
-try:
-    _tensorflow_probability_version = importlib_metadata.version("tensorflow_probability")
-    logger.debug(f"Successfully imported tensorflow-probability version {_tensorflow_probability_version}")
-except importlib_metadata.PackageNotFoundError:
-    _tensorflow_probability_available = False
-
-
-_timm_available = importlib.util.find_spec("timm") is not None
-try:
-    _timm_version = importlib_metadata.version("timm")
-    logger.debug(f"Successfully imported timm version {_timm_version}")
-except importlib_metadata.PackageNotFoundError:
-    _timm_available = False
-
-
-_torchaudio_available = importlib.util.find_spec("torchaudio") is not None
-try:
-    _torchaudio_version = importlib_metadata.version("torchaudio")
-    logger.debug(f"Successfully imported torchaudio version {_torchaudio_version}")
-except importlib_metadata.PackageNotFoundError:
-    _torchaudio_available = False
-
-
-_phonemizer_available = importlib.util.find_spec("phonemizer") is not None
-try:
-    _phonemizer_version = importlib_metadata.version("phonemizer")
-    logger.debug(f"Successfully imported phonemizer version {_phonemizer_version}")
-except importlib_metadata.PackageNotFoundError:
-    _phonemizer_available = False
-
-
-_pyctcdecode_available = importlib.util.find_spec("pyctcdecode") is not None
-try:
-    _pyctcdecode_version = importlib_metadata.version("pyctcdecode")
-    logger.debug(f"Successfully imported pyctcdecode version {_pyctcdecode_version}")
-except importlib_metadata.PackageNotFoundError:
-    _pyctcdecode_available = False
-
-
-_librosa_available = importlib.util.find_spec("librosa") is not None
-try:
-    _librosa_version = importlib_metadata.version("librosa")
-    logger.debug(f"Successfully imported librosa version {_librosa_version}")
-except importlib_metadata.PackageNotFoundError:
-    _librosa_available = False
-
-
-torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
-old_default_cache_path = os.path.join(torch_cache_home, "transformers")
-# New default cache, shared with the Datasets library
-hf_cache_home = os.path.expanduser(
-    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
+# Backward compatibility imports, to make sure all those objects can be found in file_utils
+from .utils import (
+    CLOUDFRONT_DISTRIB_PREFIX,
+    CONFIG_NAME,
+    DISABLE_TELEMETRY,
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    ENV_VARS_TRUE_AND_AUTO_VALUES,
+    ENV_VARS_TRUE_VALUES,
+    FEATURE_EXTRACTOR_NAME,
+    FLAX_WEIGHTS_NAME,
+    HF_MODULES_CACHE,
+    HUGGINGFACE_CO_PREFIX,
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    MODEL_CARD_NAME,
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    PYTORCH_PRETRAINED_BERT_CACHE,
+    PYTORCH_TRANSFORMERS_CACHE,
+    S3_BUCKET_PREFIX,
+    SENTENCEPIECE_UNDERLINE,
+    SPIECE_UNDERLINE,
+    TF2_WEIGHTS_NAME,
+    TF_WEIGHTS_NAME,
+    TORCH_FX_REQUIRED_VERSION,
+    TRANSFORMERS_CACHE,
+    TRANSFORMERS_DYNAMIC_MODULE_NAME,
+    USE_JAX,
+    USE_TF,
+    USE_TORCH,
+    WEIGHTS_NAME,
+    ContextManagers,
+    DummyObject,
+    EntryNotFoundError,
+    ExplicitEnum,
+    ModelOutput,
+    PaddingStrategy,
+    PushToHubMixin,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+    TensorType,
+    _LazyModule,
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    cached_path,
+    cached_property,
+    copy_func,
+    default_cache_path,
+    define_sagemaker_information,
+    filename_to_url,
+    get_cached_models,
+    get_file_from_repo,
+    get_from_cache,
+    get_full_repo_name,
+    get_list_of_files,
+    has_file,
+    hf_bucket_url,
+    http_get,
+    http_user_agent,
+    is_apex_available,
+    is_coloredlogs_available,
+    is_datasets_available,
+    is_detectron2_available,
+    is_faiss_available,
+    is_flax_available,
+    is_ftfy_available,
+    is_in_notebook,
+    is_librosa_available,
+    is_local_clone,
+    is_offline_mode,
+    is_onnx_available,
+    is_pandas_available,
+    is_phonemizer_available,
+    is_protobuf_available,
+    is_psutil_available,
+    is_py3nvml_available,
+    is_pyctcdecode_available,
+    is_pytesseract_available,
+    is_pytorch_quantization_available,
+    is_remote_url,
+    is_rjieba_available,
+    is_sagemaker_dp_enabled,
+    is_sagemaker_mp_enabled,
+    is_scatter_available,
+    is_scipy_available,
+    is_sentencepiece_available,
+    is_sklearn_available,
+    is_soundfile_availble,
+    is_spacy_available,
+    is_speech_available,
+    is_tensor,
+    is_tensorflow_probability_available,
+    is_tf2onnx_available,
+    is_tf_available,
+    is_timm_available,
+    is_tokenizers_available,
+    is_torch_available,
+    is_torch_bf16_available,
+    is_torch_cuda_available,
+    is_torch_fx_available,
+    is_torch_fx_proxy,
+    is_torch_onnx_dict_inputs_support_available,
+    is_torch_tf32_available,
+    is_torch_tpu_available,
+    is_torchaudio_available,
+    is_training_run_on_sagemaker,
+    is_vision_available,
+    replace_return_docstrings,
+    requires_backends,
+    tf_required,
+    to_numpy,
+    to_py_obj,
+    torch_only_method,
+    torch_required,
+    torch_version,
+    url_to_filename,
 )
-default_cache_path = os.path.join(hf_cache_home, "transformers")
-
-# Onetime move from the old location to the new one if no ENV variable has been set.
-if (
-    os.path.isdir(old_default_cache_path)
-    and not os.path.isdir(default_cache_path)
-    and "PYTORCH_PRETRAINED_BERT_CACHE" not in os.environ
-    and "PYTORCH_TRANSFORMERS_CACHE" not in os.environ
-    and "TRANSFORMERS_CACHE" not in os.environ
-):
-    logger.warning(
-        "In Transformers v4.0.0, the default path to cache downloaded models changed from "
-        "'~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden "
-        "and '~/.cache/torch/transformers' is a directory that exists, we're moving it to "
-        "'~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should "
-        "only see this message once."
-    )
-    shutil.move(old_default_cache_path, default_cache_path)
-
-PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
-PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
-TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE)
-HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules"))
-TRANSFORMERS_DYNAMIC_MODULE_NAME = "transformers_modules"
-SESSION_ID = uuid4().hex
-DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", False) in ENV_VARS_TRUE_VALUES
-
-WEIGHTS_NAME = "pytorch_model.bin"
-TF2_WEIGHTS_NAME = "tf_model.h5"
-TF_WEIGHTS_NAME = "model.ckpt"
-FLAX_WEIGHTS_NAME = "flax_model.msgpack"
-CONFIG_NAME = "config.json"
-FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
-MODEL_CARD_NAME = "modelcard.json"
-
-SENTENCEPIECE_UNDERLINE = "▁"
-SPIECE_UNDERLINE = SENTENCEPIECE_UNDERLINE  # Kept for backward compatibility
-
-MULTIPLE_CHOICE_DUMMY_INPUTS = [
-    [[0, 1, 0, 1], [1, 0, 0, 1]]
-] * 2  # Needs to have 0s and 1s only since XLM uses it for langs too.
-DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
-
-S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
-CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co"
-
-_staging_mode = os.environ.get("HUGGINGFACE_CO_STAGING", "NO").upper() in ENV_VARS_TRUE_VALUES
-_default_endpoint = "https://moon-staging.huggingface.co" if _staging_mode else "https://huggingface.co"
-
-HUGGINGFACE_CO_RESOLVE_ENDPOINT = _default_endpoint
-if os.environ.get("HUGGINGFACE_CO_RESOLVE_ENDPOINT", None) is not None:
-    warnings.warn(
-        "Using the environment variable `HUGGINGFACE_CO_RESOLVE_ENDPOINT` is deprecated and will be removed in "
-        "Transformers v5. Use `HF_ENDPOINT` instead.",
-        FutureWarning,
-    )
-    HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HUGGINGFACE_CO_RESOLVE_ENDPOINT", None)
-HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HF_ENDPOINT", HUGGINGFACE_CO_RESOLVE_ENDPOINT)
-HUGGINGFACE_CO_PREFIX = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/{model_id}/resolve/{revision}/{filename}"
-
-# This is the version of torch required to run torch.fx features and torch.onnx with dictionary inputs.
-TORCH_FX_REQUIRED_VERSION = version.parse("1.10")
-TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION = version.parse("1.8")
-
-_is_offline_mode = True if os.environ.get("TRANSFORMERS_OFFLINE", "0").upper() in ENV_VARS_TRUE_VALUES else False
-
-
-def is_offline_mode():
-    return _is_offline_mode
-
-
-def is_torch_available():
-    return _torch_available
-
-
-def is_pyctcdecode_available():
-    return _pyctcdecode_available
-
-
-def is_librosa_available():
-    return _librosa_available
-
-
-def is_torch_cuda_available():
-    if is_torch_available():
-        import torch
-
-        return torch.cuda.is_available()
-    else:
-        return False
-
-
-def is_torch_bf16_available():
-    if not is_torch_available():
-        return False
-
-    import torch
-
-    # since currently no utility function is available we build our own.
-    # some bits come from https://github.com/pytorch/pytorch/blob/2289a12f21c54da93bf5d696e3f9aea83dd9c10d/torch/testing/_internal/common_cuda.py#L51
-    # with additional check for torch version
-    # to succeed:
-    # 1. the hardware needs to support bf16 (arch >= Ampere)
-    # 2. torch >= 1.10 (1.9 should be enough for AMP API has changed in 1.10, so using 1.10 as minimal)
-    # 3. CUDA >= 11
-    # 4. torch.autocast exists
-    # XXX: one problem here is that it may give invalid results on mixed gpus setup, so it's
-    # really only correct for the 0th gpu (or currently set default device if different from 0)
-
-    if not torch.cuda.is_available() or torch.version.cuda is None:
-        return False
-    if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
-        return False
-    if int(torch.version.cuda.split(".")[0]) < 11:
-        return False
-    if version.parse(torch.__version__) < version.parse("1.10"):
-        return False
-    if not hasattr(torch, "autocast"):
-        return False
-
-    return True
-
-
-def is_torch_tf32_available():
-    if not is_torch_available():
-        return False
-
-    import torch
-
-    if not torch.cuda.is_available() or torch.version.cuda is None:
-        return False
-    if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
-        return False
-    if int(torch.version.cuda.split(".")[0]) < 11:
-        return False
-    if version.parse(torch.__version__) < version.parse("1.7"):
-        return False
-
-    return True
-
-
-_torch_fx_available = _torch_onnx_dict_inputs_support_available = False
-if _torch_available:
-    torch_version = version.parse(importlib_metadata.version("torch"))
-    _torch_fx_available = (torch_version.major, torch_version.minor) == (
-        TORCH_FX_REQUIRED_VERSION.major,
-        TORCH_FX_REQUIRED_VERSION.minor,
-    )
-
-    _torch_onnx_dict_inputs_support_available = torch_version >= TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION
-
-
-def is_torch_fx_available():
-    return _torch_fx_available
-
-
-def is_torch_onnx_dict_inputs_support_available():
-    return _torch_onnx_dict_inputs_support_available
-
-
-def is_tf_available():
-    return _tf_available
-
-
-def is_coloredlogs_available():
-    return _coloredlogs_available
-
-
-def is_tf2onnx_available():
-    return _tf2onnx_available
-
-
-def is_onnx_available():
-    return _onnx_available
-
-
-def is_flax_available():
-    return _flax_available
-
-
-def is_ftfy_available():
-    return _ftfy_available
-
-
-def is_torch_tpu_available():
-    if not _torch_available:
-        return False
-    # This test is probably enough, but just in case, we unpack a bit.
-    if importlib.util.find_spec("torch_xla") is None:
-        return False
-    if importlib.util.find_spec("torch_xla.core") is None:
-        return False
-    return importlib.util.find_spec("torch_xla.core.xla_model") is not None
-
-
-def is_datasets_available():
-    return _datasets_available
-
-
-def is_detectron2_available():
-    return _detectron2_available
-
-
-def is_rjieba_available():
-    return importlib.util.find_spec("rjieba") is not None
-
-
-def is_psutil_available():
-    return importlib.util.find_spec("psutil") is not None
-
-
-def is_py3nvml_available():
-    return importlib.util.find_spec("py3nvml") is not None
-
-
-def is_apex_available():
-    return importlib.util.find_spec("apex") is not None
-
-
-def is_faiss_available():
-    return _faiss_available
-
-
-def is_scipy_available():
-    return importlib.util.find_spec("scipy") is not None
-
-
-def is_sklearn_available():
-    if importlib.util.find_spec("sklearn") is None:
-        return False
-    return is_scipy_available() and importlib.util.find_spec("sklearn.metrics")
-
-
-def is_sentencepiece_available():
-    return importlib.util.find_spec("sentencepiece") is not None
-
-
-def is_protobuf_available():
-    if importlib.util.find_spec("google") is None:
-        return False
-    return importlib.util.find_spec("google.protobuf") is not None
-
-
-def is_tokenizers_available():
-    return importlib.util.find_spec("tokenizers") is not None
-
-
-def is_vision_available():
-    return importlib.util.find_spec("PIL") is not None
-
-
-def is_pytesseract_available():
-    return importlib.util.find_spec("pytesseract") is not None
-
-
-def is_spacy_available():
-    return importlib.util.find_spec("spacy") is not None
-
-
-def is_in_notebook():
-    try:
-        # Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py
-        get_ipython = sys.modules["IPython"].get_ipython
-        if "IPKernelApp" not in get_ipython().config:
-            raise ImportError("console")
-        if "VSCODE_PID" in os.environ:
-            raise ImportError("vscode")
-
-        return importlib.util.find_spec("IPython") is not None
-    except (AttributeError, ImportError, KeyError):
-        return False
-
-
-def is_scatter_available():
-    return _scatter_available
-
-
-def is_pytorch_quantization_available():
-    return _pytorch_quantization_available
-
-
-def is_tensorflow_probability_available():
-    return _tensorflow_probability_available
-
-
-def is_pandas_available():
-    return importlib.util.find_spec("pandas") is not None
-
-
-def is_sagemaker_dp_enabled():
-    # Get the sagemaker specific env variable.
-    sagemaker_params = os.getenv("SM_FRAMEWORK_PARAMS", "{}")
-    try:
-        # Parse it and check the field "sagemaker_distributed_dataparallel_enabled".
-        sagemaker_params = json.loads(sagemaker_params)
-        if not sagemaker_params.get("sagemaker_distributed_dataparallel_enabled", False):
-            return False
-    except json.JSONDecodeError:
-        return False
-    # Lastly, check if the `smdistributed` module is present.
-    return importlib.util.find_spec("smdistributed") is not None
-
-
-def is_sagemaker_mp_enabled():
-    # Get the sagemaker specific mp parameters from smp_options variable.
-    smp_options = os.getenv("SM_HP_MP_PARAMETERS", "{}")
-    try:
-        # Parse it and check the field "partitions" is included, it is required for model parallel.
-        smp_options = json.loads(smp_options)
-        if "partitions" not in smp_options:
-            return False
-    except json.JSONDecodeError:
-        return False
-
-    # Get the sagemaker specific framework parameters from mpi_options variable.
-    mpi_options = os.getenv("SM_FRAMEWORK_PARAMS", "{}")
-    try:
-        # Parse it and check the field "sagemaker_distributed_dataparallel_enabled".
-        mpi_options = json.loads(mpi_options)
-        if not mpi_options.get("sagemaker_mpi_enabled", False):
-            return False
-    except json.JSONDecodeError:
-        return False
-    # Lastly, check if the `smdistributed` module is present.
-    return importlib.util.find_spec("smdistributed") is not None
-
-
-def is_training_run_on_sagemaker():
-    return "SAGEMAKER_JOB_NAME" in os.environ
-
-
-def is_soundfile_availble():
-    return _soundfile_available
-
-
-def is_timm_available():
-    return _timm_available
-
-
-def is_torchaudio_available():
-    return _torchaudio_available
-
-
-def is_speech_available():
-    # For now this depends on torchaudio but the exact dependency might evolve in the future.
-    return _torchaudio_available
-
-
-def is_phonemizer_available():
-    return _phonemizer_available
-
-
-def torch_only_method(fn):
-    def wrapper(*args, **kwargs):
-        if not _torch_available:
-            raise ImportError(
-                "You need to install pytorch to use this method or class, "
-                "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
-            )
-        else:
-            return fn(*args, **kwargs)
-
-    return wrapper
-
-
-# docstyle-ignore
-DATASETS_IMPORT_ERROR = """
-{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
-```
-pip install datasets
-```
-In a notebook or a colab, you can install it by executing a cell with
-```
-!pip install datasets
-```
-then restarting your kernel.
-
-Note that if you have a local folder named `datasets` or a local python file named `datasets.py` in your current
-working directory, python may try to import this instead of the 🤗 Datasets library. You should rename this folder or
-that python file if that's the case.
-"""
-
-
-# docstyle-ignore
-TOKENIZERS_IMPORT_ERROR = """
-{0} requires the 🤗 Tokenizers library but it was not found in your environment. You can install it with:
-```
-pip install tokenizers
-```
-In a notebook or a colab, you can install it by executing a cell with
-```
-!pip install tokenizers
-```
-"""
-
-
-# docstyle-ignore
-SENTENCEPIECE_IMPORT_ERROR = """
-{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
-installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
-that match your environment.
-"""
-
-
-# docstyle-ignore
-PROTOBUF_IMPORT_ERROR = """
-{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the
-installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
-that match your environment.
-"""
-
-
-# docstyle-ignore
-FAISS_IMPORT_ERROR = """
-{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
-installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
-that match your environment.
-"""
-
-
-# docstyle-ignore
-PYTORCH_IMPORT_ERROR = """
-{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
-installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
-"""
-
-
-# docstyle-ignore
-SKLEARN_IMPORT_ERROR = """
-{0} requires the scikit-learn library but it was not found in your environment. You can install it with:
-```
-pip install -U scikit-learn
-```
-In a notebook or a colab, you can install it by executing a cell with
-```
-!pip install -U scikit-learn
-```
-"""
-
-
-# docstyle-ignore
-TENSORFLOW_IMPORT_ERROR = """
-{0} requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the
-installation page: https://www.tensorflow.org/install and follow the ones that match your environment.
-"""
-
-
-# docstyle-ignore
-DETECTRON2_IMPORT_ERROR = """
-{0} requires the detectron2 library but it was not found in your environment. Checkout the instructions on the
-installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones
-that match your environment.
-"""
-
-
-# docstyle-ignore
-FLAX_IMPORT_ERROR = """
-{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
-installation page: https://github.com/google/flax and follow the ones that match your environment.
-"""
-
-# docstyle-ignore
-FTFY_IMPORT_ERROR = """
-{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the
-installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones
-that match your environment.
-"""
-
-
-# docstyle-ignore
-SCATTER_IMPORT_ERROR = """
-{0} requires the torch-scatter library but it was not found in your environment. You can install it with pip as
-explained here: https://github.com/rusty1s/pytorch_scatter.
-"""
-
-# docstyle-ignore
-PYTORCH_QUANTIZATION_IMPORT_ERROR = """
-{0} requires the pytorch-quantization library but it was not found in your environment. You can install it with pip:
-`pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`
-"""
-
-# docstyle-ignore
-TENSORFLOW_PROBABILITY_IMPORT_ERROR = """
-{0} requires the tensorflow_probability library but it was not found in your environment. You can install it with pip as
-explained here: https://github.com/tensorflow/probability.
-"""
-
-
-# docstyle-ignore
-PANDAS_IMPORT_ERROR = """
-{0} requires the pandas library but it was not found in your environment. You can install it with pip as
-explained here: https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html.
-"""
-
-
-# docstyle-ignore
-PHONEMIZER_IMPORT_ERROR = """
-{0} requires the phonemizer library but it was not found in your environment. You can install it with pip:
-`pip install phonemizer`
-"""
-
-
-# docstyle-ignore
-SCIPY_IMPORT_ERROR = """
-{0} requires the scipy library but it was not found in your environment. You can install it with pip:
-`pip install scipy`
-"""
-
-
-# docstyle-ignore
-SPEECH_IMPORT_ERROR = """
-{0} requires the torchaudio library but it was not found in your environment. You can install it with pip:
-`pip install torchaudio`
-"""
-
-# docstyle-ignore
-TIMM_IMPORT_ERROR = """
-{0} requires the timm library but it was not found in your environment. You can install it with pip:
-`pip install timm`
-"""
-
-# docstyle-ignore
-VISION_IMPORT_ERROR = """
-{0} requires the PIL library but it was not found in your environment. You can install it with pip:
-`pip install pillow`
-"""
-
-
-# docstyle-ignore
-PYTESSERACT_IMPORT_ERROR = """
-{0} requires the PyTesseract library but it was not found in your environment. You can install it with pip:
-`pip install pytesseract`
-"""
-
-# docstyle-ignore
-PYCTCDECODE_IMPORT_ERROR = """
-{0} requires the pyctcdecode library but it was not found in your environment. You can install it with pip:
-`pip install pyctcdecode`
-"""
-
-
-BACKENDS_MAPPING = OrderedDict(
-    [
-        ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)),
-        ("detectron2", (is_detectron2_available, DETECTRON2_IMPORT_ERROR)),
-        ("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)),
-        ("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
-        ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
-        ("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
-        ("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
-        ("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)),
-        ("pyctcdecode", (is_pyctcdecode_available, PYCTCDECODE_IMPORT_ERROR)),
-        ("pytesseract", (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR)),
-        ("scatter", (is_scatter_available, SCATTER_IMPORT_ERROR)),
-        ("pytorch_quantization", (is_pytorch_quantization_available, PYTORCH_QUANTIZATION_IMPORT_ERROR)),
-        ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)),
-        ("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)),
-        ("speech", (is_speech_available, SPEECH_IMPORT_ERROR)),
-        ("tensorflow_probability", (is_tensorflow_probability_available, TENSORFLOW_PROBABILITY_IMPORT_ERROR)),
-        ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
-        ("timm", (is_timm_available, TIMM_IMPORT_ERROR)),
-        ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
-        ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
-        ("vision", (is_vision_available, VISION_IMPORT_ERROR)),
-        ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
-    ]
-)
-
-
-def requires_backends(obj, backends):
-    if not isinstance(backends, (list, tuple)):
-        backends = [backends]
-
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    checks = (BACKENDS_MAPPING[backend] for backend in backends)
-    failed = [msg.format(name) for available, msg in checks if not available()]
-    if failed:
-        raise ImportError("".join(failed))
-
-
-class DummyObject(type):
-    """
-    Metaclass for the dummy objects. Any class inheriting from it will return the ImportError generated by
-    `requires_backend` each time a user tries to access any method of that class.
-    """
-
-    def __getattr__(cls, key):
-        if key.startswith("_"):
-            return super().__getattr__(cls, key)
-        requires_backends(cls, cls._backends)
-
-
-def add_start_docstrings(*docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
-        return fn
-
-    return docstring_decorator
-
-
-def add_start_docstrings_to_model_forward(*docstr):
-    def docstring_decorator(fn):
-        docstring = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
-        class_name = f"[`{fn.__qualname__.split('.')[0]}`]"
-        intro = f"   The {class_name} forward method, overrides the `__call__` special method."
-        note = r"""
-
-    <Tip>
-
-    Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]
-    instance afterwards instead of this since the former takes care of running the pre and post processing steps while
-    the latter silently ignores them.
-
-    </Tip>
-"""
-
-        fn.__doc__ = intro + note + docstring
-        return fn
-
-    return docstring_decorator
-
-
-def add_end_docstrings(*docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = (fn.__doc__ if fn.__doc__ is not None else "") + "".join(docstr)
-        return fn
-
-    return docstring_decorator
-
-
-PT_RETURN_INTRODUCTION = r"""
-    Returns:
-        [`{full_output_type}`] or `tuple(torch.FloatTensor)`: A [`{full_output_type}`] or a tuple of
-        `torch.FloatTensor` (if `return_dict=False` is passed or when `config.return_dict=False`) comprising various
-        elements depending on the configuration ([`{config_class}`]) and inputs.
-
-"""
-
-
-TF_RETURN_INTRODUCTION = r"""
-    Returns:
-        [`{full_output_type}`] or `tuple(tf.Tensor)`: A [`{full_output_type}`] or a tuple of `tf.Tensor` (if
-        `return_dict=False` is passed or when `config.return_dict=False`) comprising various elements depending on the
-        configuration ([`{config_class}`]) and inputs.
-
-"""
-
-
-def _get_indent(t):
-    """Returns the indentation in the first line of t"""
-    search = re.search(r"^(\s*)\S", t)
-    return "" if search is None else search.groups()[0]
-
-
-def _convert_output_args_doc(output_args_doc):
-    """Convert output_args_doc to display properly."""
-    # Split output_arg_doc in blocks argument/description
-    indent = _get_indent(output_args_doc)
-    blocks = []
-    current_block = ""
-    for line in output_args_doc.split("\n"):
-        # If the indent is the same as the beginning, the line is the name of new arg.
-        if _get_indent(line) == indent:
-            if len(current_block) > 0:
-                blocks.append(current_block[:-1])
-            current_block = f"{line}\n"
-        else:
-            # Otherwise it's part of the description of the current arg.
-            # We need to remove 2 spaces to the indentation.
-            current_block += f"{line[2:]}\n"
-    blocks.append(current_block[:-1])
-
-    # Format each block for proper rendering
-    for i in range(len(blocks)):
-        blocks[i] = re.sub(r"^(\s+)(\S+)(\s+)", r"\1- **\2**\3", blocks[i])
-        blocks[i] = re.sub(r":\s*\n\s*(\S)", r" -- \1", blocks[i])
-
-    return "\n".join(blocks)
-
-
-def _prepare_output_docstrings(output_type, config_class, min_indent=None):
-    """
-    Prepares the return part of the docstring using `output_type`.
-    """
-    output_docstring = output_type.__doc__
-
-    # Remove the head of the docstring to keep the list of args only
-    lines = output_docstring.split("\n")
-    i = 0
-    while i < len(lines) and re.search(r"^\s*(Args|Parameters):\s*$", lines[i]) is None:
-        i += 1
-    if i < len(lines):
-        params_docstring = "\n".join(lines[(i + 1) :])
-        params_docstring = _convert_output_args_doc(params_docstring)
-
-    # Add the return introduction
-    full_output_type = f"{output_type.__module__}.{output_type.__name__}"
-    intro = TF_RETURN_INTRODUCTION if output_type.__name__.startswith("TF") else PT_RETURN_INTRODUCTION
-    intro = intro.format(full_output_type=full_output_type, config_class=config_class)
-    result = intro + params_docstring
-
-    # Apply minimum indent if necessary
-    if min_indent is not None:
-        lines = result.split("\n")
-        # Find the indent of the first nonempty line
-        i = 0
-        while len(lines[i]) == 0:
-            i += 1
-        indent = len(_get_indent(lines[i]))
-        # If too small, add indentation to all nonempty lines
-        if indent < min_indent:
-            to_add = " " * (min_indent - indent)
-            lines = [(f"{to_add}{line}" if len(line) > 0 else line) for line in lines]
-            result = "\n".join(lines)
-
-    return result
-
-
-PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import torch
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-    >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)  # Batch size 1
-
-    >>> outputs = model(**inputs, labels=labels)
-    >>> loss = outputs.loss
-    >>> logits = outputs.logits
-    ```
-"""
-
-PT_QUESTION_ANSWERING_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import torch
-
-    >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-    >>> inputs = tokenizer(question, text, return_tensors="pt")
-    >>> start_positions = torch.tensor([1])
-    >>> end_positions = torch.tensor([3])
-
-    >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
-    >>> loss = outputs.loss
-    >>> round(loss.item(), 2)
-    {expected_loss}
-
-    >>> start_scores = outputs.start_logits
-    >>> list(start_scores.shape)
-    {expected_output}
-
-    >>> end_scores = outputs.end_logits
-    >>> list(end_scores.shape)
-    {expected_output}
-    ```
-"""
-
-PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
-    Example of single-label classification:
-
-    ```python
-    >>> import torch
-    >>> from transformers import {processor_class}, {model_class}
-
-    >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}", num_labels=2)
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-    >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-    >>> outputs = model(**inputs, labels=labels)
-    >>> loss = outputs.loss
-    >>> logits = outputs.logits
-    >>> list(logits.shape)
-    {expected_output}
-    ```
-
-    Example of multi-label classification:
-
-    ```python
-    >>> import torch
-    >>> from transformers import {processor_class}, {model_class}
-
-    >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}", problem_type="multi_label_classification", num_labels=2)
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-    >>> labels = torch.tensor([[1, 1]], dtype=torch.float)  # need dtype=float for BCEWithLogitsLoss
-    >>> outputs = model(**inputs, labels=labels)
-    >>> loss = outputs.loss
-    >>> list(logits.shape)
-    {expected_output}
-    ```
-"""
-
-
-PT_MASKED_LM_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import torch
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
-    >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
-
-    >>> outputs = model(**inputs, labels=labels)
-    >>> loss = outputs.loss
-    >>> logits = outputs.logits
-    ```
-"""
-
-PT_BASE_MODEL_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import torch
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-    >>> outputs = model(**inputs)
-
-    >>> last_hidden_states = outputs.last_hidden_state
-    ```
-"""
-
-PT_MULTIPLE_CHOICE_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import torch
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-    >>> choice0 = "It is eaten with a fork and a knife."
-    >>> choice1 = "It is eaten while held in the hand."
-    >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
-
-    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
-    >>> outputs = model(**{{k: v.unsqueeze(0) for k, v in encoding.items()}}, labels=labels)  # batch size is 1
-
-    >>> # the linear classifier still needs to be trained
-    >>> loss = outputs.loss
-    >>> logits = outputs.logits
-    ```
-"""
-
-PT_CAUSAL_LM_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> import torch
-    >>> from transformers import {processor_class}, {model_class}
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-    >>> outputs = model(**inputs, labels=inputs["input_ids"])
-    >>> loss = outputs.loss
-    >>> logits = outputs.logits
-    ```
-"""
-
-PT_SPEECH_BASE_MODEL_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import torch
-    >>> from datasets import load_dataset
-
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
-    >>> dataset = dataset.sort("id")
-    >>> sampling_rate = dataset.features["audio"].sampling_rate
-
-    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> # audio file is decoded on the fly
-    >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
-    >>> with torch.no_grad():
-    ...     outputs = model(**inputs)
-
-    >>> last_hidden_states = outputs.last_hidden_state
-    >>> list(last_hidden_states.shape)
-    {expected_output}
-    ```
-"""
-
-PT_SPEECH_CTC_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> from datasets import load_dataset
-    >>> import torch
-
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
-    >>> dataset = dataset.sort("id")
-    >>> sampling_rate = dataset.features["audio"].sampling_rate
-
-    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> # audio file is decoded on the fly
-    >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
-    >>> with torch.no_grad():
-    ...     logits = model(**inputs).logits
-    >>> predicted_ids = torch.argmax(logits, dim=-1)
-
-    >>> # transcribe speech
-    >>> transcription = processor.batch_decode(predicted_ids)
-    >>> transcription[0]
-    {expected_output}
-    ```
-
-    ```python
-    >>> with processor.as_target_processor():
-    ...     inputs["labels"] = processor(dataset[0]["text"], return_tensors="pt").input_ids
-
-    >>> # compute loss
-    >>> loss = model(**inputs).loss
-    >>> round(loss.item(), 2)
-    {expected_loss}
-    ```
-"""
-
-PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> from datasets import load_dataset
-    >>> import torch
-
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
-    >>> dataset = dataset.sort("id")
-    >>> sampling_rate = dataset.features["audio"].sampling_rate
-
-    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> # audio file is decoded on the fly
-    >>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
-
-    >>> with torch.no_grad():
-    ...     logits = model(**inputs).logits
-
-    >>> predicted_class_ids = torch.argmax(logits, dim=-1).item()
-    >>> predicted_label = model.config.id2label[predicted_class_ids]
-    >>> predicted_label
-    {expected_output}
-    ```
-
-    ```python
-    >>> # compute loss - target_label is e.g. "down"
-    >>> target_label = model.config.id2label[0]
-    >>> inputs["labels"] = torch.tensor([model.config.label2id[target_label]])
-    >>> loss = model(**inputs).loss
-    >>> round(loss.item(), 2)
-    {expected_loss}
-    ```
-"""
-
-
-PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> from datasets import load_dataset
-    >>> import torch
-
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
-    >>> dataset = dataset.sort("id")
-    >>> sampling_rate = dataset.features["audio"].sampling_rate
-
-    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> # audio file is decoded on the fly
-    >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt", sampling_rate=sampling_rate)
-    >>> with torch.no_grad():
-    ...     logits = model(**inputs).logits
-
-    >>> probabilities = torch.sigmoid(logits[0])
-    >>> # labels is a one-hot array of shape (num_frames, num_speakers)
-    >>> labels = (probabilities > 0.5).long()
-    >>> labels[0].tolist()
-    {expected_output}
-    ```
-"""
-
-
-PT_SPEECH_XVECTOR_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> from datasets import load_dataset
-    >>> import torch
-
-    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
-    >>> dataset = dataset.sort("id")
-    >>> sampling_rate = dataset.features["audio"].sampling_rate
-
-    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> # audio file is decoded on the fly
-    >>> inputs = feature_extractor(
-    ...     [d["array"] for d in dataset[:2]["audio"]], sampling_rate=sampling_rate, return_tensors="pt", padding=True
-    ... )
-    >>> with torch.no_grad():
-    ...     embeddings = model(**inputs).embeddings
-
-    >>> embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()
-
-    >>> # the resulting embeddings can be used for cosine similarity-based retrieval
-    >>> cosine_sim = torch.nn.CosineSimilarity(dim=-1)
-    >>> similarity = cosine_sim(embeddings[0], embeddings[1])
-    >>> threshold = 0.7  # the optimal threshold is dataset-dependent
-    >>> if similarity < threshold:
-    ...     print("Speakers are not the same!")
-    >>> round(similarity.item(), 2)
-    {expected_output}
-    ```
-"""
-
-PT_VISION_BASE_MODEL_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import torch
-    >>> from datasets import load_dataset
-
-    >>> dataset = load_dataset("huggingface/cats-image")
-    >>> image = dataset["test"]["image"][0]
-
-    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = feature_extractor(image, return_tensors="pt")
-
-    >>> with torch.no_grad():
-    ...     outputs = model(**inputs)
-
-    >>> last_hidden_states = outputs.last_hidden_state
-    >>> list(last_hidden_states.shape)
-    {expected_output}
-    ```
-"""
-
-PT_VISION_SEQ_CLASS_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import torch
-    >>> from datasets import load_dataset
-
-    >>> dataset = load_dataset("huggingface/cats-image")
-    >>> image = dataset["test"]["image"][0]
-
-    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = feature_extractor(image, return_tensors="pt")
-
-    >>> with torch.no_grad():
-    ...     logits = model(**inputs).logits
-
-    >>> # model predicts one of the 1000 ImageNet classes
-    >>> predicted_label = logits.argmax(-1).item()
-    >>> print(model.config.id2label[predicted_label])
-    {expected_output}
-    ```
-"""
-
-
-PT_SAMPLE_DOCSTRINGS = {
-    "SequenceClassification": PT_SEQUENCE_CLASSIFICATION_SAMPLE,
-    "QuestionAnswering": PT_QUESTION_ANSWERING_SAMPLE,
-    "TokenClassification": PT_TOKEN_CLASSIFICATION_SAMPLE,
-    "MultipleChoice": PT_MULTIPLE_CHOICE_SAMPLE,
-    "MaskedLM": PT_MASKED_LM_SAMPLE,
-    "LMHead": PT_CAUSAL_LM_SAMPLE,
-    "BaseModel": PT_BASE_MODEL_SAMPLE,
-    "SpeechBaseModel": PT_SPEECH_BASE_MODEL_SAMPLE,
-    "CTC": PT_SPEECH_CTC_SAMPLE,
-    "AudioClassification": PT_SPEECH_SEQ_CLASS_SAMPLE,
-    "AudioFrameClassification": PT_SPEECH_FRAME_CLASS_SAMPLE,
-    "AudioXVector": PT_SPEECH_XVECTOR_SAMPLE,
-    "VisionBaseModel": PT_VISION_BASE_MODEL_SAMPLE,
-    "ImageClassification": PT_VISION_SEQ_CLASS_SAMPLE,
-}
-
-
-TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import tensorflow as tf
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
-    >>> input_ids = inputs["input_ids"]
-    >>> inputs["labels"] = tf.reshape(
-    ...     tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))
-    >>> )  # Batch size 1
-
-    >>> outputs = model(inputs)
-    >>> loss = outputs.loss
-    >>> logits = outputs.logits
-    ```
-"""
-
-TF_QUESTION_ANSWERING_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import tensorflow as tf
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-    >>> input_dict = tokenizer(question, text, return_tensors="tf")
-    >>> outputs = model(input_dict)
-    >>> start_logits = outputs.start_logits
-    >>> end_logits = outputs.end_logits
-
-    >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-    >>> answer = " ".join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0] + 1])
-    ```
-"""
-
-TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import tensorflow as tf
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
-    >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1))  # Batch size 1
-
-    >>> outputs = model(inputs)
-    >>> loss = outputs.loss
-    >>> logits = outputs.logits
-    ```
-"""
-
-TF_MASKED_LM_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import tensorflow as tf
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf")
-    >>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
-
-    >>> outputs = model(inputs)
-    >>> loss = outputs.loss
-    >>> logits = outputs.logits
-    ```
-"""
-
-TF_BASE_MODEL_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import tensorflow as tf
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
-    >>> outputs = model(inputs)
-
-    >>> last_hidden_states = outputs.last_hidden_state
-    ```
-"""
-
-TF_MULTIPLE_CHOICE_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import tensorflow as tf
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-    >>> choice0 = "It is eaten with a fork and a knife."
-    >>> choice1 = "It is eaten while held in the hand."
-
-    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="tf", padding=True)
-    >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
-    >>> outputs = model(inputs)  # batch size is 1
-
-    >>> # the linear classifier still needs to be trained
-    >>> logits = outputs.logits
-    ```
-"""
-
-TF_CAUSAL_LM_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-    >>> import tensorflow as tf
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
-    >>> outputs = model(inputs)
-    >>> logits = outputs.logits
-    ```
-"""
-
-TF_SAMPLE_DOCSTRINGS = {
-    "SequenceClassification": TF_SEQUENCE_CLASSIFICATION_SAMPLE,
-    "QuestionAnswering": TF_QUESTION_ANSWERING_SAMPLE,
-    "TokenClassification": TF_TOKEN_CLASSIFICATION_SAMPLE,
-    "MultipleChoice": TF_MULTIPLE_CHOICE_SAMPLE,
-    "MaskedLM": TF_MASKED_LM_SAMPLE,
-    "LMHead": TF_CAUSAL_LM_SAMPLE,
-    "BaseModel": TF_BASE_MODEL_SAMPLE,
-}
-
-
-FLAX_TOKEN_CLASSIFICATION_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
-
-    >>> outputs = model(**inputs)
-    >>> logits = outputs.logits
-    ```
-"""
-
-FLAX_QUESTION_ANSWERING_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-    >>> inputs = tokenizer(question, text, return_tensors="jax")
-
-    >>> outputs = model(**inputs)
-    >>> start_scores = outputs.start_logits
-    >>> end_scores = outputs.end_logits
-    ```
-"""
-
-FLAX_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
-
-    >>> outputs = model(**inputs)
-    >>> logits = outputs.logits
-    ```
-"""
-
-FLAX_MASKED_LM_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="jax")
-
-    >>> outputs = model(**inputs)
-    >>> logits = outputs.logits
-    ```
-"""
-
-FLAX_BASE_MODEL_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
-    >>> outputs = model(**inputs)
-
-    >>> last_hidden_states = outputs.last_hidden_state
-    ```
-"""
-
-FLAX_MULTIPLE_CHOICE_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-    >>> choice0 = "It is eaten with a fork and a knife."
-    >>> choice1 = "It is eaten while held in the hand."
-
-    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="jax", padding=True)
-    >>> outputs = model(**{{k: v[None, :] for k, v in encoding.items()}})
-
-    >>> logits = outputs.logits
-    ```
-"""
-
-FLAX_CAUSAL_LM_SAMPLE = r"""
-    Example:
-
-    ```python
-    >>> from transformers import {processor_class}, {model_class}
-
-    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
-    >>> model = {model_class}.from_pretrained("{checkpoint}")
-
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
-    >>> outputs = model(**inputs)
-
-    >>> # retrieve logts for next token
-    >>> next_token_logits = outputs.logits[:, -1]
-    ```
-"""
-
-FLAX_SAMPLE_DOCSTRINGS = {
-    "SequenceClassification": FLAX_SEQUENCE_CLASSIFICATION_SAMPLE,
-    "QuestionAnswering": FLAX_QUESTION_ANSWERING_SAMPLE,
-    "TokenClassification": FLAX_TOKEN_CLASSIFICATION_SAMPLE,
-    "MultipleChoice": FLAX_MULTIPLE_CHOICE_SAMPLE,
-    "MaskedLM": FLAX_MASKED_LM_SAMPLE,
-    "BaseModel": FLAX_BASE_MODEL_SAMPLE,
-    "LMHead": FLAX_CAUSAL_LM_SAMPLE,
-}
-
-
-def add_code_sample_docstrings(
-    *docstr,
-    processor_class=None,
-    checkpoint=None,
-    output_type=None,
-    config_class=None,
-    mask="[MASK]",
-    model_cls=None,
-    modality=None,
-    expected_output="",
-    expected_loss="",
-):
-    def docstring_decorator(fn):
-        # model_class defaults to function's class if not specified otherwise
-        model_class = fn.__qualname__.split(".")[0] if model_cls is None else model_cls
-
-        if model_class[:2] == "TF":
-            sample_docstrings = TF_SAMPLE_DOCSTRINGS
-        elif model_class[:4] == "Flax":
-            sample_docstrings = FLAX_SAMPLE_DOCSTRINGS
-        else:
-            sample_docstrings = PT_SAMPLE_DOCSTRINGS
-
-        # putting all kwargs for docstrings in a dict to be used
-        # with the `.format(**doc_kwargs)`. Note that string might
-        # be formatted with non-existing keys, which is fine.
-        doc_kwargs = dict(
-            model_class=model_class,
-            processor_class=processor_class,
-            checkpoint=checkpoint,
-            mask=mask,
-            expected_output=expected_output,
-            expected_loss=expected_loss,
-        )
-
-        if "SequenceClassification" in model_class and modality == "audio":
-            code_sample = sample_docstrings["AudioClassification"]
-        elif "SequenceClassification" in model_class:
-            code_sample = sample_docstrings["SequenceClassification"]
-        elif "QuestionAnswering" in model_class:
-            code_sample = sample_docstrings["QuestionAnswering"]
-        elif "TokenClassification" in model_class:
-            code_sample = sample_docstrings["TokenClassification"]
-        elif "MultipleChoice" in model_class:
-            code_sample = sample_docstrings["MultipleChoice"]
-        elif "MaskedLM" in model_class or model_class in ["FlaubertWithLMHeadModel", "XLMWithLMHeadModel"]:
-            code_sample = sample_docstrings["MaskedLM"]
-        elif "LMHead" in model_class or "CausalLM" in model_class:
-            code_sample = sample_docstrings["LMHead"]
-        elif "CTC" in model_class:
-            code_sample = sample_docstrings["CTC"]
-        elif "AudioFrameClassification" in model_class:
-            code_sample = sample_docstrings["AudioFrameClassification"]
-        elif "XVector" in model_class and modality == "audio":
-            code_sample = sample_docstrings["AudioXVector"]
-        elif "Model" in model_class and modality == "audio":
-            code_sample = sample_docstrings["SpeechBaseModel"]
-        elif "Model" in model_class and modality == "vision":
-            code_sample = sample_docstrings["VisionBaseModel"]
-        elif "Model" in model_class or "Encoder" in model_class:
-            code_sample = sample_docstrings["BaseModel"]
-        elif "ImageClassification" in model_class:
-            code_sample = sample_docstrings["ImageClassification"]
-        else:
-            raise ValueError(f"Docstring can't be built for model {model_class}")
-
-        func_doc = (fn.__doc__ or "") + "".join(docstr)
-        output_doc = "" if output_type is None else _prepare_output_docstrings(output_type, config_class)
-        built_doc = code_sample.format(**doc_kwargs)
-        fn.__doc__ = func_doc + output_doc + built_doc
-        return fn
-
-    return docstring_decorator
-
-
-def replace_return_docstrings(output_type=None, config_class=None):
-    def docstring_decorator(fn):
-        func_doc = fn.__doc__
-        lines = func_doc.split("\n")
-        i = 0
-        while i < len(lines) and re.search(r"^\s*Returns?:\s*$", lines[i]) is None:
-            i += 1
-        if i < len(lines):
-            indent = len(_get_indent(lines[i]))
-            lines[i] = _prepare_output_docstrings(output_type, config_class, min_indent=indent)
-            func_doc = "\n".join(lines)
-        else:
-            raise ValueError(
-                f"The function {fn} should have an empty 'Return:' or 'Returns:' in its docstring as placeholder, "
-                f"current docstring is:\n{func_doc}"
-            )
-        fn.__doc__ = func_doc
-        return fn
-
-    return docstring_decorator
-
-
-def is_remote_url(url_or_filename):
-    parsed = urlparse(url_or_filename)
-    return parsed.scheme in ("http", "https")
-
-
-def hf_bucket_url(
-    model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None, mirror=None
-) -> str:
-    """
-    Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co-hosted url, redirecting
-    to Cloudfront (a Content Delivery Network, or CDN) for large files.
-
-    Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our
-    bandwidth costs).
-
-    Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here
-    because we migrated to a git-based versioning system on huggingface.co, so we now store the files on S3/Cloudfront
-    in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache
-    can't ever be stale.
-
-    In terms of client-side caching from this library, we base our caching on the objects' ETag. An object' ETag is:
-    its sha1 if stored in git, or its sha256 if stored in git-lfs. Files cached locally from transformers before v3.5.0
-    are not shared with those new files, because the cached file's name contains a hash of the url (which changed).
-    """
-    if subfolder is not None:
-        filename = f"{subfolder}/{filename}"
-
-    if mirror:
-        if mirror in ["tuna", "bfsu"]:
-            raise ValueError("The Tuna and BFSU mirrors are no longer available. Try removing the mirror argument.")
-        legacy_format = "/" not in model_id
-        if legacy_format:
-            return f"{mirror}/{model_id}-{filename}"
-        else:
-            return f"{mirror}/{model_id}/{filename}"
-
-    if revision is None:
-        revision = "main"
-    return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename)
-
-
-def url_to_filename(url: str, etag: Optional[str] = None) -> str:
-    """
-    Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's,
-    delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can
-    identify it as a HDF5 file (see
-    https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
-    """
-    url_bytes = url.encode("utf-8")
-    filename = sha256(url_bytes).hexdigest()
-
-    if etag:
-        etag_bytes = etag.encode("utf-8")
-        filename += "." + sha256(etag_bytes).hexdigest()
-
-    if url.endswith(".h5"):
-        filename += ".h5"
-
-    return filename
-
-
-def filename_to_url(filename, cache_dir=None):
-    """
-    Return the url and etag (which may be `None`) stored for *filename*. Raise `EnvironmentError` if *filename* or its
-    stored metadata do not exist.
-    """
-    if cache_dir is None:
-        cache_dir = TRANSFORMERS_CACHE
-    if isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
-
-    cache_path = os.path.join(cache_dir, filename)
-    if not os.path.exists(cache_path):
-        raise EnvironmentError(f"file {cache_path} not found")
-
-    meta_path = cache_path + ".json"
-    if not os.path.exists(meta_path):
-        raise EnvironmentError(f"file {meta_path} not found")
-
-    with open(meta_path, encoding="utf-8") as meta_file:
-        metadata = json.load(meta_file)
-    url = metadata["url"]
-    etag = metadata["etag"]
-
-    return url, etag
-
-
-def get_cached_models(cache_dir: Union[str, Path] = None) -> List[Tuple]:
-    """
-    Returns a list of tuples representing model binaries that are cached locally. Each tuple has shape `(model_url,
-    etag, size_MB)`. Filenames in `cache_dir` are use to get the metadata for each model, only urls ending with *.bin*
-    are added.
-
-    Args:
-        cache_dir (`Union[str, Path]`, *optional*):
-            The cache directory to search for models within. Will default to the transformers cache if unset.
-
-    Returns:
-        List[Tuple]: List of tuples each with shape `(model_url, etag, size_MB)`
-    """
-    if cache_dir is None:
-        cache_dir = TRANSFORMERS_CACHE
-    elif isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
-
-    cached_models = []
-    for file in os.listdir(cache_dir):
-        if file.endswith(".json"):
-            meta_path = os.path.join(cache_dir, file)
-            with open(meta_path, encoding="utf-8") as meta_file:
-                metadata = json.load(meta_file)
-                url = metadata["url"]
-                etag = metadata["etag"]
-                if url.endswith(".bin"):
-                    size_MB = os.path.getsize(meta_path.strip(".json")) / 1e6
-                    cached_models.append((url, etag, size_MB))
-
-    return cached_models
-
-
-def cached_path(
-    url_or_filename,
-    cache_dir=None,
-    force_download=False,
-    proxies=None,
-    resume_download=False,
-    user_agent: Union[Dict, str, None] = None,
-    extract_compressed_file=False,
-    force_extract=False,
-    use_auth_token: Union[bool, str, None] = None,
-    local_files_only=False,
-) -> Optional[str]:
-    """
-    Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file
-    and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and
-    then return the path
-
-    Args:
-        cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
-        force_download: if True, re-download the file even if it's already cached in the cache dir.
-        resume_download: if True, resume the download if incompletely received file is found.
-        user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
-        use_auth_token: Optional string or boolean to use as Bearer token for remote files. If True,
-            will get token from ~/.huggingface.
-        extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed
-            file in a folder along the archive.
-        force_extract: if True when extract_compressed_file is True and the archive was already extracted,
-            re-extract the archive and override the folder where it was extracted.
-
-    Return:
-        Local path (string) of file or if networking is off, last version of file cached on disk.
-
-    Raises:
-        In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
-    """
-    if cache_dir is None:
-        cache_dir = TRANSFORMERS_CACHE
-    if isinstance(url_or_filename, Path):
-        url_or_filename = str(url_or_filename)
-    if isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
-
-    if is_offline_mode() and not local_files_only:
-        logger.info("Offline mode: forcing local_files_only=True")
-        local_files_only = True
-
-    if is_remote_url(url_or_filename):
-        # URL, so get it from the cache (downloading if necessary)
-        output_path = get_from_cache(
-            url_or_filename,
-            cache_dir=cache_dir,
-            force_download=force_download,
-            proxies=proxies,
-            resume_download=resume_download,
-            user_agent=user_agent,
-            use_auth_token=use_auth_token,
-            local_files_only=local_files_only,
-        )
-    elif os.path.exists(url_or_filename):
-        # File, and it exists.
-        output_path = url_or_filename
-    elif urlparse(url_or_filename).scheme == "":
-        # File, but it doesn't exist.
-        raise EnvironmentError(f"file {url_or_filename} not found")
-    else:
-        # Something unknown
-        raise ValueError(f"unable to parse {url_or_filename} as a URL or as a local path")
-
-    if extract_compressed_file:
-        if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
-            return output_path
-
-        # Path where we extract compressed archives
-        # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/"
-        output_dir, output_file = os.path.split(output_path)
-        output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
-        output_path_extracted = os.path.join(output_dir, output_extract_dir_name)
-
-        if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract:
-            return output_path_extracted
-
-        # Prevent parallel extractions
-        lock_path = output_path + ".lock"
-        with FileLock(lock_path):
-            shutil.rmtree(output_path_extracted, ignore_errors=True)
-            os.makedirs(output_path_extracted)
-            if is_zipfile(output_path):
-                with ZipFile(output_path, "r") as zip_file:
-                    zip_file.extractall(output_path_extracted)
-                    zip_file.close()
-            elif tarfile.is_tarfile(output_path):
-                tar_file = tarfile.open(output_path)
-                tar_file.extractall(output_path_extracted)
-                tar_file.close()
-            else:
-                raise EnvironmentError(f"Archive format of {output_path} could not be identified")
-
-        return output_path_extracted
-
-    return output_path
-
-
-def define_sagemaker_information():
-    try:
-        instance_data = requests.get(os.environ["ECS_CONTAINER_METADATA_URI"]).json()
-        dlc_container_used = instance_data["Image"]
-        dlc_tag = instance_data["Image"].split(":")[1]
-    except Exception:
-        dlc_container_used = None
-        dlc_tag = None
-
-    sagemaker_params = json.loads(os.getenv("SM_FRAMEWORK_PARAMS", "{}"))
-    runs_distributed_training = True if "sagemaker_distributed_dataparallel_enabled" in sagemaker_params else False
-    account_id = os.getenv("TRAINING_JOB_ARN").split(":")[4] if "TRAINING_JOB_ARN" in os.environ else None
-
-    sagemaker_object = {
-        "sm_framework": os.getenv("SM_FRAMEWORK_MODULE", None),
-        "sm_region": os.getenv("AWS_REGION", None),
-        "sm_number_gpu": os.getenv("SM_NUM_GPUS", 0),
-        "sm_number_cpu": os.getenv("SM_NUM_CPUS", 0),
-        "sm_distributed_training": runs_distributed_training,
-        "sm_deep_learning_container": dlc_container_used,
-        "sm_deep_learning_container_tag": dlc_tag,
-        "sm_account_id": account_id,
-    }
-    return sagemaker_object
-
-
-def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
-    """
-    Formats a user-agent string with basic info about a request.
-    """
-    ua = f"transformers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}"
-    if is_torch_available():
-        ua += f"; torch/{_torch_version}"
-    if is_tf_available():
-        ua += f"; tensorflow/{_tf_version}"
-    if DISABLE_TELEMETRY:
-        return ua + "; telemetry/off"
-    if is_training_run_on_sagemaker():
-        ua += "; " + "; ".join(f"{k}/{v}" for k, v in define_sagemaker_information().items())
-    # CI will set this value to True
-    if os.environ.get("TRANSFORMERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES:
-        ua += "; is_ci/true"
-    if isinstance(user_agent, dict):
-        ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items())
-    elif isinstance(user_agent, str):
-        ua += "; " + user_agent
-    return ua
-
-
-class RepositoryNotFoundError(HTTPError):
-    """
-    Raised when trying to access a hf.co URL with an invalid repository name, or with a private repo name the user does
-    not have access to.
-    """
-
-
-class EntryNotFoundError(HTTPError):
-    """Raised when trying to access a hf.co URL with a valid repository and revision but an invalid filename."""
-
-
-class RevisionNotFoundError(HTTPError):
-    """Raised when trying to access a hf.co URL with a valid repository but an invalid revision."""
-
-
-def _raise_for_status(request):
-    """
-    Internal version of `request.raise_for_status()` that will refine a potential HTTPError.
-    """
-    if "X-Error-Code" in request.headers:
-        error_code = request.headers["X-Error-Code"]
-        if error_code == "RepoNotFound":
-            raise RepositoryNotFoundError(f"404 Client Error: Repository Not Found for url: {request.url}")
-        elif error_code == "EntryNotFound":
-            raise EntryNotFoundError(f"404 Client Error: Entry Not Found for url: {request.url}")
-        elif error_code == "RevisionNotFound":
-            raise RevisionNotFoundError((f"404 Client Error: Revision Not Found for url: {request.url}"))
-
-    request.raise_for_status()
-
-
-def http_get(url: str, temp_file: BinaryIO, proxies=None, resume_size=0, headers: Optional[Dict[str, str]] = None):
-    """
-    Download remote file. Do not gobble up errors.
-    """
-    headers = copy.deepcopy(headers)
-    if resume_size > 0:
-        headers["Range"] = f"bytes={resume_size}-"
-    r = requests.get(url, stream=True, proxies=proxies, headers=headers)
-    _raise_for_status(r)
-    content_length = r.headers.get("Content-Length")
-    total = resume_size + int(content_length) if content_length is not None else None
-    # `tqdm` behavior is determined by `utils.logging.is_progress_bar_enabled()`
-    # and can be set using `utils.logging.enable/disable_progress_bar()`
-    progress = tqdm(
-        unit="B",
-        unit_scale=True,
-        unit_divisor=1024,
-        total=total,
-        initial=resume_size,
-        desc="Downloading",
-    )
-    for chunk in r.iter_content(chunk_size=1024):
-        if chunk:  # filter out keep-alive new chunks
-            progress.update(len(chunk))
-            temp_file.write(chunk)
-    progress.close()
-
-
-def get_from_cache(
-    url: str,
-    cache_dir=None,
-    force_download=False,
-    proxies=None,
-    etag_timeout=10,
-    resume_download=False,
-    user_agent: Union[Dict, str, None] = None,
-    use_auth_token: Union[bool, str, None] = None,
-    local_files_only=False,
-) -> Optional[str]:
-    """
-    Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the
-    path to the cached file.
-
-    Return:
-        Local path (string) of file or if networking is off, last version of file cached on disk.
-
-    Raises:
-        In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
-    """
-    if cache_dir is None:
-        cache_dir = TRANSFORMERS_CACHE
-    if isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
-
-    os.makedirs(cache_dir, exist_ok=True)
-
-    headers = {"user-agent": http_user_agent(user_agent)}
-    if isinstance(use_auth_token, str):
-        headers["authorization"] = f"Bearer {use_auth_token}"
-    elif use_auth_token:
-        token = HfFolder.get_token()
-        if token is None:
-            raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.")
-        headers["authorization"] = f"Bearer {token}"
-
-    url_to_download = url
-    etag = None
-    if not local_files_only:
-        try:
-            r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
-            _raise_for_status(r)
-            etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
-            # We favor a custom header indicating the etag of the linked resource, and
-            # we fallback to the regular etag header.
-            # If we don't have any of those, raise an error.
-            if etag is None:
-                raise OSError(
-                    "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
-                )
-            # In case of a redirect,
-            # save an extra redirect on the request.get call,
-            # and ensure we download the exact atomic version even if it changed
-            # between the HEAD and the GET (unlikely, but hey).
-            if 300 <= r.status_code <= 399:
-                url_to_download = r.headers["Location"]
-        except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
-            # Actually raise for those subclasses of ConnectionError
-            raise
-        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
-            # Otherwise, our Internet connection is down.
-            # etag is None
-            pass
-
-    filename = url_to_filename(url, etag)
-
-    # get cache path to put the file
-    cache_path = os.path.join(cache_dir, filename)
-
-    # etag is None == we don't have a connection or we passed local_files_only.
-    # try to get the last downloaded one
-    if etag is None:
-        if os.path.exists(cache_path):
-            return cache_path
-        else:
-            matching_files = [
-                file
-                for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*")
-                if not file.endswith(".json") and not file.endswith(".lock")
-            ]
-            if len(matching_files) > 0:
-                return os.path.join(cache_dir, matching_files[-1])
-            else:
-                # If files cannot be found and local_files_only=True,
-                # the models might've been found if local_files_only=False
-                # Notify the user about that
-                if local_files_only:
-                    raise FileNotFoundError(
-                        "Cannot find the requested files in the cached path and outgoing traffic has been"
-                        " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
-                        " to False."
-                    )
-                else:
-                    raise ValueError(
-                        "Connection error, and we cannot find the requested files in the cached path."
-                        " Please try again or make sure your Internet connection is on."
-                    )
-
-    # From now on, etag is not None.
-    if os.path.exists(cache_path) and not force_download:
-        return cache_path
-
-    # Prevent parallel downloads of the same file with a lock.
-    lock_path = cache_path + ".lock"
-    with FileLock(lock_path):
-
-        # If the download just completed while the lock was activated.
-        if os.path.exists(cache_path) and not force_download:
-            # Even if returning early like here, the lock will be released.
-            return cache_path
-
-        if resume_download:
-            incomplete_path = cache_path + ".incomplete"
-
-            @contextmanager
-            def _resumable_file_manager() -> "io.BufferedWriter":
-                with open(incomplete_path, "ab") as f:
-                    yield f
-
-            temp_file_manager = _resumable_file_manager
-            if os.path.exists(incomplete_path):
-                resume_size = os.stat(incomplete_path).st_size
-            else:
-                resume_size = 0
-        else:
-            temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False)
-            resume_size = 0
-
-        # Download to temporary file, then copy to cache dir once finished.
-        # Otherwise you get corrupt cache entries if the download gets interrupted.
-        with temp_file_manager() as temp_file:
-            logger.info(f"{url} not found in cache or force_download set to True, downloading to {temp_file.name}")
-
-            http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, headers=headers)
-
-        logger.info(f"storing {url} in cache at {cache_path}")
-        os.replace(temp_file.name, cache_path)
-
-        # NamedTemporaryFile creates a file with hardwired 0600 perms (ignoring umask), so fixing it.
-        umask = os.umask(0o666)
-        os.umask(umask)
-        os.chmod(cache_path, 0o666 & ~umask)
-
-        logger.info(f"creating metadata file for {cache_path}")
-        meta = {"url": url, "etag": etag}
-        meta_path = cache_path + ".json"
-        with open(meta_path, "w") as meta_file:
-            json.dump(meta, meta_file)
-
-    return cache_path
-
-
-def get_file_from_repo(
-    path_or_repo: Union[str, os.PathLike],
-    filename: str,
-    cache_dir: Optional[Union[str, os.PathLike]] = None,
-    force_download: bool = False,
-    resume_download: bool = False,
-    proxies: Optional[Dict[str, str]] = None,
-    use_auth_token: Optional[Union[bool, str]] = None,
-    revision: Optional[str] = None,
-    local_files_only: bool = False,
-):
-    """
-    Tries to locate a file in a local folder and repo, downloads and cache it if necessary.
-
-    Args:
-        path_or_repo (`str` or `os.PathLike`):
-            This can be either:
-
-            - a string, the *model id* of a model repo on huggingface.co.
-            - a path to a *directory* potentially containing the file.
-        filename (`str`):
-            The name of the file to locate in `path_or_repo`.
-        cache_dir (`str` or `os.PathLike`, *optional*):
-            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
-            cache should not be used.
-        force_download (`bool`, *optional*, defaults to `False`):
-            Whether or not to force to (re-)download the configuration files and override the cached versions if they
-            exist.
-        resume_download (`bool`, *optional*, defaults to `False`):
-            Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
-        proxies (`Dict[str, str]`, *optional*):
-            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
-            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
-        use_auth_token (`str` or *bool*, *optional*):
-            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-            when running `transformers-cli login` (stored in `~/.huggingface`).
-        revision (`str`, *optional*, defaults to `"main"`):
-            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
-            identifier allowed by git.
-        local_files_only (`bool`, *optional*, defaults to `False`):
-            If `True`, will only try to load the tokenizer configuration from local files.
-
-    <Tip>
-
-    Passing `use_auth_token=True` is required when you want to use a private model.
-
-    </Tip>
-
-    Returns:
-        `Optional[str]`: Returns the resolved file (to the cache folder if downloaded from a repo) or `None` if the
-        file does not exist.
-
-    Examples:
-
-    ```python
-    # Download a tokenizer configuration from huggingface.co and cache.
-    tokenizer_config = get_file_from_repo("bert-base-uncased", "tokenizer_config.json")
-    # This model does not have a tokenizer config so the result will be None.
-    tokenizer_config = get_file_from_repo("xlm-roberta-base", "tokenizer_config.json")
-    ```"""
-    if is_offline_mode() and not local_files_only:
-        logger.info("Offline mode: forcing local_files_only=True")
-        local_files_only = True
-
-    path_or_repo = str(path_or_repo)
-    if os.path.isdir(path_or_repo):
-        resolved_file = os.path.join(path_or_repo, filename)
-        return resolved_file if os.path.isfile(resolved_file) else None
-    else:
-        resolved_file = hf_bucket_url(path_or_repo, filename=filename, revision=revision, mirror=None)
-
-    try:
-        # Load from URL or cache if already cached
-        resolved_file = cached_path(
-            resolved_file,
-            cache_dir=cache_dir,
-            force_download=force_download,
-            proxies=proxies,
-            resume_download=resume_download,
-            local_files_only=local_files_only,
-            use_auth_token=use_auth_token,
-        )
-
-    except RepositoryNotFoundError:
-        raise EnvironmentError(
-            f"{path_or_repo} is not a local folder and is not a valid model identifier "
-            "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to "
-            "pass a token having permission to this repo with `use_auth_token` or log in with "
-            "`huggingface-cli login` and pass `use_auth_token=True`."
-        )
-    except RevisionNotFoundError:
-        raise EnvironmentError(
-            f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists "
-            "for this model name. Check the model page at "
-            f"'https://huggingface.co/{path_or_repo}' for available revisions."
-        )
-    except EnvironmentError:
-        # The repo and revision exist, but the file does not or there was a connection error fetching it.
-        return None
-
-    return resolved_file
-
-
-def has_file(
-    path_or_repo: Union[str, os.PathLike],
-    filename: str,
-    revision: Optional[str] = None,
-    mirror: Optional[str] = None,
-    proxies: Optional[Dict[str, str]] = None,
-    use_auth_token: Optional[Union[bool, str]] = None,
-):
-    """
-    Checks if a repo contains a given file wihtout downloading it. Works for remote repos and local folders.
-
-    <Tip warning={false}>
-
-    This function will raise an error if the repository `path_or_repo` is not valid or if `revision` does not exist for
-    this repo, but will return False for regular connection errors.
-
-    </Tip>
-    """
-    if os.path.isdir(path_or_repo):
-        return os.path.isfile(os.path.join(path_or_repo, filename))
-
-    url = hf_bucket_url(path_or_repo, filename=filename, revision=revision, mirror=mirror)
-
-    headers = {"user-agent": http_user_agent()}
-    if isinstance(use_auth_token, str):
-        headers["authorization"] = f"Bearer {use_auth_token}"
-    elif use_auth_token:
-        token = HfFolder.get_token()
-        if token is None:
-            raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.")
-        headers["authorization"] = f"Bearer {token}"
-
-    r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=10)
-    try:
-        _raise_for_status(r)
-        return True
-    except RepositoryNotFoundError as e:
-        logger.error(e)
-        raise EnvironmentError(f"{path_or_repo} is not a local folder or a valid repository name on 'https://hf.co'.")
-    except RevisionNotFoundError as e:
-        logger.error(e)
-        raise EnvironmentError(
-            f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for this "
-            "model name. Check the model page at 'https://huggingface.co/{path_or_repo}' for available revisions."
-        )
-    except requests.HTTPError:
-        # We return false for EntryNotFoundError (logical) as well as any connection error.
-        return False
-
-
-def get_list_of_files(
-    path_or_repo: Union[str, os.PathLike],
-    revision: Optional[str] = None,
-    use_auth_token: Optional[Union[bool, str]] = None,
-    local_files_only: bool = False,
-) -> List[str]:
-    """
-    Gets the list of files inside `path_or_repo`.
-
-    Args:
-        path_or_repo (`str` or `os.PathLike`):
-            Can be either the id of a repo on huggingface.co or a path to a *directory*.
-        revision (`str`, *optional*, defaults to `"main"`):
-            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
-            identifier allowed by git.
-        use_auth_token (`str` or *bool*, *optional*):
-            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-            when running `transformers-cli login` (stored in `~/.huggingface`).
-        local_files_only (`bool`, *optional*, defaults to `False`):
-            Whether or not to only rely on local files and not to attempt to download any files.
-
-    <Tip warning={true}>
-
-    This API is not optimized, so calling it a lot may result in connection errors.
-
-    </Tip>
-
-    Returns:
-        `List[str]`: The list of files available in `path_or_repo`.
-    """
-    path_or_repo = str(path_or_repo)
-    # If path_or_repo is a folder, we just return what is inside (subdirectories included).
-    if os.path.isdir(path_or_repo):
-        list_of_files = []
-        for path, dir_names, file_names in os.walk(path_or_repo):
-            list_of_files.extend([os.path.join(path, f) for f in file_names])
-        return list_of_files
-
-    # Can't grab the files if we are on offline mode.
-    if is_offline_mode() or local_files_only:
-        return []
-
-    # Otherwise we grab the token and use the list_repo_files method.
-    if isinstance(use_auth_token, str):
-        token = use_auth_token
-    elif use_auth_token is True:
-        token = HfFolder.get_token()
-    else:
-        token = None
-
-    try:
-        return list_repo_files(path_or_repo, revision=revision, token=token)
-    except HTTPError as e:
-        raise ValueError(
-            f"{path_or_repo} is not a local path or a model identifier on the model Hub. Did you make a typo?"
-        ) from e
-
-
-class cached_property(property):
-    """
-    Descriptor that mimics @property but caches output in member variable.
-
-    From tensorflow_datasets
-
-    Built-in in functools from Python 3.8.
-    """
-
-    def __get__(self, obj, objtype=None):
-        # See docs.python.org/3/howto/descriptor.html#properties
-        if obj is None:
-            return self
-        if self.fget is None:
-            raise AttributeError("unreadable attribute")
-        attr = "__cached_" + self.fget.__name__
-        cached = getattr(obj, attr, None)
-        if cached is None:
-            cached = self.fget(obj)
-            setattr(obj, attr, cached)
-        return cached
-
-
-def torch_required(func):
-    # Chose a different decorator name than in tests so it's clear they are not the same.
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        if is_torch_available():
-            return func(*args, **kwargs)
-        else:
-            raise ImportError(f"Method `{func.__name__}` requires PyTorch.")
-
-    return wrapper
-
-
-def tf_required(func):
-    # Chose a different decorator name than in tests so it's clear they are not the same.
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        if is_tf_available():
-            return func(*args, **kwargs)
-        else:
-            raise ImportError(f"Method `{func.__name__}` requires TF.")
-
-    return wrapper
-
-
-def is_torch_fx_proxy(x):
-    if is_torch_fx_available():
-        import torch.fx
-
-        return isinstance(x, torch.fx.Proxy)
-    return False
-
-
-def is_tensor(x):
-    """
-    Tests if `x` is a `torch.Tensor`, `tf.Tensor`, `jaxlib.xla_extension.DeviceArray` or `np.ndarray`.
-    """
-    if is_torch_fx_proxy(x):
-        return True
-    if is_torch_available():
-        import torch
-
-        if isinstance(x, torch.Tensor):
-            return True
-    if is_tf_available():
-        import tensorflow as tf
-
-        if isinstance(x, tf.Tensor):
-            return True
-
-    if is_flax_available():
-        import jax.numpy as jnp
-        from jax.core import Tracer
-
-        if isinstance(x, (jnp.ndarray, Tracer)):
-            return True
-
-    return isinstance(x, np.ndarray)
-
-
-def _is_numpy(x):
-    return isinstance(x, np.ndarray)
-
-
-def _is_torch(x):
-    import torch
-
-    return isinstance(x, torch.Tensor)
-
-
-def _is_torch_device(x):
-    import torch
-
-    return isinstance(x, torch.device)
-
-
-def _is_tensorflow(x):
-    import tensorflow as tf
-
-    return isinstance(x, tf.Tensor)
-
-
-def _is_jax(x):
-    import jax.numpy as jnp  # noqa: F811
-
-    return isinstance(x, jnp.ndarray)
-
-
-def to_py_obj(obj):
-    """
-    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
-    """
-    if isinstance(obj, (dict, UserDict)):
-        return {k: to_py_obj(v) for k, v in obj.items()}
-    elif isinstance(obj, (list, tuple)):
-        return [to_py_obj(o) for o in obj]
-    elif is_tf_available() and _is_tensorflow(obj):
-        return obj.numpy().tolist()
-    elif is_torch_available() and _is_torch(obj):
-        return obj.detach().cpu().tolist()
-    elif is_flax_available() and _is_jax(obj):
-        return np.asarray(obj).tolist()
-    elif isinstance(obj, (np.ndarray, np.number)):  # tolist also works on 0d np arrays
-        return obj.tolist()
-    else:
-        return obj
-
-
-def to_numpy(obj):
-    """
-    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a Numpy array.
-    """
-    if isinstance(obj, (dict, UserDict)):
-        return {k: to_numpy(v) for k, v in obj.items()}
-    elif isinstance(obj, (list, tuple)):
-        return np.array(obj)
-    elif is_tf_available() and _is_tensorflow(obj):
-        return obj.numpy()
-    elif is_torch_available() and _is_torch(obj):
-        return obj.detach().cpu().numpy()
-    elif is_flax_available() and _is_jax(obj):
-        return np.asarray(obj)
-    else:
-        return obj
-
-
-class ModelOutput(OrderedDict):
-    """
-    Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a
-    tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular
-    python dictionary.
-
-    <Tip warning={true}>
-
-    You can't unpack a `ModelOutput` directly. Use the [`~file_utils.ModelOutput.to_tuple`] method to convert it to a
-    tuple before.
-
-    </Tip>
-    """
-
-    def __post_init__(self):
-        class_fields = fields(self)
-
-        # Safety and consistency checks
-        if not len(class_fields):
-            raise ValueError(f"{self.__class__.__name__} has no fields.")
-        if not all(field.default is None for field in class_fields[1:]):
-            raise ValueError(f"{self.__class__.__name__} should not have more than one required field.")
-
-        first_field = getattr(self, class_fields[0].name)
-        other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])
-
-        if other_fields_are_none and not is_tensor(first_field):
-            if isinstance(first_field, dict):
-                iterator = first_field.items()
-                first_field_iterator = True
-            else:
-                try:
-                    iterator = iter(first_field)
-                    first_field_iterator = True
-                except TypeError:
-                    first_field_iterator = False
-
-            # if we provided an iterator as first field and the iterator is a (key, value) iterator
-            # set the associated fields
-            if first_field_iterator:
-                for element in iterator:
-                    if (
-                        not isinstance(element, (list, tuple))
-                        or not len(element) == 2
-                        or not isinstance(element[0], str)
-                    ):
-                        break
-                    setattr(self, element[0], element[1])
-                    if element[1] is not None:
-                        self[element[0]] = element[1]
-            elif first_field is not None:
-                self[class_fields[0].name] = first_field
-        else:
-            for field in class_fields:
-                v = getattr(self, field.name)
-                if v is not None:
-                    self[field.name] = v
-
-    def __delitem__(self, *args, **kwargs):
-        raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
-
-    def setdefault(self, *args, **kwargs):
-        raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
-
-    def pop(self, *args, **kwargs):
-        raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
-
-    def update(self, *args, **kwargs):
-        raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
-
-    def __getitem__(self, k):
-        if isinstance(k, str):
-            inner_dict = {k: v for (k, v) in self.items()}
-            return inner_dict[k]
-        else:
-            return self.to_tuple()[k]
-
-    def __setattr__(self, name, value):
-        if name in self.keys() and value is not None:
-            # Don't call self.__setitem__ to avoid recursion errors
-            super().__setitem__(name, value)
-        super().__setattr__(name, value)
-
-    def __setitem__(self, key, value):
-        # Will raise a KeyException if needed
-        super().__setitem__(key, value)
-        # Don't call self.__setattr__ to avoid recursion errors
-        super().__setattr__(key, value)
-
-    def to_tuple(self) -> Tuple[Any]:
-        """
-        Convert self to a tuple containing all the attributes/keys that are not `None`.
-        """
-        return tuple(self[k] for k in self.keys())
-
-
-class ExplicitEnum(Enum):
-    """
-    Enum with more explicit error message for missing values.
-    """
-
-    @classmethod
-    def _missing_(cls, value):
-        raise ValueError(
-            f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
-        )
-
-
-class PaddingStrategy(ExplicitEnum):
-    """
-    Possible values for the `padding` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an
-    IDE.
-    """
-
-    LONGEST = "longest"
-    MAX_LENGTH = "max_length"
-    DO_NOT_PAD = "do_not_pad"
-
-
-class TensorType(ExplicitEnum):
-    """
-    Possible values for the `return_tensors` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for
-    tab-completion in an IDE.
-    """
-
-    PYTORCH = "pt"
-    TENSORFLOW = "tf"
-    NUMPY = "np"
-    JAX = "jax"
-
-
-class _LazyModule(ModuleType):
-    """
-    Module class that surfaces all objects but only performs associated imports when the objects are requested.
-    """
-
-    # Very heavily inspired by optuna.integration._IntegrationModule
-    # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
-    def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None):
-        super().__init__(name)
-        self._modules = set(import_structure.keys())
-        self._class_to_module = {}
-        for key, values in import_structure.items():
-            for value in values:
-                self._class_to_module[value] = key
-        # Needed for autocompletion in an IDE
-        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
-        self.__file__ = module_file
-        self.__spec__ = module_spec
-        self.__path__ = [os.path.dirname(module_file)]
-        self._objects = {} if extra_objects is None else extra_objects
-        self._name = name
-        self._import_structure = import_structure
-
-    # Needed for autocompletion in an IDE
-    def __dir__(self):
-        result = super().__dir__()
-        # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether
-        # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir.
-        for attr in self.__all__:
-            if attr not in result:
-                result.append(attr)
-        return result
-
-    def __getattr__(self, name: str) -> Any:
-        if name in self._objects:
-            return self._objects[name]
-        if name in self._modules:
-            value = self._get_module(name)
-        elif name in self._class_to_module.keys():
-            module = self._get_module(self._class_to_module[name])
-            value = getattr(module, name)
-        else:
-            raise AttributeError(f"module {self.__name__} has no attribute {name}")
-
-        setattr(self, name, value)
-        return value
-
-    def _get_module(self, module_name: str):
-        try:
-            return importlib.import_module("." + module_name, self.__name__)
-        except Exception as e:
-            raise RuntimeError(
-                f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}"
-            ) from e
-
-    def __reduce__(self):
-        return (self.__class__, (self._name, self.__file__, self._import_structure))
-
-
-def copy_func(f):
-    """Returns a copy of a function f."""
-    # Based on http://stackoverflow.com/a/6528148/190597 (Glenn Maynard)
-    g = types.FunctionType(f.__code__, f.__globals__, name=f.__name__, argdefs=f.__defaults__, closure=f.__closure__)
-    g = functools.update_wrapper(g, f)
-    g.__kwdefaults__ = f.__kwdefaults__
-    return g
-
-
-def is_local_clone(repo_path, repo_url):
-    """
-    Checks if the folder in `repo_path` is a local clone of `repo_url`.
-    """
-    # First double-check that `repo_path` is a git repo
-    if not os.path.exists(os.path.join(repo_path, ".git")):
-        return False
-    test_git = subprocess.run("git branch".split(), cwd=repo_path)
-    if test_git.returncode != 0:
-        return False
-
-    # Then look at its remotes
-    remotes = subprocess.run(
-        "git remote -v".split(),
-        stderr=subprocess.PIPE,
-        stdout=subprocess.PIPE,
-        check=True,
-        encoding="utf-8",
-        cwd=repo_path,
-    ).stdout
-
-    return repo_url in remotes.split()
-
-
-class PushToHubMixin:
-    """
-    A Mixin containing the functionality to push a model or tokenizer to the hub.
-    """
-
-    def push_to_hub(
-        self,
-        repo_path_or_name: Optional[str] = None,
-        repo_url: Optional[str] = None,
-        use_temp_dir: bool = False,
-        commit_message: Optional[str] = None,
-        organization: Optional[str] = None,
-        private: Optional[bool] = None,
-        use_auth_token: Optional[Union[bool, str]] = None,
-        **model_card_kwargs
-    ) -> str:
-        """
-        Upload the {object_files} to the 🤗 Model Hub while synchronizing a local clone of the repo in
-        `repo_path_or_name`.
-
-        Parameters:
-            repo_path_or_name (`str`, *optional*):
-                Can either be a repository name for your {object} in the Hub or a path to a local folder (in which case
-                the repository will have the name of that local folder). If not specified, will default to the name
-                given by `repo_url` and a local directory with that name will be created.
-            repo_url (`str`, *optional*):
-                Specify this in case you want to push to an existing repository in the hub. If unspecified, a new
-                repository will be created in your namespace (unless you specify an `organization`) with `repo_name`.
-            use_temp_dir (`bool`, *optional*, defaults to `False`):
-                Whether or not to clone the distant repo in a temporary directory or in `repo_path_or_name` inside the
-                current working directory. This will slow things down if you are making changes in an existing repo
-                since you will need to clone the repo before every push.
-            commit_message (`str`, *optional*):
-                Message to commit while pushing. Will default to `"add {object}"`.
-            organization (`str`, *optional*):
-                Organization in which you want to push your {object} (you must be a member of this organization).
-            private (`bool`, *optional*):
-                Whether or not the repository created should be private (requires a paying subscription).
-            use_auth_token (`bool` or `str`, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-                when running `transformers-cli login` (stored in `~/.huggingface`). Will default to `True` if
-                `repo_url` is not specified.
-
-
-        Returns:
-            `str`: The url of the commit of your {object} in the given repository.
-
-        Examples:
-
-        ```python
-        from transformers import {object_class}
-
-        {object} = {object_class}.from_pretrained("bert-base-cased")
-
-        # Push the {object} to your namespace with the name "my-finetuned-bert" and have a local clone in the
-        # *my-finetuned-bert* folder.
-        {object}.push_to_hub("my-finetuned-bert")
-
-        # Push the {object} to your namespace with the name "my-finetuned-bert" with no local clone.
-        {object}.push_to_hub("my-finetuned-bert", use_temp_dir=True)
-
-        # Push the {object} to an organization with the name "my-finetuned-bert" and have a local clone in the
-        # *my-finetuned-bert* folder.
-        {object}.push_to_hub("my-finetuned-bert", organization="huggingface")
-
-        # Make a change to an existing repo that has been cloned locally in *my-finetuned-bert*.
-        {object}.push_to_hub("my-finetuned-bert", repo_url="https://huggingface.co/sgugger/my-finetuned-bert")
-        ```
-        """
-        if use_temp_dir:
-            # Make sure we use the right `repo_name` for the `repo_url` before replacing it.
-            if repo_url is None:
-                if use_auth_token is None:
-                    use_auth_token = True
-                repo_name = Path(repo_path_or_name).name
-                repo_url = self._get_repo_url_from_name(
-                    repo_name, organization=organization, private=private, use_auth_token=use_auth_token
-                )
-            repo_path_or_name = tempfile.mkdtemp()
-
-        # Create or clone the repo. If the repo is already cloned, this just retrieves the path to the repo.
-        repo = self._create_or_get_repo(
-            repo_path_or_name=repo_path_or_name,
-            repo_url=repo_url,
-            organization=organization,
-            private=private,
-            use_auth_token=use_auth_token,
-        )
-        # Save the files in the cloned repo
-        self.save_pretrained(repo_path_or_name)
-        if hasattr(self, "history") and hasattr(self, "create_model_card"):
-            # This is a Keras model and we might be able to fish out its History and make a model card out of it
-            base_model_card_args = {
-                "output_dir": repo_path_or_name,
-                "model_name": Path(repo_path_or_name).name,
-            }
-            base_model_card_args.update(model_card_kwargs)
-            self.create_model_card(**base_model_card_args)
-        # Commit and push!
-        url = self._push_to_hub(repo, commit_message=commit_message)
-
-        # Clean up! Clean up! Everybody everywhere!
-        if use_temp_dir:
-            shutil.rmtree(repo_path_or_name)
-
-        return url
-
-    @staticmethod
-    def _get_repo_url_from_name(
-        repo_name: str,
-        organization: Optional[str] = None,
-        private: bool = None,
-        use_auth_token: Optional[Union[bool, str]] = None,
-    ) -> str:
-        if isinstance(use_auth_token, str):
-            token = use_auth_token
-        elif use_auth_token:
-            token = HfFolder.get_token()
-            if token is None:
-                raise ValueError(
-                    "You must login to the Hugging Face hub on this computer by typing `transformers-cli login` and "
-                    "entering your credentials to use `use_auth_token=True`. Alternatively, you can pass your own "
-                    "token as the `use_auth_token` argument."
-                )
-        else:
-            token = None
-
-        # Special provision for the test endpoint (CI)
-        return create_repo(
-            token,
-            repo_name,
-            organization=organization,
-            private=private,
-            repo_type=None,
-            exist_ok=True,
-        )
-
-    @classmethod
-    def _create_or_get_repo(
-        cls,
-        repo_path_or_name: Optional[str] = None,
-        repo_url: Optional[str] = None,
-        organization: Optional[str] = None,
-        private: bool = None,
-        use_auth_token: Optional[Union[bool, str]] = None,
-    ) -> Repository:
-        if repo_path_or_name is None and repo_url is None:
-            raise ValueError("You need to specify a `repo_path_or_name` or a `repo_url`.")
-
-        if use_auth_token is None and repo_url is None:
-            use_auth_token = True
-
-        if repo_path_or_name is None:
-            repo_path_or_name = repo_url.split("/")[-1]
-
-        if repo_url is None and not os.path.exists(repo_path_or_name):
-            repo_name = Path(repo_path_or_name).name
-            repo_url = cls._get_repo_url_from_name(
-                repo_name, organization=organization, private=private, use_auth_token=use_auth_token
-            )
-
-        # Create a working directory if it does not exist.
-        if not os.path.exists(repo_path_or_name):
-            os.makedirs(repo_path_or_name)
-
-        repo = Repository(repo_path_or_name, clone_from=repo_url, use_auth_token=use_auth_token)
-        repo.git_pull()
-        return repo
-
-    @classmethod
-    def _push_to_hub(cls, repo: Repository, commit_message: Optional[str] = None) -> str:
-        if commit_message is None:
-            if "Tokenizer" in cls.__name__:
-                commit_message = "add tokenizer"
-            elif "Config" in cls.__name__:
-                commit_message = "add config"
-            else:
-                commit_message = "add model"
-
-        return repo.push_to_hub(commit_message=commit_message)
-
-
-def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
-    if token is None:
-        token = HfFolder.get_token()
-    if organization is None:
-        username = whoami(token)["name"]
-        return f"{username}/{model_id}"
-    else:
-        return f"{organization}/{model_id}"
-
-
-class ContextManagers:
-    """
-    Wrapper for `contextlib.ExitStack` which enters a collection of context managers. Adaptation of `ContextManagers`
-    in the `fastcore` library.
-    """
-
-    def __init__(self, context_managers: List[ContextManager]):
-        self.context_managers = context_managers
-        self.stack = ExitStack()
-
-    def __enter__(self):
-        for context_manager in self.context_managers:
-            self.stack.enter_context(context_manager)
-
-    def __exit__(self, *args, **kwargs):
-        self.stack.__exit__(*args, **kwargs)
diff --git a/src/transformers/generation_beam_search.py b/src/transformers/generation_beam_search.py
index 1980a5efb9..aa9b3ec66c 100644
--- a/src/transformers/generation_beam_search.py
+++ b/src/transformers/generation_beam_search.py
@@ -21,8 +21,8 @@ from typing import List, Optional, Tuple
 import numpy as np
 import torch
 
-from .file_utils import add_start_docstrings
 from .generation_beam_constraints import Constraint, ConstraintListState
+from .utils import add_start_docstrings
 
 
 PROCESS_INPUTS_DOCSTRING = r"""
diff --git a/src/transformers/generation_flax_logits_process.py b/src/transformers/generation_flax_logits_process.py
index 6db4130c12..b41da1b9b2 100644
--- a/src/transformers/generation_flax_logits_process.py
+++ b/src/transformers/generation_flax_logits_process.py
@@ -19,7 +19,7 @@ import jax
 import jax.lax as lax
 import jax.numpy as jnp
 
-from .file_utils import add_start_docstrings
+from .utils import add_start_docstrings
 from .utils.logging import get_logger
 
 
diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py
index 60e931eeec..812a7202f0 100644
--- a/src/transformers/generation_flax_utils.py
+++ b/src/transformers/generation_flax_utils.py
@@ -25,7 +25,6 @@ import jax
 import jax.numpy as jnp
 from jax import lax
 
-from .file_utils import ModelOutput
 from .generation_flax_logits_process import (
     FlaxForcedBOSTokenLogitsProcessor,
     FlaxForcedEOSTokenLogitsProcessor,
@@ -35,7 +34,7 @@ from .generation_flax_logits_process import (
     FlaxTopKLogitsWarper,
     FlaxTopPLogitsWarper,
 )
-from .utils import logging
+from .utils import ModelOutput, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index d3d2183737..1b0762079d 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -20,7 +20,7 @@ from typing import Callable, Iterable, List, Optional, Tuple
 import numpy as np
 import torch
 
-from .file_utils import add_start_docstrings
+from .utils import add_start_docstrings
 from .utils.logging import get_logger
 
 
diff --git a/src/transformers/generation_stopping_criteria.py b/src/transformers/generation_stopping_criteria.py
index 538cf330be..70338aa021 100644
--- a/src/transformers/generation_stopping_criteria.py
+++ b/src/transformers/generation_stopping_criteria.py
@@ -6,7 +6,7 @@ from typing import Optional
 
 import torch
 
-from .file_utils import add_start_docstrings
+from .utils import add_start_docstrings
 
 
 STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
diff --git a/src/transformers/generation_tf_logits_process.py b/src/transformers/generation_tf_logits_process.py
index 271957c1ac..22ece088ae 100644
--- a/src/transformers/generation_tf_logits_process.py
+++ b/src/transformers/generation_tf_logits_process.py
@@ -19,8 +19,8 @@ from typing import List
 import numpy as np
 import tensorflow as tf
 
-from .file_utils import add_start_docstrings
 from .tf_utils import set_tensor_by_indices_to_value
+from .utils import add_start_docstrings
 from .utils.logging import get_logger
 
 
diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index 2a5234004a..95c4b04d32 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -21,7 +21,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import tensorflow as tf
 
-from .file_utils import ModelOutput
 from .generation_tf_logits_process import (
     TFLogitsProcessorList,
     TFMinLengthLogitsProcessor,
@@ -33,7 +32,7 @@ from .generation_tf_logits_process import (
     TFTopPLogitsWarper,
 )
 from .tf_utils import set_tensor_by_indices_to_value, shape_list
-from .utils import logging
+from .utils import ModelOutput, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 6a045a01b1..9618b74ab2 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -23,7 +23,6 @@ import torch
 import torch.distributed as dist
 from torch import nn
 
-from .file_utils import ModelOutput
 from .generation_beam_constraints import Constraint, DisjunctiveConstraint, PhrasalConstraint
 from .generation_beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
 from .generation_logits_process import (
@@ -52,7 +51,7 @@ from .generation_stopping_criteria import (
     validate_stopping_criteria,
 )
 from .pytorch_utils import torch_int_div
-from .utils import logging
+from .utils import ModelOutput, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index 951d682c94..79ee733396 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -22,7 +22,8 @@ import PIL.ImageOps
 
 import requests
 
-from .file_utils import _is_torch, is_torch_available
+from .utils import is_torch_available
+from .utils.generic import _is_torch
 
 
 IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406]
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index ded47f8fd1..d66cd3b41c 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -22,8 +22,7 @@ import sys
 import tempfile
 from pathlib import Path
 
-from .file_utils import is_datasets_available
-from .utils import logging
+from .utils import is_datasets_available, logging
 
 
 logger = logging.get_logger(__name__)
@@ -44,9 +43,9 @@ if _has_comet:
     except (ImportError, ValueError):
         _has_comet = False
 
-from .file_utils import ENV_VARS_TRUE_VALUES, is_torch_tpu_available  # noqa: E402
 from .trainer_callback import ProgressCallback, TrainerCallback  # noqa: E402
 from .trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun, IntervalStrategy  # noqa: E402
+from .utils import ENV_VARS_TRUE_VALUES, is_torch_tpu_available  # noqa: E402
 
 
 # Integration functions:
diff --git a/src/transformers/keras_callbacks.py b/src/transformers/keras_callbacks.py
index 85153dafc8..0f1d96f33b 100644
--- a/src/transformers/keras_callbacks.py
+++ b/src/transformers/keras_callbacks.py
@@ -12,8 +12,8 @@ from tensorflow.keras.callbacks import Callback
 from huggingface_hub import Repository
 
 from . import IntervalStrategy, PreTrainedTokenizerBase
-from .file_utils import get_full_repo_name
 from .modelcard import TrainingSummary
+from .utils import get_full_repo_name
 
 
 logger = logging.getLogger(__name__)
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index 560998a6b2..a4ec857b18 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -28,20 +28,6 @@ import yaml
 from huggingface_hub import model_info
 
 from . import __version__
-from .file_utils import (
-    CONFIG_NAME,
-    MODEL_CARD_NAME,
-    TF2_WEIGHTS_NAME,
-    WEIGHTS_NAME,
-    cached_path,
-    hf_bucket_url,
-    is_datasets_available,
-    is_offline_mode,
-    is_remote_url,
-    is_tf_available,
-    is_tokenizers_available,
-    is_torch_available,
-)
 from .models.auto.modeling_auto import (
     MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
@@ -56,7 +42,21 @@ from .models.auto.modeling_auto import (
     MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
 )
 from .training_args import ParallelMode
-from .utils import logging
+from .utils import (
+    CONFIG_NAME,
+    MODEL_CARD_NAME,
+    TF2_WEIGHTS_NAME,
+    WEIGHTS_NAME,
+    cached_path,
+    hf_bucket_url,
+    is_datasets_available,
+    is_offline_mode,
+    is_remote_url,
+    is_tf_available,
+    is_tokenizers_available,
+    is_torch_available,
+    logging,
+)
 
 
 TASK_MAPPING = {
diff --git a/src/transformers/modeling_flax_outputs.py b/src/transformers/modeling_flax_outputs.py
index b7d4bd0aec..cef722f173 100644
--- a/src/transformers/modeling_flax_outputs.py
+++ b/src/transformers/modeling_flax_outputs.py
@@ -16,7 +16,7 @@ from typing import Dict, Optional, Tuple
 import flax
 import jax.numpy as jnp
 
-from .file_utils import ModelOutput
+from .utils import ModelOutput
 
 
 @flax.struct.dataclass
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index 71cb817651..ee802b550d 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -30,7 +30,9 @@ from requests import HTTPError
 
 from .configuration_utils import PretrainedConfig
 from .dynamic_module_utils import custom_object_save
-from .file_utils import (
+from .generation_flax_utils import FlaxGenerationMixin
+from .modeling_flax_pytorch_utils import load_pytorch_checkpoint_in_flax_state_dict
+from .utils import (
     FLAX_WEIGHTS_NAME,
     WEIGHTS_NAME,
     EntryNotFoundError,
@@ -45,11 +47,9 @@ from .file_utils import (
     hf_bucket_url,
     is_offline_mode,
     is_remote_url,
+    logging,
     replace_return_docstrings,
 )
-from .generation_flax_utils import FlaxGenerationMixin
-from .modeling_flax_pytorch_utils import load_pytorch_checkpoint_in_flax_state_dict
-from .utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index a62ed04a95..cd9d7ce28f 100644
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -17,7 +17,7 @@ from typing import Optional, Tuple
 
 import torch
 
-from .file_utils import ModelOutput
+from .utils import ModelOutput
 
 
 @dataclass
diff --git a/src/transformers/modeling_tf_outputs.py b/src/transformers/modeling_tf_outputs.py
index 5c089d0165..5c74236607 100644
--- a/src/transformers/modeling_tf_outputs.py
+++ b/src/transformers/modeling_tf_outputs.py
@@ -17,7 +17,7 @@ from typing import List, Optional, Tuple
 
 import tensorflow as tf
 
-from .file_utils import ModelOutput
+from .utils import ModelOutput
 
 
 @dataclass
diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py
index 94e442beee..d633c6f5c5 100644
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@@ -21,8 +21,7 @@ import re
 
 import numpy
 
-from .file_utils import ExplicitEnum
-from .utils import logging
+from .utils import ExplicitEnum, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 274d9ebf39..d416237005 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -37,7 +37,11 @@ from requests import HTTPError
 from .activations_tf import get_tf_activation
 from .configuration_utils import PretrainedConfig
 from .dynamic_module_utils import custom_object_save
-from .file_utils import (
+from .generation_tf_utils import TFGenerationMixin
+from .modeling_tf_outputs import TFSeq2SeqLMOutput
+from .tf_utils import shape_list
+from .tokenization_utils_base import BatchEncoding
+from .utils import (
     DUMMY_INPUTS,
     TF2_WEIGHTS_NAME,
     WEIGHTS_NAME,
@@ -52,12 +56,8 @@ from .file_utils import (
     hf_bucket_url,
     is_offline_mode,
     is_remote_url,
+    logging,
 )
-from .generation_tf_utils import TFGenerationMixin
-from .modeling_tf_outputs import TFSeq2SeqLMOutput
-from .tf_utils import shape_list
-from .tokenization_utils_base import BatchEncoding
-from .utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 5ba874d317..6d5e4a2946 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -32,7 +32,8 @@ from .activations import get_activation
 from .configuration_utils import PretrainedConfig
 from .deepspeed import deepspeed_config, is_deepspeed_zero3_enabled
 from .dynamic_module_utils import custom_object_save
-from .file_utils import (
+from .generation_utils import GenerationMixin
+from .utils import (
     DUMMY_INPUTS,
     FLAX_WEIGHTS_NAME,
     TF2_WEIGHTS_NAME,
@@ -49,10 +50,9 @@ from .file_utils import (
     hf_bucket_url,
     is_offline_mode,
     is_remote_url,
+    logging,
     replace_return_docstrings,
 )
-from .generation_utils import GenerationMixin
-from .utils import logging
 from .utils.versions import require_version_core
 
 
diff --git a/src/transformers/models/albert/__init__.py b/src/transformers/models/albert/__init__.py
index 6df6ac17f1..72650c265d 100644
--- a/src/transformers/models/albert/__init__.py
+++ b/src/transformers/models/albert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_flax_available,
     is_sentencepiece_available,
diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py
index a54a3874ad..9cf283c70e 100755
--- a/src/transformers/models/albert/modeling_albert.py
+++ b/src/transformers/models/albert/modeling_albert.py
@@ -25,13 +25,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
@@ -47,7 +40,14 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_albert import AlbertConfig
 
 
diff --git a/src/transformers/models/albert/modeling_flax_albert.py b/src/transformers/models/albert/modeling_flax_albert.py
index c4627d0327..289a72ba1f 100644
--- a/src/transformers/models/albert/modeling_flax_albert.py
+++ b/src/transformers/models/albert/modeling_flax_albert.py
@@ -25,7 +25,6 @@ from flax.core.frozen_dict import FrozenDict
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from ...file_utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPooling,
@@ -42,7 +41,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_albert import AlbertConfig
 
 
diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py
index 95bae2e477..5117d58e16 100644
--- a/src/transformers/models/albert/modeling_tf_albert.py
+++ b/src/transformers/models/albert/modeling_tf_albert.py
@@ -23,14 +23,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPooling,
@@ -53,7 +45,15 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_albert import AlbertConfig
 
 
diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py
index 8deb367851..c2ffcd90b1 100644
--- a/src/transformers/models/albert/tokenization_albert_fast.py
+++ b/src/transformers/models/albert/tokenization_albert_fast.py
@@ -19,10 +19,9 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils import AddedToken
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index d0e23beabb..6dace993cd 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py
index 4964ae6bac..6c2297d9f8 100644
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -18,8 +18,7 @@ from collections import OrderedDict
 
 from ...configuration_utils import PretrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module
-from ...file_utils import copy_func
-from ...utils import logging
+from ...utils import copy_func, logging
 from .configuration_auto import AutoConfig, model_type_to_module_name, replace_list_option_in_docstrings
 
 
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 578c35f60a..69f6b26669 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -21,8 +21,7 @@ from typing import List, Union
 
 from ...configuration_utils import PretrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module
-from ...file_utils import CONFIG_NAME
-from ...utils import logging
+from ...utils import CONFIG_NAME, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index b59da793de..dc83fc133f 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -23,8 +23,7 @@ from typing import Dict, Optional, Union
 from ...configuration_utils import PretrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module
 from ...feature_extraction_utils import FeatureExtractionMixin
-from ...file_utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, get_file_from_repo
-from ...utils import logging
+from ...utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, get_file_from_repo, logging
 from .auto_factory import _LazyAutoMapping
 from .configuration_auto import (
     CONFIG_MAPPING_NAMES,
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 68b846da96..b51ef9ef31 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -22,9 +22,8 @@ from collections import OrderedDict
 from ...configuration_utils import PretrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module
 from ...feature_extraction_utils import FeatureExtractionMixin
-from ...file_utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, get_file_from_repo
 from ...tokenization_utils import TOKENIZER_CONFIG_FILE
-from ...utils import logging
+from ...utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, get_file_from_repo, logging
 from .auto_factory import _LazyAutoMapping
 from .configuration_auto import (
     CONFIG_MAPPING_NAMES,
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index ca6724d339..a604841340 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -22,11 +22,10 @@ from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
 
 from ...configuration_utils import PretrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module
-from ...file_utils import get_file_from_repo, is_sentencepiece_available, is_tokenizers_available
 from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import get_file_from_repo, is_sentencepiece_available, is_tokenizers_available, logging
 from ..encoder_decoder import EncoderDecoderConfig
 from .auto_factory import _LazyAutoMapping
 from .configuration_auto import (
diff --git a/src/transformers/models/bart/__init__.py b/src/transformers/models/bart/__init__.py
index db499e3ce0..9b4ff0d974 100644
--- a/src/transformers/models/bart/__init__.py
+++ b/src/transformers/models/bart/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py
index 4d0ce02ae0..1ad081052c 100644
--- a/src/transformers/models/bart/configuration_bart.py
+++ b/src/transformers/models/bart/configuration_bart.py
@@ -19,10 +19,9 @@ from typing import Any, Mapping, Optional
 
 from ... import PreTrainedTokenizer
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import TensorType, is_torch_available
 from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
 from ...onnx.utils import compute_effective_axis_dimension
-from ...utils import logging
+from ...utils import TensorType, is_torch_available, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index 2b722fd3b4..5dfd14e42a 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -25,13 +25,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -42,7 +35,14 @@ from ...modeling_outputs import (
     Seq2SeqSequenceClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_bart import BartConfig
 
 
diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py
index 2bb00abdd9..80743daa9e 100644
--- a/src/transformers/models/bart/modeling_flax_bart.py
+++ b/src/transformers/models/bart/modeling_flax_bart.py
@@ -30,7 +30,6 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPastAndCrossAttentions,
@@ -47,7 +46,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_bart import BartConfig
 
 
diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py
index bf96f1c210..9e4d87d957 100644
--- a/src/transformers/models/bart/modeling_tf_bart.py
+++ b/src/transformers/models/bart/modeling_tf_bart.py
@@ -22,13 +22,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPastAndCrossAttentions,
@@ -48,7 +41,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_bart import BartConfig
 
 
diff --git a/src/transformers/models/barthez/__init__.py b/src/transformers/models/barthez/__init__.py
index f7ec14c5dd..fcea72b02d 100644
--- a/src/transformers/models/barthez/__init__.py
+++ b/src/transformers/models/barthez/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available, is_tokenizers_available
+from ...utils import _LazyModule, is_sentencepiece_available, is_tokenizers_available
 
 
 _import_structure = {}
diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py
index e5e7b96a3a..b8b6813c1e 100644
--- a/src/transformers/models/barthez/tokenization_barthez_fast.py
+++ b/src/transformers/models/barthez/tokenization_barthez_fast.py
@@ -19,10 +19,9 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils import AddedToken
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/bartpho/__init__.py b/src/transformers/models/bartpho/__init__.py
index e12b418036..a7260368aa 100644
--- a/src/transformers/models/bartpho/__init__.py
+++ b/src/transformers/models/bartpho/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available
+from ...utils import _LazyModule, is_sentencepiece_available
 
 
 _import_structure = {}
diff --git a/src/transformers/models/beit/__init__.py b/src/transformers/models/beit/__init__.py
index 52d5b7e2b9..319fb2880a 100644
--- a/src/transformers/models/beit/__init__.py
+++ b/src/transformers/models/beit/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_flax_available, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/beit/feature_extraction_beit.py b/src/transformers/models/beit/feature_extraction_beit.py
index 2567bbee56..bbf54af266 100644
--- a/src/transformers/models/beit/feature_extraction_beit.py
+++ b/src/transformers/models/beit/feature_extraction_beit.py
@@ -20,7 +20,6 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType
 from ...image_utils import (
     IMAGENET_STANDARD_MEAN,
     IMAGENET_STANDARD_STD,
@@ -28,7 +27,7 @@ from ...image_utils import (
     ImageInput,
     is_torch_tensor,
 )
-from ...utils import logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py
index d699af419a..68584476d1 100755
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@@ -26,12 +26,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
@@ -40,7 +34,13 @@ from ...modeling_outputs import (
     SequenceClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_beit import BeitConfig
 
 
diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py
index 1ba50f8758..ab7eff4d63 100644
--- a/src/transformers/models/beit/modeling_flax_beit.py
+++ b/src/transformers/models/beit/modeling_flax_beit.py
@@ -25,7 +25,6 @@ import jax.numpy as jnp
 from flax.core.frozen_dict import FrozenDict
 from flax.linen.attention import dot_product_attention_weights
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPooling,
@@ -38,6 +37,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from .configuration_beit import BeitConfig
 
 
diff --git a/src/transformers/models/bert/__init__.py b/src/transformers/models/bert/__init__.py
index 8f3ea0336f..1724d406ea 100644
--- a/src/transformers/models/bert/__init__.py
+++ b/src/transformers/models/bert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py
index 6e942ebcf3..81642e8a59 100755
--- a/src/transformers/models/bert/modeling_bert.py
+++ b/src/transformers/models/bert/modeling_bert.py
@@ -29,13 +29,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -53,7 +46,14 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_bert import BertConfig
 
 
diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py
index 03f7309864..98dfd7b7a4 100644
--- a/src/transformers/models/bert/modeling_flax_bert.py
+++ b/src/transformers/models/bert/modeling_flax_bert.py
@@ -25,7 +25,6 @@ from flax.core.frozen_dict import FrozenDict
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from ...file_utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPooling,
@@ -43,7 +42,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_bert import BertConfig
 
 
diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py
index 5aaebaea5b..cc9770e63e 100644
--- a/src/transformers/models/bert/modeling_tf_bert.py
+++ b/src/transformers/models/bert/modeling_tf_bert.py
@@ -24,15 +24,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    DUMMY_INPUTS,
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutputWithPastAndCrossAttentions,
     TFBaseModelOutputWithPoolingAndCrossAttentions,
@@ -59,7 +50,16 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    DUMMY_INPUTS,
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_bert import BertConfig
 
 
diff --git a/src/transformers/models/bert_generation/__init__.py b/src/transformers/models/bert_generation/__init__.py
index 32219a9649..980f8cd7a4 100644
--- a/src/transformers/models/bert_generation/__init__.py
+++ b/src/transformers/models/bert_generation/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available, is_torch_available
+from ...utils import _LazyModule, is_sentencepiece_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py
index 631d73784b..2123faee5c 100755
--- a/src/transformers/models/bert_generation/modeling_bert_generation.py
+++ b/src/transformers/models/bert_generation/modeling_bert_generation.py
@@ -20,15 +20,15 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import CrossEntropyLoss
 
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
 from ..bert.modeling_bert import BertEncoder
 from .configuration_bert_generation import BertGenerationConfig
 
diff --git a/src/transformers/models/bert_japanese/__init__.py b/src/transformers/models/bert_japanese/__init__.py
index 0026a5803d..78d70a7f6e 100644
--- a/src/transformers/models/bert_japanese/__init__.py
+++ b/src/transformers/models/bert_japanese/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule
+from ...utils import _LazyModule
 
 
 _import_structure = {
diff --git a/src/transformers/models/bertweet/__init__.py b/src/transformers/models/bertweet/__init__.py
index 675031d9fb..c9140bd49d 100644
--- a/src/transformers/models/bertweet/__init__.py
+++ b/src/transformers/models/bertweet/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule
+from ...utils import _LazyModule
 
 
 _import_structure = {
diff --git a/src/transformers/models/big_bird/__init__.py b/src/transformers/models/big_bird/__init__.py
index cdff90c8da..ec7d2a4c55 100644
--- a/src/transformers/models/big_bird/__init__.py
+++ b/src/transformers/models/big_bird/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_flax_available,
     is_sentencepiece_available,
diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py
index 67e1c1f908..88d055619c 100755
--- a/src/transformers/models/big_bird/modeling_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_big_bird.py
@@ -28,13 +28,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -45,7 +38,14 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_big_bird import BigBirdConfig
 
 
diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py
index d23d25494e..d9ec686e85 100644
--- a/src/transformers/models/big_bird/modeling_flax_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py
@@ -25,7 +25,6 @@ from flax.core.frozen_dict import FrozenDict
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from ...file_utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPooling,
@@ -41,7 +40,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_big_bird import BigBirdConfig
 
 
diff --git a/src/transformers/models/big_bird/tokenization_big_bird_fast.py b/src/transformers/models/big_bird/tokenization_big_bird_fast.py
index 3d0aa94627..c645fb0059 100644
--- a/src/transformers/models/big_bird/tokenization_big_bird_fast.py
+++ b/src/transformers/models/big_bird/tokenization_big_bird_fast.py
@@ -19,10 +19,9 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils import AddedToken
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/bigbird_pegasus/__init__.py b/src/transformers/models/bigbird_pegasus/__init__.py
index 1cee0664ff..fd5b3671e2 100644
--- a/src/transformers/models/bigbird_pegasus/__init__.py
+++ b/src/transformers/models/bigbird_pegasus/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
index dc4a099ba2..2f624d362b 100755
--- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
+++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@@ -26,13 +26,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -43,7 +36,14 @@ from ...modeling_outputs import (
     Seq2SeqSequenceClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_bigbird_pegasus import BigBirdPegasusConfig
 
 
diff --git a/src/transformers/models/blenderbot/__init__.py b/src/transformers/models/blenderbot/__init__.py
index ec04b1c076..fcf36b69a9 100644
--- a/src/transformers/models/blenderbot/__init__.py
+++ b/src/transformers/models/blenderbot/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py
index cdaa21521c..0984a842aa 100755
--- a/src/transformers/models/blenderbot/modeling_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_blenderbot.py
@@ -28,12 +28,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -42,7 +36,13 @@ from ...modeling_outputs import (
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from ..blenderbot_small import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel
 from .configuration_blenderbot import BlenderbotConfig
 
diff --git a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
index cf0730f45b..682ed9dfa5 100644
--- a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
@@ -30,7 +30,6 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPastAndCrossAttentions,
@@ -45,7 +44,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_blenderbot import BlenderbotConfig
 
 
diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py
index 34a5f972ab..347d83b649 100644
--- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py
@@ -23,13 +23,6 @@ from typing import List, Optional, Tuple, Union
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPastAndCrossAttentions,
@@ -48,7 +41,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_blenderbot import BlenderbotConfig
 
 
diff --git a/src/transformers/models/blenderbot_small/__init__.py b/src/transformers/models/blenderbot_small/__init__.py
index 0f5a2a5bfb..56c1aa53f7 100644
--- a/src/transformers/models/blenderbot_small/__init__.py
+++ b/src/transformers/models/blenderbot_small/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
index 8a6c863e63..81a8574604 100755
--- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
@@ -26,12 +26,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -40,7 +34,13 @@ from ...modeling_outputs import (
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_blenderbot_small import BlenderbotSmallConfig
 
 
diff --git a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
index bad2ca5e65..dc3155535e 100644
--- a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
@@ -31,7 +31,6 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, replace_return_docstrings
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPastAndCrossAttentions,
@@ -46,7 +45,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import add_start_docstrings, logging, replace_return_docstrings
 from .configuration_blenderbot_small import BlenderbotSmallConfig
 
 
diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py
index 69d6233f02..5d3ac0cea7 100644
--- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py
@@ -22,13 +22,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPastAndCrossAttentions,
@@ -47,7 +40,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_blenderbot_small import BlenderbotSmallConfig
 
 
diff --git a/src/transformers/models/byt5/__init__.py b/src/transformers/models/byt5/__init__.py
index 83ea9bba61..ec9a03212f 100644
--- a/src/transformers/models/byt5/__init__.py
+++ b/src/transformers/models/byt5/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule
+from ...utils import _LazyModule
 
 
 _import_structure = {
diff --git a/src/transformers/models/camembert/__init__.py b/src/transformers/models/camembert/__init__.py
index 7be6e7fbb6..fccb4c49c9 100644
--- a/src/transformers/models/camembert/__init__.py
+++ b/src/transformers/models/camembert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_sentencepiece_available,
     is_tf_available,
diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py
index b1a10319d6..8d38e7c662 100644
--- a/src/transformers/models/camembert/modeling_camembert.py
+++ b/src/transformers/models/camembert/modeling_camembert.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """PyTorch CamemBERT model."""
 
-from ...file_utils import add_start_docstrings
-from ...utils import logging
+from ...utils import add_start_docstrings, logging
 from ..roberta.modeling_roberta import (
     RobertaForCausalLM,
     RobertaForMaskedLM,
diff --git a/src/transformers/models/camembert/modeling_tf_camembert.py b/src/transformers/models/camembert/modeling_tf_camembert.py
index 9e04d95be6..b773bb761d 100644
--- a/src/transformers/models/camembert/modeling_tf_camembert.py
+++ b/src/transformers/models/camembert/modeling_tf_camembert.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """ TF 2.0 CamemBERT model."""
 
-from ...file_utils import add_start_docstrings
-from ...utils import logging
+from ...utils import add_start_docstrings, logging
 from ..roberta.modeling_tf_roberta import (
     TFRobertaForCausalLM,
     TFRobertaForMaskedLM,
diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py
index d9dfa5f417..cbffa75a28 100644
--- a/src/transformers/models/camembert/tokenization_camembert_fast.py
+++ b/src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -19,10 +19,9 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils import AddedToken
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/canine/__init__.py b/src/transformers/models/canine/__init__.py
index 1773dbf0ac..1d24a01b14 100644
--- a/src/transformers/models/canine/__init__.py
+++ b/src/transformers/models/canine/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py
index 2e0c05f3eb..3a707ff05a 100644
--- a/src/transformers/models/canine/modeling_canine.py
+++ b/src/transformers/models/canine/modeling_canine.py
@@ -27,7 +27,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutput,
     ModelOutput,
@@ -42,7 +41,7 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_canine import CanineConfig
 
 
diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py
index 9c0e8d6267..67e6841e6d 100644
--- a/src/transformers/models/clip/__init__.py
+++ b/src/transformers/models/clip/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_flax_available,
     is_tf_available,
diff --git a/src/transformers/models/clip/feature_extraction_clip.py b/src/transformers/models/clip/feature_extraction_clip.py
index a480ce4580..99ace19c68 100644
--- a/src/transformers/models/clip/feature_extraction_clip.py
+++ b/src/transformers/models/clip/feature_extraction_clip.py
@@ -20,9 +20,8 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType
 from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor
-from ...utils import logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index cf13926ebd..a5d556ff5e 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -23,15 +23,15 @@ import torch.utils.checkpoint
 from torch import nn
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
 from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 
 
diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py
index 237d29fe71..5ceecafd6f 100644
--- a/src/transformers/models/clip/modeling_flax_clip.py
+++ b/src/transformers/models/clip/modeling_flax_clip.py
@@ -24,7 +24,6 @@ from flax.linen import combine_masks, make_causal_mask
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from ...file_utils import ModelOutput, add_start_docstrings
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPooling
 from ...modeling_flax_utils import (
     ACT2FN,
@@ -32,7 +31,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import ModelOutput, add_start_docstrings, logging
 from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 
 
diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py
index eac593fb54..1031eb8067 100644
--- a/src/transformers/models/clip/modeling_tf_clip.py
+++ b/src/transformers/models/clip/modeling_tf_clip.py
@@ -23,12 +23,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
 
 # Public API
@@ -41,7 +35,13 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
 
 
diff --git a/src/transformers/models/convbert/__init__.py b/src/transformers/models/convbert/__init__.py
index 4195dda68a..77665acfea 100644
--- a/src/transformers/models/convbert/__init__.py
+++ b/src/transformers/models/convbert/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py
index 5a579722d3..d187b69978 100755
--- a/src/transformers/models/convbert/modeling_convbert.py
+++ b/src/transformers/models/convbert/modeling_convbert.py
@@ -26,7 +26,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN, get_activation
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutputWithCrossAttentions,
     MaskedLMOutput,
@@ -42,7 +41,7 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_convbert import ConvBertConfig
 
 
diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py
index 0c4d265dcd..dcd3c6ca0a 100644
--- a/src/transformers/models/convbert/modeling_tf_convbert.py
+++ b/src/transformers/models/convbert/modeling_tf_convbert.py
@@ -18,12 +18,6 @@
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFMaskedLMOutput,
@@ -45,7 +39,13 @@ from ...modeling_tf_utils import (
     keras_serializable,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
 from .configuration_convbert import ConvBertConfig
 
 
diff --git a/src/transformers/models/convnext/__init__.py b/src/transformers/models/convnext/__init__.py
index a627c462e9..b488da389f 100644
--- a/src/transformers/models/convnext/__init__.py
+++ b/src/transformers/models/convnext/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_tf_available, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_tf_available, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/convnext/feature_extraction_convnext.py b/src/transformers/models/convnext/feature_extraction_convnext.py
index 860bda96b6..89336b3ed9 100644
--- a/src/transformers/models/convnext/feature_extraction_convnext.py
+++ b/src/transformers/models/convnext/feature_extraction_convnext.py
@@ -20,7 +20,6 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
@@ -28,7 +27,7 @@ from ...image_utils import (
     ImageInput,
     is_torch_tensor,
 )
-from ...utils import logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/convnext/modeling_convnext.py b/src/transformers/models/convnext/modeling_convnext.py
index 3d53a8fe72..78f177aed6 100755
--- a/src/transformers/models/convnext/modeling_convnext.py
+++ b/src/transformers/models/convnext/modeling_convnext.py
@@ -23,14 +23,14 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
 )
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
 from .configuration_convnext import ConvNextConfig
 
 
diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py
index 957811ae9e..712c91a8cf 100644
--- a/src/transformers/models/convnext/modeling_tf_convnext.py
+++ b/src/transformers/models/convnext/modeling_tf_convnext.py
@@ -21,7 +21,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput
 from ...modeling_tf_utils import (
     TFModelInputType,
@@ -31,7 +30,7 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_convnext import ConvNextConfig
 
 
diff --git a/src/transformers/models/cpm/__init__.py b/src/transformers/models/cpm/__init__.py
index e160402e24..06f8a91261 100644
--- a/src/transformers/models/cpm/__init__.py
+++ b/src/transformers/models/cpm/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available, is_tokenizers_available
+from ...utils import _LazyModule, is_sentencepiece_available, is_tokenizers_available
 
 
 _import_structure = {}
diff --git a/src/transformers/models/ctrl/__init__.py b/src/transformers/models/ctrl/__init__.py
index fafc6f0c7d..efd342331f 100644
--- a/src/transformers/models/ctrl/__init__.py
+++ b/src/transformers/models/ctrl/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py
index 97d73d8162..4c2d9c3ef4 100644
--- a/src/transformers/models/ctrl/modeling_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_ctrl.py
@@ -22,10 +22,9 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput
 from ...modeling_utils import Conv1D, PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_ctrl import CTRLConfig
 
 
diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py
index 4eee0d3695..a6848ae0e0 100644
--- a/src/transformers/models/ctrl/modeling_tf_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py
@@ -21,7 +21,6 @@ from typing import Tuple
 import numpy as np
 import tensorflow as tf
 
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast, TFSequenceClassifierOutput
 from ...modeling_tf_utils import (
     TFCausalLanguageModelingLoss,
@@ -33,7 +32,7 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_ctrl import CTRLConfig
 
 
diff --git a/src/transformers/models/data2vec/__init__.py b/src/transformers/models/data2vec/__init__.py
index d67c90a2d1..d93c64d1e3 100644
--- a/src/transformers/models/data2vec/__init__.py
+++ b/src/transformers/models/data2vec/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py
index d23c290ea7..9891d4ddbb 100755
--- a/src/transformers/models/data2vec/modeling_data2vec_audio.py
+++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py
@@ -27,16 +27,16 @@ from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import torch_int_div
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
 )
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
-from ...utils import logging
 from .configuration_data2vec_audio import Data2VecAudioConfig
 
 
diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py
index df43db2c7b..9759848b6b 100644
--- a/src/transformers/models/data2vec/modeling_data2vec_text.py
+++ b/src/transformers/models/data2vec/modeling_data2vec_text.py
@@ -24,12 +24,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN, gelu
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -46,7 +40,13 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_data2vec_text import Data2VecTextConfig
 
 
diff --git a/src/transformers/models/deberta/__init__.py b/src/transformers/models/deberta/__init__.py
index 1126704b9e..38a3a22d51 100644
--- a/src/transformers/models/deberta/__init__.py
+++ b/src/transformers/models/deberta/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py
index 8243353e4c..b9b4bb6f7b 100644
--- a/src/transformers/models/deberta/modeling_deberta.py
+++ b/src/transformers/models/deberta/modeling_deberta.py
@@ -23,7 +23,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutput,
     MaskedLMOutput,
@@ -33,7 +32,7 @@ from ...modeling_outputs import (
 )
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import softmax_backward_data
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_deberta import DebertaConfig
 
 
diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py
index 86262bb80d..4cc98f320b 100644
--- a/src/transformers/models/deberta/modeling_tf_deberta.py
+++ b/src/transformers/models/deberta/modeling_tf_deberta.py
@@ -22,7 +22,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFMaskedLMOutput,
@@ -41,7 +40,7 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_deberta import DebertaConfig
 
 
diff --git a/src/transformers/models/deberta_v2/__init__.py b/src/transformers/models/deberta_v2/__init__.py
index 1b2cb9e335..1178e41301 100644
--- a/src/transformers/models/deberta_v2/__init__.py
+++ b/src/transformers/models/deberta_v2/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
index 2aee35fdd9..5185095647 100644
--- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -24,7 +24,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutput,
     MaskedLMOutput,
@@ -34,7 +33,7 @@ from ...modeling_outputs import (
 )
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import softmax_backward_data
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_deberta_v2 import DebertaV2Config
 
 
diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
index 314575ea3b..cfba4923ba 100644
--- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py
@@ -21,7 +21,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFMaskedLMOutput,
@@ -40,7 +39,7 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_deberta_v2 import DebertaV2Config
 
 
diff --git a/src/transformers/models/deit/__init__.py b/src/transformers/models/deit/__init__.py
index a9fddc5af8..bcded61602 100644
--- a/src/transformers/models/deit/__init__.py
+++ b/src/transformers/models/deit/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/deit/feature_extraction_deit.py b/src/transformers/models/deit/feature_extraction_deit.py
index acb83a3c57..d5fb9fa768 100644
--- a/src/transformers/models/deit/feature_extraction_deit.py
+++ b/src/transformers/models/deit/feature_extraction_deit.py
@@ -20,7 +20,6 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
@@ -28,7 +27,7 @@ from ...image_utils import (
     ImageInput,
     is_torch_tensor,
 )
-from ...utils import logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py
index 91b673ff6f..f5443ed4c0 100644
--- a/src/transformers/models/deit/modeling_deit.py
+++ b/src/transformers/models/deit/modeling_deit.py
@@ -26,16 +26,16 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput
+from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput
-from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
 from .configuration_deit import DeiTConfig
 
 
diff --git a/src/transformers/models/detr/__init__.py b/src/transformers/models/detr/__init__.py
index bbde97ccbc..c716512824 100644
--- a/src/transformers/models/detr/__init__.py
+++ b/src/transformers/models/detr/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_timm_available, is_vision_available
+from ...utils import _LazyModule, is_timm_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py
index 2a846fc763..ff69c8430e 100644
--- a/src/transformers/models/detr/feature_extraction_detr.py
+++ b/src/transformers/models/detr/feature_extraction_detr.py
@@ -23,9 +23,8 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType, is_torch_available
 from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor
-from ...utils import logging
+from ...utils import TensorType, is_torch_available, logging
 
 
 if is_torch_available():
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index f1a0e6301e..54ce7fc8ce 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -24,20 +24,20 @@ import torch
 from torch import Tensor, nn
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import torch_int_div
+from ...utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_scipy_available,
     is_timm_available,
     is_vision_available,
+    logging,
     replace_return_docstrings,
     requires_backends,
 )
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
-from ...utils import logging
 from .configuration_detr import DetrConfig
 
 
diff --git a/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
index f588a2fde8..fbf3401292 100644
--- a/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/dialogpt/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
@@ -17,7 +17,7 @@ import os
 
 import torch
 
-from transformers.file_utils import WEIGHTS_NAME
+from transformers.utils import WEIGHTS_NAME
 
 
 DIALOGPT_MODELS = ["small", "medium", "large"]
diff --git a/src/transformers/models/distilbert/__init__.py b/src/transformers/models/distilbert/__init__.py
index a6215432af..fd2e7e6a2d 100644
--- a/src/transformers/models/distilbert/__init__.py
+++ b/src/transformers/models/distilbert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py
index 9d4961b9ac..76869eda88 100755
--- a/src/transformers/models/distilbert/modeling_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_distilbert.py
@@ -31,12 +31,6 @@ from transformers.configuration_utils import PretrainedConfig
 
 from ...activations import get_activation
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     MaskedLMOutput,
@@ -51,7 +45,13 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_distilbert import DistilBertConfig
 
 
diff --git a/src/transformers/models/distilbert/modeling_flax_distilbert.py b/src/transformers/models/distilbert/modeling_flax_distilbert.py
index fed8a64d9b..48a486f3cd 100644
--- a/src/transformers/models/distilbert/modeling_flax_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_flax_distilbert.py
@@ -24,7 +24,6 @@ import jax.numpy as jnp
 from flax.core.frozen_dict import FrozenDict
 from jax import lax
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxMaskedLMOutput,
@@ -34,7 +33,7 @@ from ...modeling_flax_outputs import (
     FlaxTokenClassifierOutput,
 )
 from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_distilbert import DistilBertConfig
 
 
diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py
index 2b0ccf2f80..1bd9ee92f3 100644
--- a/src/transformers/models/distilbert/modeling_tf_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py
@@ -23,12 +23,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFMaskedLMOutput,
@@ -50,7 +44,13 @@ from ...modeling_tf_utils import (
     keras_serializable,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
 from .configuration_distilbert import DistilBertConfig
 
 
diff --git a/src/transformers/models/dpr/__init__.py b/src/transformers/models/dpr/__init__.py
index 4c3074543f..5ee5962258 100644
--- a/src/transformers/models/dpr/__init__.py
+++ b/src/transformers/models/dpr/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py
index b2dde6a4e8..6b1fd35bb6 100644
--- a/src/transformers/models/dpr/modeling_dpr.py
+++ b/src/transformers/models/dpr/modeling_dpr.py
@@ -21,15 +21,15 @@ from typing import Optional, Tuple, Union
 import torch
 from torch import Tensor, nn
 
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutputWithPooling
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutputWithPooling
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
 from ..bert.modeling_bert import BertEncoder, BertModel
 from .configuration_dpr import DPRConfig
 
diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py
index 899ebda6c1..a47d975620 100644
--- a/src/transformers/models/dpr/modeling_tf_dpr.py
+++ b/src/transformers/models/dpr/modeling_tf_dpr.py
@@ -20,15 +20,15 @@ from typing import Optional, Tuple, Union
 
 import tensorflow as tf
 
-from ...file_utils import (
+from ...modeling_tf_outputs import TFBaseModelOutputWithPooling
+from ...modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list, unpack_inputs
+from ...utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_tf_outputs import TFBaseModelOutputWithPooling
-from ...modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list, unpack_inputs
-from ...utils import logging
 from ..bert.modeling_tf_bert import TFBertMainLayer
 from .configuration_dpr import DPRConfig
 
diff --git a/src/transformers/models/dpr/tokenization_dpr.py b/src/transformers/models/dpr/tokenization_dpr.py
index 9ce7b81be8..b47776f057 100644
--- a/src/transformers/models/dpr/tokenization_dpr.py
+++ b/src/transformers/models/dpr/tokenization_dpr.py
@@ -18,9 +18,8 @@
 import collections
 from typing import List, Optional, Union
 
-from ...file_utils import TensorType, add_end_docstrings, add_start_docstrings
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import logging
+from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging
 from ..bert.tokenization_bert import BertTokenizer
 
 
diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py
index 8ea21aeea1..1b9e9c93fb 100644
--- a/src/transformers/models/dpr/tokenization_dpr_fast.py
+++ b/src/transformers/models/dpr/tokenization_dpr_fast.py
@@ -18,9 +18,8 @@
 import collections
 from typing import List, Optional, Union
 
-from ...file_utils import TensorType, add_end_docstrings, add_start_docstrings
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import logging
+from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging
 from ..bert.tokenization_bert_fast import BertTokenizerFast
 from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer
 
diff --git a/src/transformers/models/electra/__init__.py b/src/transformers/models/electra/__init__.py
index a8f1ec5db7..ad818da549 100644
--- a/src/transformers/models/electra/__init__.py
+++ b/src/transformers/models/electra/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py
index faa68917dd..740edb309f 100644
--- a/src/transformers/models/electra/modeling_electra.py
+++ b/src/transformers/models/electra/modeling_electra.py
@@ -26,13 +26,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN, get_activation
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithCrossAttentions,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -50,7 +43,14 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_electra import ElectraConfig
 
 
diff --git a/src/transformers/models/electra/modeling_flax_electra.py b/src/transformers/models/electra/modeling_flax_electra.py
index 7f277ede35..de775e1136 100644
--- a/src/transformers/models/electra/modeling_flax_electra.py
+++ b/src/transformers/models/electra/modeling_flax_electra.py
@@ -26,7 +26,6 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxMaskedLMOutput,
@@ -42,7 +41,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_electra import ElectraConfig
 
 
diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py
index ba10091237..f442d7de48 100644
--- a/src/transformers/models/electra/modeling_tf_electra.py
+++ b/src/transformers/models/electra/modeling_tf_electra.py
@@ -23,15 +23,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    DUMMY_INPUTS,
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutputWithPastAndCrossAttentions,
     TFMaskedLMOutput,
@@ -54,7 +45,16 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    DUMMY_INPUTS,
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_electra import ElectraConfig
 
 
diff --git a/src/transformers/models/encoder_decoder/__init__.py b/src/transformers/models/encoder_decoder/__init__.py
index 400ef62282..2e1e7e1eb6 100644
--- a/src/transformers/models/encoder_decoder/__init__.py
+++ b/src/transformers/models/encoder_decoder/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
index 14c751e3bb..1a5c17af02 100644
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -22,10 +22,9 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import Seq2SeqLMOutput
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM
 from .configuration_encoder_decoder import EncoderDecoderConfig
diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
index efde18e13c..9b07cddd36 100644
--- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -25,10 +25,9 @@ from flax.core.frozen_dict import FrozenDict, unfreeze
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput
 from ...modeling_flax_utils import FlaxPreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM
 from .configuration_encoder_decoder import EncoderDecoderConfig
diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
index 07d8e812f2..831c47bd41 100644
--- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -22,17 +22,17 @@ from typing import Optional
 import tensorflow as tf
 
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import (
+from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput
+from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, input_processing
+from ...tf_utils import shape_list
+from ...utils import (
     DUMMY_INPUTS,
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput
-from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, input_processing
-from ...tf_utils import shape_list
-from ...utils import logging
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM
 from .configuration_encoder_decoder import EncoderDecoderConfig
diff --git a/src/transformers/models/flaubert/__init__.py b/src/transformers/models/flaubert/__init__.py
index a2b6ed1ca8..fa4e31eeb6 100644
--- a/src/transformers/models/flaubert/__init__.py
+++ b/src/transformers/models/flaubert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py
index c107042b0c..35f72e9b35 100644
--- a/src/transformers/models/flaubert/modeling_flaubert.py
+++ b/src/transformers/models/flaubert/modeling_flaubert.py
@@ -22,9 +22,8 @@ import torch
 from packaging import version
 from torch import nn
 
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import BaseModelOutput
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from ..xlm.modeling_xlm import (
     XLMForMultipleChoice,
     XLMForQuestionAnswering,
diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py
index a425bcfd52..427291aad2 100644
--- a/src/transformers/models/flaubert/modeling_tf_flaubert.py
+++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py
@@ -26,12 +26,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
 from ...modeling_tf_outputs import TFBaseModelOutput
 from ...modeling_tf_utils import (
     TFPreTrainedModel,
@@ -41,7 +35,13 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
 from ..xlm.modeling_tf_xlm import (
     TFXLMForMultipleChoice,
     TFXLMForQuestionAnsweringSimple,
diff --git a/src/transformers/models/fnet/__init__.py b/src/transformers/models/fnet/__init__.py
index 8a163e19be..7b09e97ab1 100644
--- a/src/transformers/models/fnet/__init__.py
+++ b/src/transformers/models/fnet/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py
index 3906bdfdf2..e290373366 100755
--- a/src/transformers/models/fnet/modeling_fnet.py
+++ b/src/transformers/models/fnet/modeling_fnet.py
@@ -25,19 +25,13 @@ from packaging import version
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...file_utils import is_scipy_available
+from ...utils import is_scipy_available
 
 
 if is_scipy_available():
     from scipy import linalg
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
@@ -50,7 +44,13 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_fnet import FNetConfig
 
 
diff --git a/src/transformers/models/fnet/tokenization_fnet_fast.py b/src/transformers/models/fnet/tokenization_fnet_fast.py
index 0c523a2259..7cbd339c8b 100644
--- a/src/transformers/models/fnet/tokenization_fnet_fast.py
+++ b/src/transformers/models/fnet/tokenization_fnet_fast.py
@@ -19,10 +19,9 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils import AddedToken
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/fsmt/__init__.py b/src/transformers/models/fsmt/__init__.py
index 9e633dbc7f..034c2c8d2a 100644
--- a/src/transformers/models/fsmt/__init__.py
+++ b/src/transformers/models/fsmt/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
index 2470492ac7..7257f7faa2 100755
--- a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py
@@ -32,10 +32,9 @@ from fairseq import hub_utils
 from fairseq.data.dictionary import Dictionary
 
 from transformers import FSMTConfig, FSMTForConditionalGeneration
-from transformers.file_utils import WEIGHTS_NAME
 from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES
 from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
-from transformers.utils import logging
+from transformers.utils import WEIGHTS_NAME, logging
 
 
 logging.set_verbosity_warning()
diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py
index 90289747a8..9e55c2937e 100644
--- a/src/transformers/models/fsmt/modeling_fsmt.py
+++ b/src/transformers/models/fsmt/modeling_fsmt.py
@@ -37,13 +37,6 @@ from torch.nn import CrossEntropyLoss, LayerNorm
 
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -51,7 +44,14 @@ from ...modeling_outputs import (
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_fsmt import FSMTConfig
 
 
diff --git a/src/transformers/models/funnel/__init__.py b/src/transformers/models/funnel/__init__.py
index e520acae4f..b9c6b9608d 100644
--- a/src/transformers/models/funnel/__init__.py
+++ b/src/transformers/models/funnel/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py
index 429d1f1e18..d8e844d4ea 100644
--- a/src/transformers/models/funnel/modeling_funnel.py
+++ b/src/transformers/models/funnel/modeling_funnel.py
@@ -24,13 +24,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     MaskedLMOutput,
@@ -40,7 +33,14 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_funnel import FunnelConfig
 
 
diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py
index cbd61fc451..fe8a81f9bf 100644
--- a/src/transformers/models/funnel/modeling_tf_funnel.py
+++ b/src/transformers/models/funnel/modeling_tf_funnel.py
@@ -22,14 +22,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFMaskedLMOutput,
@@ -51,7 +43,15 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_funnel import FunnelConfig
 
 
diff --git a/src/transformers/models/glpn/__init__.py b/src/transformers/models/glpn/__init__.py
index 22fe3a7592..e758224d7d 100644
--- a/src/transformers/models/glpn/__init__.py
+++ b/src/transformers/models/glpn/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/glpn/feature_extraction_glpn.py b/src/transformers/models/glpn/feature_extraction_glpn.py
index a729cce3f2..d0205be9e3 100644
--- a/src/transformers/models/glpn/feature_extraction_glpn.py
+++ b/src/transformers/models/glpn/feature_extraction_glpn.py
@@ -20,9 +20,8 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType
 from ...image_utils import ImageFeatureExtractionMixin, ImageInput, is_torch_tensor
-from ...utils import logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/glpn/modeling_glpn.py b/src/transformers/models/glpn/modeling_glpn.py
index 0eee27a286..9b0429bb35 100755
--- a/src/transformers/models/glpn/modeling_glpn.py
+++ b/src/transformers/models/glpn/modeling_glpn.py
@@ -23,15 +23,15 @@ import torch.utils.checkpoint
 from torch import nn
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput
+from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput
-from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
 from .configuration_glpn import GLPNConfig
 
 
diff --git a/src/transformers/models/gpt2/__init__.py b/src/transformers/models/gpt2/__init__.py
index d439d4bc39..a5d5920f88 100644
--- a/src/transformers/models/gpt2/__init__.py
+++ b/src/transformers/models/gpt2/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py
index 7bc720fa88..4d8b465afa 100755
--- a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py
@@ -20,8 +20,7 @@ import argparse
 import torch
 
 from transformers import GPT2Config, GPT2Model, load_tf_weights_in_gpt2
-from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME
-from transformers.utils import logging
+from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
 
 
 logging.set_verbosity_info()
diff --git a/src/transformers/models/gpt2/modeling_flax_gpt2.py b/src/transformers/models/gpt2/modeling_flax_gpt2.py
index c7867a6400..1a1f5cc1a5 100644
--- a/src/transformers/models/gpt2/modeling_flax_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_flax_gpt2.py
@@ -23,13 +23,12 @@ from flax.linen import combine_masks, make_causal_mask
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutputWithPastAndCrossAttentions,
     FlaxCausalLMOutputWithCrossAttentions,
 )
 from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_gpt2 import GPT2Config
 
 
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index 7f477c1da0..8fa770b53c 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -34,13 +34,6 @@ else:
     is_amp_available = False
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
@@ -54,7 +47,14 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_conv1d_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from ...utils.model_parallel_utils import assert_device_map, get_device_map
 from .configuration_gpt2 import GPT2Config
 
diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py
index 23705dc12d..b5ed0c77eb 100644
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -23,14 +23,6 @@ import tensorflow as tf
 from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    DUMMY_INPUTS,
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutputWithPastAndCrossAttentions,
     TFCausalLMOutputWithCrossAttentions,
@@ -49,7 +41,15 @@ from ...modeling_tf_utils import (
     keras_serializable,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_gpt2 import GPT2Config
 
 
diff --git a/src/transformers/models/gpt_neo/__init__.py b/src/transformers/models/gpt_neo/__init__.py
index 70793af636..d039b6f439 100644
--- a/src/transformers/models/gpt_neo/__init__.py
+++ b/src/transformers/models/gpt_neo/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py
index 05779a259f..d64e160c6a 100644
--- a/src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py
@@ -24,10 +24,9 @@ from flax.linen import combine_masks, make_causal_mask
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
 from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_gpt_neo import GPTNeoConfig
 
 
diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index 706a1996a4..8fe1808a27 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -24,7 +24,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -33,7 +32,7 @@ from ...modeling_outputs import (
     SequenceClassifierOutputWithPast,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_gpt_neo import GPTNeoConfig
 
 
diff --git a/src/transformers/models/gptj/__init__.py b/src/transformers/models/gptj/__init__.py
index 076426dc4b..cbebc85279 100644
--- a/src/transformers/models/gptj/__init__.py
+++ b/src/transformers/models/gptj/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/gptj/modeling_flax_gptj.py b/src/transformers/models/gptj/modeling_flax_gptj.py
index ebee17632f..d9bedbe264 100644
--- a/src/transformers/models/gptj/modeling_flax_gptj.py
+++ b/src/transformers/models/gptj/modeling_flax_gptj.py
@@ -26,10 +26,9 @@ from flax.linen import combine_masks, make_causal_mask
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
 from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_gptj import GPTJConfig
 
 
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index a74c74b8fc..974b3aabed 100755
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -22,7 +22,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
@@ -30,7 +29,7 @@ from ...modeling_outputs import (
     SequenceClassifierOutputWithPast,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from ...utils.model_parallel_utils import assert_device_map, get_device_map
 from .configuration_gptj import GPTJConfig
 
diff --git a/src/transformers/models/herbert/__init__.py b/src/transformers/models/herbert/__init__.py
index d5426c8866..4cd458b4e8 100644
--- a/src/transformers/models/herbert/__init__.py
+++ b/src/transformers/models/herbert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available
+from ...utils import _LazyModule, is_tokenizers_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/hubert/__init__.py b/src/transformers/models/hubert/__init__.py
index 5b24698c38..59f848c118 100644
--- a/src/transformers/models/hubert/__init__.py
+++ b/src/transformers/models/hubert/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index 9714ddbf5f..4147f60683 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -26,16 +26,16 @@ from torch.nn import CrossEntropyLoss
 from transformers.deepspeed import is_deepspeed_zero3_enabled
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_hubert import HubertConfig
 
 
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index 4c824ef534..be0508937e 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -21,17 +21,17 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput
 from ...modeling_tf_utils import TFPreTrainedModel, booleans_processing, get_initializer, keras_serializable
 from ...tf_utils import shape_list
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_hubert import HubertConfig
 
 
diff --git a/src/transformers/models/ibert/__init__.py b/src/transformers/models/ibert/__init__.py
index 0701a44c06..e941b88f25 100644
--- a/src/transformers/models/ibert/__init__.py
+++ b/src/transformers/models/ibert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py
index 9fea3e12e3..44ca8d1223 100644
--- a/src/transformers/models/ibert/modeling_ibert.py
+++ b/src/transformers/models/ibert/modeling_ibert.py
@@ -25,7 +25,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import gelu
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -36,7 +35,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_ibert import IBertConfig
 from .quant_modules import IntGELU, IntLayerNorm, IntSoftmax, QuantAct, QuantEmbedding, QuantLinear
 
diff --git a/src/transformers/models/imagegpt/__init__.py b/src/transformers/models/imagegpt/__init__.py
index 2fbc923b21..f82d0cb989 100644
--- a/src/transformers/models/imagegpt/__init__.py
+++ b/src/transformers/models/imagegpt/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py b/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py
index 7fb6813f7a..0212bd485b 100644
--- a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py
+++ b/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py
@@ -20,8 +20,7 @@ import argparse
 import torch
 
 from transformers import ImageGPTConfig, ImageGPTForCausalLM, load_tf_weights_in_imagegpt
-from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME
-from transformers.utils import logging
+from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
 
 
 logging.set_verbosity_info()
diff --git a/src/transformers/models/imagegpt/feature_extraction_imagegpt.py b/src/transformers/models/imagegpt/feature_extraction_imagegpt.py
index 223587f1c7..0926cde8dd 100644
--- a/src/transformers/models/imagegpt/feature_extraction_imagegpt.py
+++ b/src/transformers/models/imagegpt/feature_extraction_imagegpt.py
@@ -20,9 +20,8 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType
 from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor
-from ...utils import logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py
index d116423463..c2f5f0ae12 100755
--- a/src/transformers/models/imagegpt/modeling_imagegpt.py
+++ b/src/transformers/models/imagegpt/modeling_imagegpt.py
@@ -33,14 +33,13 @@ else:
     is_amp_available = False
 
 from ...activations import ACT2FN
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
     SequenceClassifierOutputWithPast,
 )
 from ...modeling_utils import Conv1D, PreTrainedModel, find_pruneable_heads_and_indices, prune_conv1d_layer
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_imagegpt import ImageGPTConfig
 
 
diff --git a/src/transformers/models/layoutlm/__init__.py b/src/transformers/models/layoutlm/__init__.py
index b20cfdfa8f..b77edddc4d 100644
--- a/src/transformers/models/layoutlm/__init__.py
+++ b/src/transformers/models/layoutlm/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 from .configuration_layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig
 from .tokenization_layoutlm import LayoutLMTokenizer
 
diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py
index 85c6db1905..e0f7a34f3f 100644
--- a/src/transformers/models/layoutlm/modeling_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_layoutlm.py
@@ -24,7 +24,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -38,7 +37,7 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_layoutlm import LayoutLMConfig
 
 
diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
index 10ea7ea675..6fcab3e546 100644
--- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
@@ -22,7 +22,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_tf_outputs import (
     TFBaseModelOutputWithPastAndCrossAttentions,
     TFBaseModelOutputWithPoolingAndCrossAttentions,
@@ -41,7 +40,7 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_layoutlm import LayoutLMConfig
 
 
diff --git a/src/transformers/models/layoutlmv2/__init__.py b/src/transformers/models/layoutlmv2/__init__.py
index c5b628cb00..9f7a8dae39 100644
--- a/src/transformers/models/layoutlmv2/__init__.py
+++ b/src/transformers/models/layoutlmv2/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
index 430cb9dd74..6ab28ec222 100644
--- a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
@@ -15,8 +15,7 @@
 """ LayoutLMv2 model configuration"""
 
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import is_detectron2_available
-from ...utils import logging
+from ...utils import is_detectron2_available, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py b/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
index e8c21b51cc..2bf476239c 100644
--- a/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
@@ -22,9 +22,8 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType, is_pytesseract_available, requires_backends
 from ...image_utils import ImageFeatureExtractionMixin, is_torch_tensor
-from ...utils import logging
+from ...utils import TensorType, is_pytesseract_available, logging, requires_backends
 
 
 # soft dependency
diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
index 2be3b36714..203e56e0da 100755
--- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -24,13 +24,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    is_detectron2_available,
-    replace_return_docstrings,
-    requires_backends,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
@@ -39,7 +32,14 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward
-from ...utils import logging
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_detectron2_available,
+    logging,
+    replace_return_docstrings,
+    requires_backends,
+)
 from .configuration_layoutlmv2 import LayoutLMv2Config
 
 
diff --git a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
index 4277d3a1b2..449eb4770a 100644
--- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
@@ -17,9 +17,9 @@ Processor class for LayoutLMv2.
 """
 from typing import List, Optional, Union
 
-from ...file_utils import TensorType
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType
 
 
 class LayoutLMv2Processor(ProcessorMixin):
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
index 8384ac9ca8..9aa0e4206e 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
@@ -20,7 +20,6 @@ import sys
 import unicodedata
 from typing import Dict, List, Optional, Tuple, Union
 
-from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
 from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
@@ -31,7 +30,7 @@ from ...tokenization_utils_base import (
     TextInputPair,
     TruncationStrategy,
 )
-from ...utils import logging
+from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
index 0292f1a580..e0dfe36ab0 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
@@ -22,7 +22,6 @@ from typing import Dict, List, Optional, Tuple, Union
 
 from tokenizers import normalizers
 
-from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
     BatchEncoding,
@@ -33,7 +32,7 @@ from ...tokenization_utils_base import (
     TruncationStrategy,
 )
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
 from .tokenization_layoutlmv2 import LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, LayoutLMv2Tokenizer
 
 
diff --git a/src/transformers/models/layoutxlm/__init__.py b/src/transformers/models/layoutxlm/__init__.py
index 2740f9fd8a..c9459aff20 100644
--- a/src/transformers/models/layoutxlm/__init__.py
+++ b/src/transformers/models/layoutxlm/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_sentencepiece_available,
     is_tokenizers_available,
diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py
index 5bd1dffb5c..99245ccc17 100644
--- a/src/transformers/models/layoutxlm/processing_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py
@@ -17,9 +17,9 @@ Processor class for LayoutXLM.
 """
 from typing import List, Optional, Union
 
-from ...file_utils import TensorType
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType
 
 
 class LayoutXLMProcessor(ProcessorMixin):
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
index 79608ac37a..8fded39284 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
@@ -22,7 +22,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import sentencepiece as spm
 from transformers.models.layoutlmv2.tokenization_layoutlmv2 import LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
 
-from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
@@ -33,7 +32,7 @@ from ...tokenization_utils_base import (
     TextInputPair,
     TruncationStrategy,
 )
-from ...utils import logging
+from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
 from ..xlm_roberta.tokenization_xlm_roberta import (
     PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
     PRETRAINED_VOCAB_FILES_MAP,
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
index ca06657958..35b4383877 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
@@ -21,7 +21,6 @@ from typing import Dict, List, Optional, Tuple, Union
 
 from transformers.models.layoutlmv2.tokenization_layoutlmv2 import LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING
 
-from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available
 from ...tokenization_utils import AddedToken
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
@@ -33,7 +32,7 @@ from ...tokenization_utils_base import (
     TruncationStrategy,
 )
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging
 from ..xlm_roberta.tokenization_xlm_roberta_fast import (
     PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
     PRETRAINED_VOCAB_FILES_MAP,
diff --git a/src/transformers/models/led/__init__.py b/src/transformers/models/led/__init__.py
index 9a7cb5e863..d60800f981 100644
--- a/src/transformers/models/led/__init__.py
+++ b/src/transformers/models/led/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index 7cc926275b..11b9d8f38d 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -26,14 +26,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     Seq2SeqLMOutput,
@@ -42,7 +34,15 @@ from ...modeling_outputs import (
     Seq2SeqSequenceClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_led import LEDConfig
 
 
diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py
index 1e9a05bb6d..5564a91dd1 100644
--- a/src/transformers/models/led/modeling_tf_led.py
+++ b/src/transformers/models/led/modeling_tf_led.py
@@ -22,13 +22,6 @@ from typing import List, Optional, Tuple, Union
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import TFBaseModelOutputWithPastAndCrossAttentions
 
 # Public API
@@ -41,7 +34,14 @@ from ...modeling_tf_utils import (
     keras_serializable,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_led import LEDConfig
 
 
diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py
index a55a5a12db..84232ef517 100644
--- a/src/transformers/models/led/tokenization_led.py
+++ b/src/transformers/models/led/tokenization_led.py
@@ -16,9 +16,8 @@
 
 from typing import Dict, Optional, Union
 
-from ...file_utils import PaddingStrategy
 from ...tokenization_utils_base import BatchEncoding, EncodedInput
-from ...utils import logging
+from ...utils import PaddingStrategy, logging
 from ..bart.tokenization_bart import BartTokenizer
 
 
diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py
index 5059f634c1..5bcd5d7895 100644
--- a/src/transformers/models/led/tokenization_led_fast.py
+++ b/src/transformers/models/led/tokenization_led_fast.py
@@ -16,9 +16,8 @@
 
 from typing import Dict, Optional, Union
 
-from ...file_utils import PaddingStrategy
 from ...tokenization_utils_base import BatchEncoding, EncodedInput
-from ...utils import logging
+from ...utils import PaddingStrategy, logging
 from ..bart.tokenization_bart_fast import BartTokenizerFast
 from .tokenization_led import LEDTokenizer
 
diff --git a/src/transformers/models/longformer/__init__.py b/src/transformers/models/longformer/__init__.py
index dc65fe286a..329b8f1cdf 100644
--- a/src/transformers/models/longformer/__init__.py
+++ b/src/transformers/models/longformer/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py
index 6294d1a567..e6b96d6934 100755
--- a/src/transformers/models/longformer/modeling_longformer.py
+++ b/src/transformers/models/longformer/modeling_longformer.py
@@ -24,20 +24,20 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN, gelu
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_utils import (
     PreTrainedModel,
     apply_chunking_to_forward,
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_longformer import LongformerConfig
 
 
diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py
index 13af0a074a..df1f130de4 100644
--- a/src/transformers/models/longformer/modeling_tf_longformer.py
+++ b/src/transformers/models/longformer/modeling_tf_longformer.py
@@ -22,13 +22,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
 from ...modeling_tf_utils import (
     TFMaskedLanguageModelingLoss,
     TFModelInputType,
@@ -42,7 +35,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
 from .configuration_longformer import LongformerConfig
 
 
diff --git a/src/transformers/models/luke/__init__.py b/src/transformers/models/luke/__init__.py
index 29b963a834..d18d016b50 100644
--- a/src/transformers/models/luke/__init__.py
+++ b/src/transformers/models/luke/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py
index 7f4468eb08..2cc7bd6981 100644
--- a/src/transformers/models/luke/modeling_luke.py
+++ b/src/transformers/models/luke/modeling_luke.py
@@ -23,15 +23,15 @@ import torch.utils.checkpoint
 from torch import nn
 
 from ...activations import ACT2FN, gelu
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward
+from ...utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
-from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward
-from ...utils import logging
 from .configuration_luke import LukeConfig
 
 
diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py
index a4033cda18..20e12a01e2 100644
--- a/src/transformers/models/luke/tokenization_luke.py
+++ b/src/transformers/models/luke/tokenization_luke.py
@@ -22,7 +22,6 @@ from typing import Dict, List, Optional, Tuple, Union
 import numpy as np
 
 from ... import RobertaTokenizer
-from ...file_utils import add_end_docstrings, is_tf_available, is_torch_available
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
     AddedToken,
@@ -37,7 +36,7 @@ from ...tokenization_utils_base import (
     _is_torch,
     to_py_obj,
 )
-from ...utils import logging
+from ...utils import add_end_docstrings, is_tf_available, is_torch_available, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/lxmert/__init__.py b/src/transformers/models/lxmert/__init__.py
index 1d98d2a10c..38d9d5e67e 100644
--- a/src/transformers/models/lxmert/__init__.py
+++ b/src/transformers/models/lxmert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py
index 8d8ebb0e41..6676fd09a7 100644
--- a/src/transformers/models/lxmert/modeling_lxmert.py
+++ b/src/transformers/models/lxmert/modeling_lxmert.py
@@ -26,15 +26,15 @@ from torch import nn
 from torch.nn import CrossEntropyLoss, SmoothL1Loss
 
 from ...activations import ACT2FN, gelu
-from ...file_utils import (
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
 from .configuration_lxmert import LxmertConfig
 
 
diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py
index 0928611386..0a6e1b33b0 100644
--- a/src/transformers/models/lxmert/modeling_tf_lxmert.py
+++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py
@@ -23,15 +23,15 @@ from typing import Dict, Optional, Tuple
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
+from ...modeling_tf_utils import TFPreTrainedModel, get_initializer, input_processing, keras_serializable, shape_list
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_tf_utils import TFPreTrainedModel, get_initializer, input_processing, keras_serializable, shape_list
-from ...utils import logging
 from .configuration_lxmert import LxmertConfig
 
 
diff --git a/src/transformers/models/m2m_100/__init__.py b/src/transformers/models/m2m_100/__init__.py
index 0f24fd3981..81d664d0f7 100644
--- a/src/transformers/models/m2m_100/__init__.py
+++ b/src/transformers/models/m2m_100/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py
index 180950f8c7..f80e354695 100644
--- a/src/transformers/models/m2m_100/configuration_m2m_100.py
+++ b/src/transformers/models/m2m_100/configuration_m2m_100.py
@@ -18,10 +18,9 @@ from typing import Any, Mapping, Optional
 
 from ... import PreTrainedTokenizer
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import TensorType, is_torch_available
 from ...onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
 from ...onnx.utils import compute_effective_axis_dimension
-from ...utils import logging
+from ...utils import TensorType, is_torch_available, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py
index b2ba0de455..36dc9e11ce 100755
--- a/src/transformers/models/m2m_100/modeling_m2m_100.py
+++ b/src/transformers/models/m2m_100/modeling_m2m_100.py
@@ -24,13 +24,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -38,7 +31,14 @@ from ...modeling_outputs import (
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_m2m_100 import M2M100Config
 
 
diff --git a/src/transformers/models/marian/__init__.py b/src/transformers/models/marian/__init__.py
index 1094cb8b9d..b26355b484 100644
--- a/src/transformers/models/marian/__init__.py
+++ b/src/transformers/models/marian/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_flax_available,
     is_sentencepiece_available,
diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py
index c349f71a68..835b317f9d 100644
--- a/src/transformers/models/marian/configuration_marian.py
+++ b/src/transformers/models/marian/configuration_marian.py
@@ -18,10 +18,9 @@ from typing import Any, Mapping, Optional
 
 from ... import PreTrainedTokenizer
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import TensorType, is_torch_available
 from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
 from ...onnx.utils import compute_effective_axis_dimension
-from ...utils import logging
+from ...utils import TensorType, is_torch_available, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/marian/modeling_flax_marian.py b/src/transformers/models/marian/modeling_flax_marian.py
index 9d8b44c5f9..d914cd410c 100644
--- a/src/transformers/models/marian/modeling_flax_marian.py
+++ b/src/transformers/models/marian/modeling_flax_marian.py
@@ -30,7 +30,6 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPastAndCrossAttentions,
@@ -45,7 +44,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_marian import MarianConfig
 
 
diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py
index fac48f64d5..a886e8f4b9 100755
--- a/src/transformers/models/marian/modeling_marian.py
+++ b/src/transformers/models/marian/modeling_marian.py
@@ -27,12 +27,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -41,7 +35,13 @@ from ...modeling_outputs import (
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_marian import MarianConfig
 
 
diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py
index d6522092a1..e116760474 100644
--- a/src/transformers/models/marian/modeling_tf_marian.py
+++ b/src/transformers/models/marian/modeling_tf_marian.py
@@ -22,13 +22,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPastAndCrossAttentions,
@@ -47,7 +40,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_marian import MarianConfig
 
 
diff --git a/src/transformers/models/maskformer/__init__.py b/src/transformers/models/maskformer/__init__.py
index 547383672f..2f15ed34f0 100644
--- a/src/transformers/models/maskformer/__init__.py
+++ b/src/transformers/models/maskformer/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/maskformer/feature_extraction_maskformer.py b/src/transformers/models/maskformer/feature_extraction_maskformer.py
index 1d6ac398b1..5bde196050 100644
--- a/src/transformers/models/maskformer/feature_extraction_maskformer.py
+++ b/src/transformers/models/maskformer/feature_extraction_maskformer.py
@@ -20,9 +20,8 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType, is_torch_available
 from ...image_utils import ImageFeatureExtractionMixin, ImageInput, is_torch_tensor
-from ...utils import logging
+from ...utils import TensorType, is_torch_available, logging
 
 
 if is_torch_available():
diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py
index 39af8a27eb..0d53c818dc 100644
--- a/src/transformers/models/maskformer/modeling_maskformer.py
+++ b/src/transformers/models/maskformer/modeling_maskformer.py
@@ -28,7 +28,9 @@ from torch import Tensor, nn
 from transformers.utils import logging
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutputWithCrossAttentions
+from ...modeling_utils import ModuleUtilsMixin, PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
@@ -37,8 +39,6 @@ from ...file_utils import (
     replace_return_docstrings,
     requires_backends,
 )
-from ...modeling_outputs import BaseModelOutputWithCrossAttentions
-from ...modeling_utils import ModuleUtilsMixin, PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
 from ..detr import DetrConfig
 from ..swin import SwinConfig
 from .configuration_maskformer import MaskFormerConfig
diff --git a/src/transformers/models/mbart/__init__.py b/src/transformers/models/mbart/__init__.py
index cad664c140..294eb15f03 100644
--- a/src/transformers/models/mbart/__init__.py
+++ b/src/transformers/models/mbart/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_flax_available,
     is_sentencepiece_available,
diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py
index cf1d87835e..e4da61442d 100644
--- a/src/transformers/models/mbart/configuration_mbart.py
+++ b/src/transformers/models/mbart/configuration_mbart.py
@@ -18,10 +18,9 @@ from typing import Any, Mapping, Optional
 
 from ... import PreTrainedTokenizer
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import TensorType, is_torch_available
 from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
 from ...onnx.utils import compute_effective_axis_dimension
-from ...utils import logging
+from ...utils import TensorType, is_torch_available, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/mbart/modeling_flax_mbart.py b/src/transformers/models/mbart/modeling_flax_mbart.py
index 6130dad25d..728c2d691c 100644
--- a/src/transformers/models/mbart/modeling_flax_mbart.py
+++ b/src/transformers/models/mbart/modeling_flax_mbart.py
@@ -30,7 +30,6 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPastAndCrossAttentions,
@@ -47,7 +46,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_mbart import MBartConfig
 
 
diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py
index ab0b1dfcbd..bceaac67be 100755
--- a/src/transformers/models/mbart/modeling_mbart.py
+++ b/src/transformers/models/mbart/modeling_mbart.py
@@ -24,13 +24,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -41,7 +34,14 @@ from ...modeling_outputs import (
     Seq2SeqSequenceClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_mbart import MBartConfig
 
 
diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py
index d1d8f9174b..f536bd37b2 100644
--- a/src/transformers/models/mbart/modeling_tf_mbart.py
+++ b/src/transformers/models/mbart/modeling_tf_mbart.py
@@ -21,13 +21,6 @@ from typing import Optional, Tuple, Union
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPastAndCrossAttentions,
@@ -46,7 +39,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_mbart import MBartConfig
 
 
diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py
index 79b87ee7d6..1de8d62f36 100644
--- a/src/transformers/models/mbart/tokenization_mbart_fast.py
+++ b/src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -20,10 +20,9 @@ from typing import List, Optional, Tuple
 
 from tokenizers import processors
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils import AddedToken, BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/mbart50/__init__.py b/src/transformers/models/mbart50/__init__.py
index bb8b11d1f6..ee0edc94df 100644
--- a/src/transformers/models/mbart50/__init__.py
+++ b/src/transformers/models/mbart50/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available, is_tokenizers_available
+from ...utils import _LazyModule, is_sentencepiece_available, is_tokenizers_available
 
 
 _import_structure = {}
diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
index 928118bfd5..97e2584a0d 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
@@ -20,10 +20,9 @@ from typing import List, Optional, Tuple
 
 from tokenizers import processors
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils import AddedToken, BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/megatron_bert/__init__.py b/src/transformers/models/megatron_bert/__init__.py
index 2835d7a5b1..d49ab274e5 100644
--- a/src/transformers/models/megatron_bert/__init__.py
+++ b/src/transformers/models/megatron_bert/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py
index c28e4f0fa4..acf0574f4f 100755
--- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py
+++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py
@@ -28,13 +28,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -52,7 +45,14 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_megatron_bert import MegatronBertConfig
 
 
diff --git a/src/transformers/models/mluke/__init__.py b/src/transformers/models/mluke/__init__.py
index 9842794def..acd6dff11f 100644
--- a/src/transformers/models/mluke/__init__.py
+++ b/src/transformers/models/mluke/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available
+from ...utils import _LazyModule, is_sentencepiece_available
 
 
 _import_structure = {}
diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py
index 9e034615d5..d2ce9bfb82 100644
--- a/src/transformers/models/mluke/tokenization_mluke.py
+++ b/src/transformers/models/mluke/tokenization_mluke.py
@@ -25,7 +25,6 @@ import numpy as np
 
 import sentencepiece as spm
 
-from ...file_utils import add_end_docstrings, is_tf_available, is_torch_available
 from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
@@ -41,7 +40,7 @@ from ...tokenization_utils_base import (
     _is_torch,
     to_py_obj,
 )
-from ...utils import logging
+from ...utils import add_end_docstrings, is_tf_available, is_torch_available, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/mmbt/__init__.py b/src/transformers/models/mmbt/__init__.py
index c1daba13a2..763a256f1a 100644
--- a/src/transformers/models/mmbt/__init__.py
+++ b/src/transformers/models/mmbt/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/mmbt/modeling_mmbt.py b/src/transformers/models/mmbt/modeling_mmbt.py
index 41024a6c84..9b9142befb 100644
--- a/src/transformers/models/mmbt/modeling_mmbt.py
+++ b/src/transformers/models/mmbt/modeling_mmbt.py
@@ -20,10 +20,9 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
 from ...modeling_utils import ModuleUtilsMixin
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/mobilebert/__init__.py b/src/transformers/models/mobilebert/__init__.py
index e460e348f7..505dabe187 100644
--- a/src/transformers/models/mobilebert/__init__.py
+++ b/src/transformers/models/mobilebert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py
index e29ad08b93..022c3dfd6f 100644
--- a/src/transformers/models/mobilebert/modeling_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_mobilebert.py
@@ -31,13 +31,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
@@ -49,7 +42,14 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_mobilebert import MobileBertConfig
 
 
diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
index 0a3597a099..12fc439b00 100644
--- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
@@ -22,14 +22,6 @@ from typing import Dict, Optional, Tuple
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPooling,
@@ -53,7 +45,15 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_mobilebert import MobileBertConfig
 
 
diff --git a/src/transformers/models/mpnet/__init__.py b/src/transformers/models/mpnet/__init__.py
index 0ba9e8d6eb..54c2c7b841 100644
--- a/src/transformers/models/mpnet/__init__.py
+++ b/src/transformers/models/mpnet/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py
index d44966b560..d65be4d973 100644
--- a/src/transformers/models/mpnet/modeling_mpnet.py
+++ b/src/transformers/models/mpnet/modeling_mpnet.py
@@ -23,7 +23,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN, gelu
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
@@ -34,7 +33,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_mpnet import MPNetConfig
 
 
diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py
index e75d76ffe4..003495b49c 100644
--- a/src/transformers/models/mpnet/modeling_tf_mpnet.py
+++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py
@@ -22,12 +22,6 @@ import warnings
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPooling,
@@ -49,7 +43,13 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
 from .configuration_mpnet import MPNetConfig
 
 
diff --git a/src/transformers/models/mt5/__init__.py b/src/transformers/models/mt5/__init__.py
index 99ab1f4b6a..dd576cb0b2 100644
--- a/src/transformers/models/mt5/__init__.py
+++ b/src/transformers/models/mt5/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_flax_available,
     is_sentencepiece_available,
diff --git a/src/transformers/models/nystromformer/__init__.py b/src/transformers/models/nystromformer/__init__.py
index d4d3872be7..d3df751dd4 100644
--- a/src/transformers/models/nystromformer/__init__.py
+++ b/src/transformers/models/nystromformer/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py
index 64f721d224..b7a72821c7 100755
--- a/src/transformers/models/nystromformer/modeling_nystromformer.py
+++ b/src/transformers/models/nystromformer/modeling_nystromformer.py
@@ -24,7 +24,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     MaskedLMOutput,
@@ -39,7 +38,7 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_nystromformer import NystromformerConfig
 
 
diff --git a/src/transformers/models/openai/__init__.py b/src/transformers/models/openai/__init__.py
index b077620361..3abba0b781 100644
--- a/src/transformers/models/openai/__init__.py
+++ b/src/transformers/models/openai/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py
index c7576c4009..b57f2dd033 100755
--- a/src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py
@@ -20,8 +20,7 @@ import argparse
 import torch
 
 from transformers import OpenAIGPTConfig, OpenAIGPTModel, load_tf_weights_in_openai_gpt
-from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME
-from transformers.utils import logging
+from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
 
 
 logging.set_verbosity_info()
diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py
index 93e7313913..a6318a607b 100644
--- a/src/transformers/models/openai/modeling_openai.py
+++ b/src/transformers/models/openai/modeling_openai.py
@@ -27,13 +27,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import gelu_new, silu
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
 from ...modeling_utils import (
     Conv1D,
@@ -42,7 +35,14 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_conv1d_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_openai import OpenAIGPTConfig
 
 
diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py
index 06a1edea52..374f8fb353 100644
--- a/src/transformers/models/openai/modeling_tf_openai.py
+++ b/src/transformers/models/openai/modeling_tf_openai.py
@@ -21,13 +21,6 @@ from typing import Optional, Tuple
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput, TFSequenceClassifierOutput
 from ...modeling_tf_utils import (
     TFCausalLanguageModelingLoss,
@@ -41,7 +34,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_openai import OpenAIGPTConfig
 
 
diff --git a/src/transformers/models/pegasus/__init__.py b/src/transformers/models/pegasus/__init__.py
index bc7489c883..4d01c31c6d 100644
--- a/src/transformers/models/pegasus/__init__.py
+++ b/src/transformers/models/pegasus/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_flax_available,
     is_sentencepiece_available,
diff --git a/src/transformers/models/pegasus/modeling_flax_pegasus.py b/src/transformers/models/pegasus/modeling_flax_pegasus.py
index 938d751b6f..d916c7de53 100644
--- a/src/transformers/models/pegasus/modeling_flax_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_flax_pegasus.py
@@ -31,7 +31,6 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, replace_return_docstrings
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPastAndCrossAttentions,
@@ -47,7 +46,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import add_start_docstrings, logging, replace_return_docstrings
 from .configuration_pegasus import PegasusConfig
 
 
diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py
index 06e52bd8ca..0128a99198 100755
--- a/src/transformers/models/pegasus/modeling_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_pegasus.py
@@ -26,12 +26,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -40,7 +34,13 @@ from ...modeling_outputs import (
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_pegasus import PegasusConfig
 
 
diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py
index eb542d2076..9a5951183a 100644
--- a/src/transformers/models/pegasus/modeling_tf_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py
@@ -22,13 +22,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPastAndCrossAttentions,
@@ -47,7 +40,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_pegasus import PegasusConfig
 
 
diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py
index 67c065e7ec..c652c08095 100644
--- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py
@@ -19,9 +19,8 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/perceiver/__init__.py b/src/transformers/models/perceiver/__init__.py
index f226d8df0c..b208183064 100644
--- a/src/transformers/models/perceiver/__init__.py
+++ b/src/transformers/models/perceiver/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/perceiver/feature_extraction_perceiver.py b/src/transformers/models/perceiver/feature_extraction_perceiver.py
index 5ae9e1e4dc..62cb102bea 100644
--- a/src/transformers/models/perceiver/feature_extraction_perceiver.py
+++ b/src/transformers/models/perceiver/feature_extraction_perceiver.py
@@ -20,7 +20,6 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
@@ -28,7 +27,7 @@ from ...image_utils import (
     ImageInput,
     is_torch_tensor,
 )
-from ...utils import logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
index 937e3cf52a..e973497bfc 100755
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -28,12 +28,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import BaseModelOutputWithCrossAttentions
 from ...modeling_utils import (
     PreTrainedModel,
@@ -41,7 +35,13 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_perceiver import PerceiverConfig
 
 
diff --git a/src/transformers/models/phobert/__init__.py b/src/transformers/models/phobert/__init__.py
index 13c750964a..0f226f537a 100644
--- a/src/transformers/models/phobert/__init__.py
+++ b/src/transformers/models/phobert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule
+from ...utils import _LazyModule
 
 
 _import_structure = {
diff --git a/src/transformers/models/plbart/__init__.py b/src/transformers/models/plbart/__init__.py
index 5d0ff08e12..676feeb39c 100644
--- a/src/transformers/models/plbart/__init__.py
+++ b/src/transformers/models/plbart/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_sentencepiece_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py
index 2e1c41f722..26d84c01fc 100755
--- a/src/transformers/models/plbart/modeling_plbart.py
+++ b/src/transformers/models/plbart/modeling_plbart.py
@@ -24,13 +24,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -40,7 +33,14 @@ from ...modeling_outputs import (
     Seq2SeqSequenceClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_plbart import PLBartConfig
 
 
diff --git a/src/transformers/models/poolformer/__init__.py b/src/transformers/models/poolformer/__init__.py
index 246dc76455..799752067f 100644
--- a/src/transformers/models/poolformer/__init__.py
+++ b/src/transformers/models/poolformer/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/poolformer/feature_extraction_poolformer.py b/src/transformers/models/poolformer/feature_extraction_poolformer.py
index b7d44e2226..72d06cfd74 100644
--- a/src/transformers/models/poolformer/feature_extraction_poolformer.py
+++ b/src/transformers/models/poolformer/feature_extraction_poolformer.py
@@ -21,7 +21,6 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
@@ -29,7 +28,7 @@ from ...image_utils import (
     ImageInput,
     is_torch_tensor,
 )
-from ...utils import logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/poolformer/modeling_poolformer.py b/src/transformers/models/poolformer/modeling_poolformer.py
index fcf9261889..6adf01315e 100755
--- a/src/transformers/models/poolformer/modeling_poolformer.py
+++ b/src/transformers/models/poolformer/modeling_poolformer.py
@@ -25,14 +25,14 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
 )
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
 from .configuration_poolformer import PoolFormerConfig
 
 
diff --git a/src/transformers/models/prophetnet/__init__.py b/src/transformers/models/prophetnet/__init__.py
index 45b7d9a5d4..be4baf4a16 100644
--- a/src/transformers/models/prophetnet/__init__.py
+++ b/src/transformers/models/prophetnet/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py
index 2cfada0203..7c10261d27 100644
--- a/src/transformers/models/prophetnet/modeling_prophetnet.py
+++ b/src/transformers/models/prophetnet/modeling_prophetnet.py
@@ -26,15 +26,15 @@ from torch import Tensor, nn
 from torch.nn import LayerNorm
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
 from .configuration_prophetnet import ProphetNetConfig
 
 
diff --git a/src/transformers/models/qdqbert/__init__.py b/src/transformers/models/qdqbert/__init__.py
index c403795e54..28fb61c219 100644
--- a/src/transformers/models/qdqbert/__init__.py
+++ b/src/transformers/models/qdqbert/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py
index 544f6dba15..612a1bb975 100755
--- a/src/transformers/models/qdqbert/modeling_qdqbert.py
+++ b/src/transformers/models/qdqbert/modeling_qdqbert.py
@@ -27,14 +27,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    is_pytorch_quantization_available,
-    replace_return_docstrings,
-    requires_backends,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -47,7 +39,15 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_pytorch_quantization_available,
+    logging,
+    replace_return_docstrings,
+    requires_backends,
+)
 from .configuration_qdqbert import QDQBertConfig
 
 
diff --git a/src/transformers/models/rag/__init__.py b/src/transformers/models/rag/__init__.py
index 6154997879..00e88f7c0a 100644
--- a/src/transformers/models/rag/__init__.py
+++ b/src/transformers/models/rag/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/rag/configuration_rag.py b/src/transformers/models/rag/configuration_rag.py
index f009c8af8c..2897642a75 100644
--- a/src/transformers/models/rag/configuration_rag.py
+++ b/src/transformers/models/rag/configuration_rag.py
@@ -17,7 +17,7 @@
 import copy
 
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import add_start_docstrings
+from ...utils import add_start_docstrings
 
 
 RAG_CONFIG_DOC = r"""
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index 480bce4037..d3c08f6165 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -21,13 +21,12 @@ import torch
 from torch import nn
 
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...generation_beam_search import BeamSearchScorer
 from ...generation_logits_process import LogitsProcessorList
 from ...generation_stopping_criteria import StoppingCriteriaList
 from ...modeling_outputs import ModelOutput
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_rag import RagConfig
 from .retrieval_rag import RagRetriever
 
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index 55099dc0be..5f652898b1 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -22,9 +22,8 @@ import numpy as np
 import tensorflow as tf
 
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, shape_list, unpack_inputs
-from ...utils import logging
+from ...utils import ModelOutput, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_rag import RagConfig
 from .retrieval_rag import RagRetriever
 
diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py
index 6a498169d5..de2d03e6d6 100644
--- a/src/transformers/models/rag/retrieval_rag.py
+++ b/src/transformers/models/rag/retrieval_rag.py
@@ -21,10 +21,9 @@ from typing import Iterable, List, Optional, Tuple
 
 import numpy as np
 
-from ...file_utils import cached_path, is_datasets_available, is_faiss_available, is_remote_url, requires_backends
 from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import logging
+from ...utils import cached_path, is_datasets_available, is_faiss_available, is_remote_url, logging, requires_backends
 from .configuration_rag import RagConfig
 from .tokenization_rag import RagTokenizer
 
diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py
index b44ddeedee..db113dbd5b 100644
--- a/src/transformers/models/realm/__init__.py
+++ b/src/transformers/models/realm/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 7cf3608c9f..94c0c730a3 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -25,7 +25,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -38,7 +37,7 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_realm import RealmConfig
 
 
diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py
index 3ddfd19e5a..426b5d775c 100644
--- a/src/transformers/models/realm/tokenization_realm.py
+++ b/src/transformers/models/realm/tokenization_realm.py
@@ -19,10 +19,9 @@ import os
 import unicodedata
 from typing import List, Optional, Tuple
 
-from ...file_utils import PaddingStrategy
 from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import logging
+from ...utils import PaddingStrategy, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/realm/tokenization_realm_fast.py
index 7f55a72d09..87580baa22 100644
--- a/src/transformers/models/realm/tokenization_realm_fast.py
+++ b/src/transformers/models/realm/tokenization_realm_fast.py
@@ -19,10 +19,9 @@ from typing import List, Optional, Tuple
 
 from tokenizers import normalizers
 
-from ...file_utils import PaddingStrategy
 from ...tokenization_utils_base import BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import PaddingStrategy, logging
 from .tokenization_realm import RealmTokenizer
 
 
diff --git a/src/transformers/models/reformer/__init__.py b/src/transformers/models/reformer/__init__.py
index 246641bdd4..3c6130301b 100644
--- a/src/transformers/models/reformer/__init__.py
+++ b/src/transformers/models/reformer/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_sentencepiece_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py
index 4b98e7e0b7..b58e5a0289 100755
--- a/src/transformers/models/reformer/modeling_reformer.py
+++ b/src/transformers/models/reformer/modeling_reformer.py
@@ -29,17 +29,17 @@ from torch.autograd.function import Function
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_outputs import CausalLMOutput, MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput
+from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward
+from ...utils import (
     DUMMY_INPUTS,
     DUMMY_MASK,
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
 )
-from ...modeling_outputs import CausalLMOutput, MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput
-from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward
-from ...utils import logging
 from .configuration_reformer import ReformerConfig
 
 
diff --git a/src/transformers/models/reformer/tokenization_reformer_fast.py b/src/transformers/models/reformer/tokenization_reformer_fast.py
index 86218eb73c..e6a8483791 100644
--- a/src/transformers/models/reformer/tokenization_reformer_fast.py
+++ b/src/transformers/models/reformer/tokenization_reformer_fast.py
@@ -19,9 +19,8 @@ import os
 from shutil import copyfile
 from typing import Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/rembert/__init__.py b/src/transformers/models/rembert/__init__.py
index 03f3907b5d..fb5defeee5 100644
--- a/src/transformers/models/rembert/__init__.py
+++ b/src/transformers/models/rembert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_sentencepiece_available,
     is_tf_available,
diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py
index df1f16eef8..2467a9a9af 100755
--- a/src/transformers/models/rembert/modeling_rembert.py
+++ b/src/transformers/models/rembert/modeling_rembert.py
@@ -25,12 +25,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -47,7 +41,13 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_rembert import RemBertConfig
 
 
diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py
index f9e3307356..8d42805cae 100644
--- a/src/transformers/models/rembert/modeling_tf_rembert.py
+++ b/src/transformers/models/rembert/modeling_tf_rembert.py
@@ -22,13 +22,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    DUMMY_INPUTS,
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutputWithPastAndCrossAttentions,
     TFBaseModelOutputWithPoolingAndCrossAttentions,
@@ -53,7 +46,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    DUMMY_INPUTS,
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
 from .configuration_rembert import RemBertConfig
 
 
diff --git a/src/transformers/models/rembert/tokenization_rembert_fast.py b/src/transformers/models/rembert/tokenization_rembert_fast.py
index e55df93be6..13919bbb9b 100644
--- a/src/transformers/models/rembert/tokenization_rembert_fast.py
+++ b/src/transformers/models/rembert/tokenization_rembert_fast.py
@@ -19,10 +19,9 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils import AddedToken
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/resnet/__init__.py b/src/transformers/models/resnet/__init__.py
index c561729d54..8a839228f8 100644
--- a/src/transformers/models/resnet/__init__.py
+++ b/src/transformers/models/resnet/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py
index 868d092634..bf27ea9891 100644
--- a/src/transformers/models/resnet/modeling_resnet.py
+++ b/src/transformers/models/resnet/modeling_resnet.py
@@ -23,10 +23,9 @@ from torch import Tensor, nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import ImageClassifierOutput, ModelOutput
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_resnet import ResNetConfig
 
 
diff --git a/src/transformers/models/retribert/__init__.py b/src/transformers/models/retribert/__init__.py
index dc3f2f4ae6..e4d383780b 100644
--- a/src/transformers/models/retribert/__init__.py
+++ b/src/transformers/models/retribert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/retribert/modeling_retribert.py b/src/transformers/models/retribert/modeling_retribert.py
index cae4f8e15a..8470aea7ae 100644
--- a/src/transformers/models/retribert/modeling_retribert.py
+++ b/src/transformers/models/retribert/modeling_retribert.py
@@ -23,9 +23,8 @@ import torch
 import torch.utils.checkpoint as checkpoint
 from torch import nn
 
-from ...file_utils import add_start_docstrings
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, logging
 from ..bert.modeling_bert import BertModel
 from .configuration_retribert import RetriBertConfig
 
diff --git a/src/transformers/models/roberta/__init__.py b/src/transformers/models/roberta/__init__.py
index 05c778f553..0e1070dbe7 100644
--- a/src/transformers/models/roberta/__init__.py
+++ b/src/transformers/models/roberta/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/roberta/modeling_flax_roberta.py b/src/transformers/models/roberta/modeling_flax_roberta.py
index 7a33106362..78f00a95a0 100644
--- a/src/transformers/models/roberta/modeling_flax_roberta.py
+++ b/src/transformers/models/roberta/modeling_flax_roberta.py
@@ -24,7 +24,6 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPooling,
@@ -35,7 +34,7 @@ from ...modeling_flax_outputs import (
     FlaxTokenClassifierOutput,
 )
 from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_roberta import RobertaConfig
 
 
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index 7920163e58..58fc3afc5b 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -25,12 +25,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN, gelu
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -47,7 +41,13 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_roberta import RobertaConfig
 
 
diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py
index 28983c7fa6..4ae381451c 100644
--- a/src/transformers/models/roberta/modeling_tf_roberta.py
+++ b/src/transformers/models/roberta/modeling_tf_roberta.py
@@ -23,13 +23,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    DUMMY_INPUTS,
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutputWithPastAndCrossAttentions,
     TFBaseModelOutputWithPoolingAndCrossAttentions,
@@ -54,7 +47,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    DUMMY_INPUTS,
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
 from .configuration_roberta import RobertaConfig
 
 
diff --git a/src/transformers/models/roformer/__init__.py b/src/transformers/models/roformer/__init__.py
index d7d8160e23..45806f082f 100644
--- a/src/transformers/models/roformer/__init__.py
+++ b/src/transformers/models/roformer/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/roformer/modeling_flax_roformer.py b/src/transformers/models/roformer/modeling_flax_roformer.py
index 6c7c941b28..d1a3e917a0 100644
--- a/src/transformers/models/roformer/modeling_flax_roformer.py
+++ b/src/transformers/models/roformer/modeling_flax_roformer.py
@@ -25,7 +25,6 @@ from flax.core.frozen_dict import FrozenDict
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxMaskedLMOutput,
@@ -35,7 +34,7 @@ from ...modeling_flax_outputs import (
     FlaxTokenClassifierOutput,
 )
 from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_roformer import RoFormerConfig
 
 
diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py
index da9ecce9d3..a2b60c46ce 100644
--- a/src/transformers/models/roformer/modeling_roformer.py
+++ b/src/transformers/models/roformer/modeling_roformer.py
@@ -26,12 +26,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
@@ -48,7 +42,13 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_roformer import RoFormerConfig
 
 
diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py
index 1402f69b74..4fd8611b6e 100644
--- a/src/transformers/models/roformer/modeling_tf_roformer.py
+++ b/src/transformers/models/roformer/modeling_tf_roformer.py
@@ -22,12 +22,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPooling,
@@ -53,7 +47,13 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
 from .configuration_roformer import RoFormerConfig
 
 
diff --git a/src/transformers/models/segformer/__init__.py b/src/transformers/models/segformer/__init__.py
index 8c3ebc47e0..fed4e8127c 100644
--- a/src/transformers/models/segformer/__init__.py
+++ b/src/transformers/models/segformer/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/segformer/feature_extraction_segformer.py b/src/transformers/models/segformer/feature_extraction_segformer.py
index cfc53f2be9..c61abb696b 100644
--- a/src/transformers/models/segformer/feature_extraction_segformer.py
+++ b/src/transformers/models/segformer/feature_extraction_segformer.py
@@ -20,7 +20,6 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
@@ -28,7 +27,7 @@ from ...image_utils import (
     ImageInput,
     is_torch_tensor,
 )
-from ...utils import logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py
index 461b15f0c6..1d02b229ce 100755
--- a/src/transformers/models/segformer/modeling_segformer.py
+++ b/src/transformers/models/segformer/modeling_segformer.py
@@ -24,15 +24,15 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, SemanticSegmentationModelOutput, SequenceClassifierOutput
+from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput, SemanticSegmentationModelOutput, SequenceClassifierOutput
-from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
 from .configuration_segformer import SegformerConfig
 
 
diff --git a/src/transformers/models/sew/__init__.py b/src/transformers/models/sew/__init__.py
index 239e8fbfc2..4ee9380137 100644
--- a/src/transformers/models/sew/__init__.py
+++ b/src/transformers/models/sew/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py
index b0a0a1d7d7..2ddf7024cb 100644
--- a/src/transformers/models/sew/modeling_sew.py
+++ b/src/transformers/models/sew/modeling_sew.py
@@ -27,11 +27,10 @@ from torch.nn import CrossEntropyLoss
 from transformers.deepspeed import is_deepspeed_zero3_enabled
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_sew import SEWConfig
 
 
diff --git a/src/transformers/models/sew_d/__init__.py b/src/transformers/models/sew_d/__init__.py
index 95b6a98fab..bc57740040 100644
--- a/src/transformers/models/sew_d/__init__.py
+++ b/src/transformers/models/sew_d/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py
index 47f055badf..24470fc5d3 100644
--- a/src/transformers/models/sew_d/modeling_sew_d.py
+++ b/src/transformers/models/sew_d/modeling_sew_d.py
@@ -28,11 +28,10 @@ from torch.nn import CrossEntropyLoss, LayerNorm
 from transformers.deepspeed import is_deepspeed_zero3_enabled
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import softmax_backward_data, torch_int_div
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_sew_d import SEWDConfig
 
 
diff --git a/src/transformers/models/speech_encoder_decoder/__init__.py b/src/transformers/models/speech_encoder_decoder/__init__.py
index a7c425de20..a040990864 100644
--- a/src/transformers/models/speech_encoder_decoder/__init__.py
+++ b/src/transformers/models/speech_encoder_decoder/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
index 6d0b7a2004..effc6100c2 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py
@@ -24,10 +24,9 @@ from flax.core.frozen_dict import FrozenDict, unfreeze
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput
 from ...modeling_flax_utils import FlaxPreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM
 from .configuration_speech_encoder_decoder import SpeechEncoderDecoderConfig
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
index 445880591a..2057f97421 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -22,10 +22,9 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import Seq2SeqLMOutput
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM
 from .configuration_speech_encoder_decoder import SpeechEncoderDecoderConfig
diff --git a/src/transformers/models/speech_to_text/__init__.py b/src/transformers/models/speech_to_text/__init__.py
index a61724c616..0cccf66721 100644
--- a/src/transformers/models/speech_to_text/__init__.py
+++ b/src/transformers/models/speech_to_text/__init__.py
@@ -17,13 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
-    _LazyModule,
-    is_sentencepiece_available,
-    is_speech_available,
-    is_tf_available,
-    is_torch_available,
-)
+from ...utils import _LazyModule, is_sentencepiece_available, is_speech_available, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
index 0f0dd8a123..aa23301755 100644
--- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
@@ -24,8 +24,7 @@ import torchaudio.compliance.kaldi as ta_kaldi
 
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
-from ...file_utils import PaddingStrategy, TensorType
-from ...utils import logging
+from ...utils import PaddingStrategy, TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
index b0d5ee7a28..c7f0341bbd 100755
--- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -24,7 +24,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -32,7 +31,7 @@ from ...modeling_outputs import (
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_speech_to_text import Speech2TextConfig
 
 
diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
index 11e457441d..7b5a2f4716 100755
--- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
@@ -21,12 +21,6 @@ from typing import Dict, Optional, Tuple
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation, glu
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPastAndCrossAttentions,
@@ -41,7 +35,13 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_speech_to_text import Speech2TextConfig
 
 
diff --git a/src/transformers/models/speech_to_text_2/__init__.py b/src/transformers/models/speech_to_text_2/__init__.py
index 766035306e..d4ea8d037a 100644
--- a/src/transformers/models/speech_to_text_2/__init__.py
+++ b/src/transformers/models/speech_to_text_2/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available, is_speech_available, is_torch_available
+from ...utils import _LazyModule, is_sentencepiece_available, is_speech_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
index 2678a06b29..13fe79b310 100755
--- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
@@ -25,10 +25,9 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_start_docstrings, replace_return_docstrings
 from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, logging, replace_return_docstrings
 from .configuration_speech_to_text_2 import Speech2Text2Config
 
 
diff --git a/src/transformers/models/splinter/__init__.py b/src/transformers/models/splinter/__init__.py
index e2d8341db3..6a2308bbf5 100644
--- a/src/transformers/models/splinter/__init__.py
+++ b/src/transformers/models/splinter/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py
index 644d3c91fd..0bd51f4377 100755
--- a/src/transformers/models/splinter/modeling_splinter.py
+++ b/src/transformers/models/splinter/modeling_splinter.py
@@ -24,7 +24,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, QuestionAnsweringModelOutput
 from ...modeling_utils import (
     PreTrainedModel,
@@ -32,7 +31,7 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_splinter import SplinterConfig
 
 
diff --git a/src/transformers/models/squeezebert/__init__.py b/src/transformers/models/squeezebert/__init__.py
index 1abae4f538..433b9f9334 100644
--- a/src/transformers/models/squeezebert/__init__.py
+++ b/src/transformers/models/squeezebert/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py
index 9cda8f4067..733e5f7278 100644
--- a/src/transformers/models/squeezebert/modeling_squeezebert.py
+++ b/src/transformers/models/squeezebert/modeling_squeezebert.py
@@ -23,7 +23,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
@@ -34,7 +33,7 @@ from ...modeling_outputs import (
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_squeezebert import SqueezeBertConfig
 
 
diff --git a/src/transformers/models/swin/__init__.py b/src/transformers/models/swin/__init__.py
index a93f35512c..b8cb65d08b 100644
--- a/src/transformers/models/swin/__init__.py
+++ b/src/transformers/models/swin/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py
index 45bf23d3cb..8677e74916 100644
--- a/src/transformers/models/swin/modeling_swin.py
+++ b/src/transformers/models/swin/modeling_swin.py
@@ -26,15 +26,15 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
 from .configuration_swin import SwinConfig
 
 
diff --git a/src/transformers/models/t5/__init__.py b/src/transformers/models/t5/__init__.py
index 7cb3cd8bb2..9ccb949328 100644
--- a/src/transformers/models/t5/__init__.py
+++ b/src/transformers/models/t5/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_flax_available,
     is_sentencepiece_available,
diff --git a/src/transformers/models/t5/modeling_flax_t5.py b/src/transformers/models/t5/modeling_flax_t5.py
index c1c0ca40ed..a1387af50c 100644
--- a/src/transformers/models/t5/modeling_flax_t5.py
+++ b/src/transformers/models/t5/modeling_flax_t5.py
@@ -28,7 +28,6 @@ from flax.linen import combine_masks, make_causal_mask
 from flax.linen.attention import dot_product_attention_weights
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPastAndCrossAttentions,
@@ -43,7 +42,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_t5 import T5Config
 
 
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index 00d888f278..d43030c07e 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -27,14 +27,6 @@ from torch.nn import CrossEntropyLoss
 from torch.utils.checkpoint import checkpoint
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    DUMMY_INPUTS,
-    DUMMY_MASK,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    is_torch_fx_proxy,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -42,7 +34,15 @@ from ...modeling_outputs import (
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
+from ...utils import (
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_torch_fx_proxy,
+    logging,
+    replace_return_docstrings,
+)
 from ...utils.model_parallel_utils import assert_device_map, get_device_map
 from .configuration_t5 import T5Config
 
diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py
index 9928f38413..f3e8bc4b40 100644
--- a/src/transformers/models/t5/modeling_tf_t5.py
+++ b/src/transformers/models/t5/modeling_tf_t5.py
@@ -26,13 +26,6 @@ import tensorflow as tf
 from tensorflow.compiler.tf2xla.python.xla import dynamic_update_slice
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    DUMMY_INPUTS,
-    DUMMY_MASK,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPastAndCrossAttentions,
@@ -48,7 +41,14 @@ from ...modeling_tf_utils import (
     keras_serializable,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_t5 import T5Config
 
 
diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py
index 3749734245..4612868236 100644
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -19,9 +19,8 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/tapas/__init__.py b/src/transformers/models/tapas/__init__.py
index 3af8f8f90e..4d3c72b85b 100644
--- a/src/transformers/models/tapas/__init__.py
+++ b/src/transformers/models/tapas/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py
index 910d754600..fd3672ab18 100644
--- a/src/transformers/models/tapas/modeling_tapas.py
+++ b/src/transformers/models/tapas/modeling_tapas.py
@@ -27,14 +27,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    is_scatter_available,
-    replace_return_docstrings,
-    requires_backends,
-)
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput
 from ...modeling_utils import (
     PreTrainedModel,
@@ -42,7 +34,15 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_scatter_available,
+    logging,
+    replace_return_docstrings,
+    requires_backends,
+)
 from .configuration_tapas import TapasConfig
 
 
diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py
index 6efa758a79..820156312b 100644
--- a/src/transformers/models/tapas/modeling_tf_tapas.py
+++ b/src/transformers/models/tapas/modeling_tf_tapas.py
@@ -23,14 +23,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    is_tensorflow_probability_available,
-    replace_return_docstrings,
-    requires_backends,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutputWithPastAndCrossAttentions,
     TFBaseModelOutputWithPooling,
@@ -47,7 +39,15 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_tensorflow_probability_available,
+    logging,
+    replace_return_docstrings,
+    requires_backends,
+)
 from .configuration_tapas import TapasConfig
 
 
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index 69b960b1d0..dfa451b73a 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -28,7 +28,6 @@ from typing import Callable, Dict, Generator, List, Optional, Text, Tuple, Union
 
 import numpy as np
 
-from ...file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
 from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
@@ -37,7 +36,7 @@ from ...tokenization_utils_base import (
     PreTokenizedInput,
     TextInput,
 )
-from ...utils import logging
+from ...utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available, logging
 
 
 if is_pandas_available():
diff --git a/src/transformers/models/transfo_xl/__init__.py b/src/transformers/models/transfo_xl/__init__.py
index 5e1ec08f1b..ed01124a49 100644
--- a/src/transformers/models/transfo_xl/__init__.py
+++ b/src/transformers/models/transfo_xl/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
index db040a31a8..abde04bd43 100755
--- a/src/transformers/models/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/models/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
@@ -23,10 +23,9 @@ import sys
 import torch
 
 from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl
-from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME
 from transformers.models.transfo_xl import tokenization_transfo_xl as data_utils
 from transformers.models.transfo_xl.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES
-from transformers.utils import logging
+from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
 
 
 logging.set_verbosity_info()
diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py
index b61373698a..7cc64cd6d7 100644
--- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py
@@ -23,12 +23,6 @@ from typing import List, Optional, Tuple, Union
 import numpy as np
 import tensorflow as tf
 
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
 from ...modeling_tf_utils import (
     TFModelInputType,
     TFPreTrainedModel,
@@ -38,7 +32,13 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
 from .configuration_transfo_xl import TransfoXLConfig
 from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
 
diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py
index 4c450cd477..6fbca9218f 100644
--- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py
@@ -25,14 +25,14 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...file_utils import (
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
 )
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
 from .configuration_transfo_xl import TransfoXLConfig
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
 
diff --git a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
index 5eb0c1b41c..115cd4fdcf 100644
--- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
@@ -29,9 +29,8 @@ import numpy as np
 
 import sacremoses as sm
 
-from ...file_utils import cached_path, is_torch_available, torch_only_method
 from ...tokenization_utils import PreTrainedTokenizer
-from ...utils import logging
+from ...utils import cached_path, is_torch_available, logging, torch_only_method
 
 
 if is_torch_available():
diff --git a/src/transformers/models/trocr/__init__.py b/src/transformers/models/trocr/__init__.py
index de419b336b..5f9f462e18 100644
--- a/src/transformers/models/trocr/__init__.py
+++ b/src/transformers/models/trocr/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available, is_speech_available, is_torch_available
+from ...utils import _LazyModule, is_sentencepiece_available, is_speech_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py
index 9b63cf393a..fe5a045a56 100644
--- a/src/transformers/models/trocr/modeling_trocr.py
+++ b/src/transformers/models/trocr/modeling_trocr.py
@@ -25,10 +25,9 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_start_docstrings, replace_return_docstrings
 from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, logging, replace_return_docstrings
 from .configuration_trocr import TrOCRConfig
 
 
diff --git a/src/transformers/models/unispeech/__init__.py b/src/transformers/models/unispeech/__init__.py
index 7df28d26d5..537b125ec0 100644
--- a/src/transformers/models/unispeech/__init__.py
+++ b/src/transformers/models/unispeech/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py
index 33ba5d9685..06e94f8057 100755
--- a/src/transformers/models/unispeech/modeling_unispeech.py
+++ b/src/transformers/models/unispeech/modeling_unispeech.py
@@ -27,17 +27,17 @@ from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import torch_int_div
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
-from ...utils import logging
 from .configuration_unispeech import UniSpeechConfig
 
 
diff --git a/src/transformers/models/unispeech_sat/__init__.py b/src/transformers/models/unispeech_sat/__init__.py
index b1c6a594aa..75a7397ff7 100644
--- a/src/transformers/models/unispeech_sat/__init__.py
+++ b/src/transformers/models/unispeech_sat/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
index 028387fec0..52fe1edcb6 100755
--- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -27,17 +27,17 @@ from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import torch_int_div
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
-from ...utils import logging
 from .configuration_unispeech_sat import UniSpeechSatConfig
 
 
diff --git a/src/transformers/models/van/__init__.py b/src/transformers/models/van/__init__.py
index 2c43a2ea7d..73e2752b1f 100644
--- a/src/transformers/models/van/__init__.py
+++ b/src/transformers/models/van/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/van/modeling_van.py b/src/transformers/models/van/modeling_van.py
index afdd37a606..d251897652 100644
--- a/src/transformers/models/van/modeling_van.py
+++ b/src/transformers/models/van/modeling_van.py
@@ -25,14 +25,14 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
 )
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
 from .configuration_van import VanConfig
 
 
diff --git a/src/transformers/models/vilt/__init__.py b/src/transformers/models/vilt/__init__.py
index 09e5b59c56..7aa27b98de 100644
--- a/src/transformers/models/vilt/__init__.py
+++ b/src/transformers/models/vilt/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/vilt/feature_extraction_vilt.py b/src/transformers/models/vilt/feature_extraction_vilt.py
index 344bf98270..98c4ccc722 100644
--- a/src/transformers/models/vilt/feature_extraction_vilt.py
+++ b/src/transformers/models/vilt/feature_extraction_vilt.py
@@ -20,7 +20,6 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType, is_torch_available
 from ...image_utils import (
     IMAGENET_STANDARD_MEAN,
     IMAGENET_STANDARD_STD,
@@ -28,7 +27,7 @@ from ...image_utils import (
     ImageInput,
     is_torch_tensor,
 )
-from ...utils import logging
+from ...utils import TensorType, is_torch_available, logging
 
 
 if is_torch_available():
diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py
index a885937e2a..36a9fbf9a7 100755
--- a/src/transformers/models/vilt/modeling_vilt.py
+++ b/src/transformers/models/vilt/modeling_vilt.py
@@ -26,7 +26,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
@@ -35,7 +34,7 @@ from ...modeling_outputs import (
     SequenceClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_vilt import ViltConfig
 
 
diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py
index 602d85de68..f7e7a3eda5 100644
--- a/src/transformers/models/vilt/processing_vilt.py
+++ b/src/transformers/models/vilt/processing_vilt.py
@@ -18,9 +18,9 @@ Processor class for ViLT.
 
 from typing import List, Optional, Union
 
-from ...file_utils import TensorType
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType
 
 
 class ViltProcessor(ProcessorMixin):
diff --git a/src/transformers/models/vision_encoder_decoder/__init__.py b/src/transformers/models/vision_encoder_decoder/__init__.py
index 4f900c6cbc..0757f15ec8 100644
--- a/src/transformers/models/vision_encoder_decoder/__init__.py
+++ b/src/transformers/models/vision_encoder_decoder/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
index b462b378fc..83f163e48f 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
@@ -25,10 +25,9 @@ from flax.core.frozen_dict import FrozenDict, unfreeze
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput
 from ...modeling_flax_utils import FlaxPreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM
 from .configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
index 1d63640af0..9b7a000310 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
@@ -22,17 +22,17 @@ from typing import Optional
 import tensorflow as tf
 
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import (
+from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput
+from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, input_processing
+from ...tf_utils import shape_list
+from ...utils import (
     DUMMY_INPUTS,
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput
-from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, input_processing
-from ...tf_utils import shape_list
-from ...utils import logging
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM
 from .configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
index 2f03c00eec..850a554a4d 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -22,10 +22,9 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...configuration_utils import PretrainedConfig
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import Seq2SeqLMOutput
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM
 from .configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
diff --git a/src/transformers/models/vision_text_dual_encoder/__init__.py b/src/transformers/models/vision_text_dual_encoder/__init__.py
index fcc856c22f..4e705cd037 100644
--- a/src/transformers/models/vision_text_dual_encoder/__init__.py
+++ b/src/transformers/models/vision_text_dual_encoder/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_flax_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
index bcd269c3cd..c96e7ae409 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
@@ -22,9 +22,8 @@ import jax
 import jax.numpy as jnp
 from flax.core.frozen_dict import FrozenDict
 
-from ...file_utils import add_start_docstrings
 from ...modeling_flax_utils import FlaxPreTrainedModel, append_replace_return_docstrings, overwrite_call_docstring
-from ...utils import logging
+from ...utils import add_start_docstrings, logging
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_flax_auto import FLAX_MODEL_MAPPING, FlaxAutoModel
 from ..clip.modeling_flax_clip import FlaxCLIPOutput, FlaxCLIPVisionModel
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index cb603922ab..6ec2cf987b 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -20,9 +20,8 @@ from typing import Optional
 import torch
 from torch import nn
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from ..auto.configuration_auto import AutoConfig
 from ..auto.modeling_auto import AutoModel
 from ..clip.modeling_clip import CLIPOutput, CLIPVisionConfig, CLIPVisionModel
diff --git a/src/transformers/models/visual_bert/__init__.py b/src/transformers/models/visual_bert/__init__.py
index 384914c458..444929e151 100644
--- a/src/transformers/models/visual_bert/__init__.py
+++ b/src/transformers/models/visual_bert/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py
index 176fdf53ec..3b90923435 100755
--- a/src/transformers/models/visual_bert/modeling_visual_bert.py
+++ b/src/transformers/models/visual_bert/modeling_visual_bert.py
@@ -25,12 +25,6 @@ from torch import nn
 from torch.nn import CrossEntropyLoss, KLDivLoss, LogSoftmax
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
@@ -43,7 +37,13 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_visual_bert import VisualBertConfig
 
 
diff --git a/src/transformers/models/vit/__init__.py b/src/transformers/models/vit/__init__.py
index ec0990fcca..c0331e27d9 100644
--- a/src/transformers/models/vit/__init__.py
+++ b/src/transformers/models/vit/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py
index fe0f0d824c..0a813a2c9c 100644
--- a/src/transformers/models/vit/feature_extraction_vit.py
+++ b/src/transformers/models/vit/feature_extraction_vit.py
@@ -20,7 +20,6 @@ import numpy as np
 from PIL import Image
 
 from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin
-from ...file_utils import TensorType
 from ...image_utils import (
     IMAGENET_STANDARD_MEAN,
     IMAGENET_STANDARD_STD,
@@ -28,7 +27,7 @@ from ...image_utils import (
     ImageInput,
     is_torch_tensor,
 )
-from ...utils import logging
+from ...utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/vit/modeling_flax_vit.py b/src/transformers/models/vit/modeling_flax_vit.py
index aad3141dda..9340011ae2 100644
--- a/src/transformers/models/vit/modeling_flax_vit.py
+++ b/src/transformers/models/vit/modeling_flax_vit.py
@@ -21,7 +21,6 @@ import jax.numpy as jnp
 from flax.core.frozen_dict import FrozenDict
 from flax.linen.attention import dot_product_attention_weights
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPooling, FlaxSequenceClassifierOutput
 from ...modeling_flax_utils import (
     ACT2FN,
@@ -29,6 +28,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from .configuration_vit import ViTConfig
 
 
diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py
index 9818cf29d1..fdd7e85ce1 100644
--- a/src/transformers/models/vit/modeling_tf_vit.py
+++ b/src/transformers/models/vit/modeling_tf_vit.py
@@ -23,7 +23,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput
 from ...modeling_tf_utils import (
     TFModelInputType,
@@ -34,7 +33,7 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_vit import ViTConfig
 
 
diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py
index 8c89cf9cac..059472616d 100644
--- a/src/transformers/models/vit/modeling_vit.py
+++ b/src/transformers/models/vit/modeling_vit.py
@@ -25,15 +25,15 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput
+from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, MaskedLMOutput, SequenceClassifierOutput
-from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
 from .configuration_vit import ViTConfig
 
 
diff --git a/src/transformers/models/vit_mae/__init__.py b/src/transformers/models/vit_mae/__init__.py
index 7551c5f328..02ad4db319 100644
--- a/src/transformers/models/vit_mae/__init__.py
+++ b/src/transformers/models/vit_mae/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py
index 8b3d0f90dc..d2518a048b 100755
--- a/src/transformers/models/vit_mae/modeling_vit_mae.py
+++ b/src/transformers/models/vit_mae/modeling_vit_mae.py
@@ -27,15 +27,15 @@ import torch.utils.checkpoint
 from torch import nn
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput
+from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
     replace_return_docstrings,
 )
-from ...modeling_outputs import BaseModelOutput
-from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import logging
 from .configuration_vit_mae import ViTMAEConfig
 
 
diff --git a/src/transformers/models/wav2vec2/__init__.py b/src/transformers/models/wav2vec2/__init__.py
index 24d5fb14db..93783b6682 100644
--- a/src/transformers/models/wav2vec2/__init__.py
+++ b/src/transformers/models/wav2vec2/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
index fcfe047166..93497652e1 100644
--- a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
@@ -22,8 +22,7 @@ import numpy as np
 
 from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
 from ...feature_extraction_utils import BatchFeature
-from ...file_utils import PaddingStrategy, TensorType
-from ...utils import logging
+from ...utils import PaddingStrategy, TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
index ae6707c051..6f1021f118 100644
--- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
@@ -27,7 +27,6 @@ from flax.core.frozen_dict import FrozenDict
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from ...file_utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
 from ...modeling_flax_utils import (
     ACT2FN,
@@ -35,7 +34,7 @@ from ...modeling_flax_utils import (
     append_replace_return_docstrings,
     overwrite_call_docstring,
 )
-from ...utils import logging
+from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_wav2vec2 import Wav2Vec2Config
 
 
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index 2701236fd6..4152c38af1 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -23,17 +23,17 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput
 from ...modeling_tf_utils import TFPreTrainedModel, booleans_processing, get_initializer, keras_serializable
 from ...tf_utils import shape_list
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_wav2vec2 import Wav2Vec2Config
 
 
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index ccacb74118..9deed24cca 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -27,13 +27,6 @@ from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     CausalLMOutput,
@@ -43,7 +36,14 @@ from ...modeling_outputs import (
 )
 from ...modeling_utils import PreTrainedModel
 from ...pytorch_utils import torch_int_div
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_wav2vec2 import Wav2Vec2Config
 
 
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index b32559074b..ddec5db3b4 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -24,7 +24,9 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
-from ...file_utils import (
+from ...tokenization_utils import PreTrainedTokenizer, _insert_one_token_to_ordered_list
+from ...tokenization_utils_base import AddedToken, BatchEncoding
+from ...utils import (
     ModelOutput,
     PaddingStrategy,
     TensorType,
@@ -32,11 +34,9 @@ from ...file_utils import (
     is_flax_available,
     is_tf_available,
     is_torch_available,
+    logging,
     to_py_obj,
 )
-from ...tokenization_utils import PreTrainedTokenizer, _insert_one_token_to_ordered_list
-from ...tokenization_utils_base import AddedToken, BatchEncoding
-from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/wav2vec2_phoneme/__init__.py b/src/transformers/models/wav2vec2_phoneme/__init__.py
index 7d66bee633..4d6ea18a33 100644
--- a/src/transformers/models/wav2vec2_phoneme/__init__.py
+++ b/src/transformers/models/wav2vec2_phoneme/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule
+from ...utils import _LazyModule
 
 
 # fmt: off
diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
index d6f8df3d81..b37f902c34 100644
--- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
+++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
@@ -23,17 +23,17 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 
-from ...file_utils import (
+from ...tokenization_utils import PreTrainedTokenizer, _insert_one_token_to_ordered_list
+from ...tokenization_utils_base import AddedToken
+from ...utils import (
     ModelOutput,
     is_flax_available,
     is_tf_available,
     is_torch_available,
+    logging,
     requires_backends,
     to_py_obj,
 )
-from ...tokenization_utils import PreTrainedTokenizer, _insert_one_token_to_ordered_list
-from ...tokenization_utils_base import AddedToken
-from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/wav2vec2_with_lm/__init__.py b/src/transformers/models/wav2vec2_with_lm/__init__.py
index d8fce39364..8730f3508e 100644
--- a/src/transformers/models/wav2vec2_with_lm/__init__.py
+++ b/src/transformers/models/wav2vec2_with_lm/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule
+from ...utils import _LazyModule
 
 
 # fmt: off
diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
index 33dbfbb176..4e7da07526 100644
--- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
+++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -23,8 +23,8 @@ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union
 
 import numpy as np
 
-from ...file_utils import ModelOutput, requires_backends
 from ...processing_utils import ProcessorMixin
+from ...utils import ModelOutput, requires_backends
 
 
 if TYPE_CHECKING:
diff --git a/src/transformers/models/wavlm/__init__.py b/src/transformers/models/wavlm/__init__.py
index c9ee046d4e..576bbaf83c 100644
--- a/src/transformers/models/wavlm/__init__.py
+++ b/src/transformers/models/wavlm/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py
index 08e789b700..7c99a5d780 100755
--- a/src/transformers/models/wavlm/modeling_wavlm.py
+++ b/src/transformers/models/wavlm/modeling_wavlm.py
@@ -28,16 +28,16 @@ from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
 from ...deepspeed import is_deepspeed_zero3_enabled
-from ...file_utils import (
+from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import torch_int_div
+from ...utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    logging,
 )
-from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, TokenClassifierOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import torch_int_div
-from ...utils import logging
 from .configuration_wavlm import WavLMConfig
 
 
diff --git a/src/transformers/models/xglm/__init__.py b/src/transformers/models/xglm/__init__.py
index 2b19d26f8d..ddc79c6787 100644
--- a/src/transformers/models/xglm/__init__.py
+++ b/src/transformers/models/xglm/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_flax_available,
     is_sentencepiece_available,
diff --git a/src/transformers/models/xglm/modeling_flax_xglm.py b/src/transformers/models/xglm/modeling_flax_xglm.py
index dd7e1f0da9..01f8f33a09 100644
--- a/src/transformers/models/xglm/modeling_flax_xglm.py
+++ b/src/transformers/models/xglm/modeling_flax_xglm.py
@@ -31,13 +31,12 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutputWithPastAndCrossAttentions,
     FlaxCausalLMOutputWithCrossAttentions,
 )
 from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
-from ...utils import logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_xglm import XGLMConfig
 
 
diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py
index 6a7f9347b3..0dda424d54 100755
--- a/src/transformers/models/xglm/modeling_xglm.py
+++ b/src/transformers/models/xglm/modeling_xglm.py
@@ -25,10 +25,9 @@ from torch import nn
 from torch.nn import CrossEntropyLoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_xglm import XGLMConfig
 
 
diff --git a/src/transformers/models/xglm/tokenization_xglm_fast.py b/src/transformers/models/xglm/tokenization_xglm_fast.py
index 9280838a8f..a0d4cebafe 100644
--- a/src/transformers/models/xglm/tokenization_xglm_fast.py
+++ b/src/transformers/models/xglm/tokenization_xglm_fast.py
@@ -18,9 +18,8 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/xlm/__init__.py b/src/transformers/models/xlm/__init__.py
index 7de227e87a..f0a42e244e 100644
--- a/src/transformers/models/xlm/__init__.py
+++ b/src/transformers/models/xlm/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
index 99c837765c..4221cdfc90 100755
--- a/src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py
@@ -21,9 +21,8 @@ import json
 import numpy
 import torch
 
-from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME
 from transformers.models.xlm.tokenization_xlm import VOCAB_FILES_NAMES
-from transformers.utils import logging
+from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
 
 
 logging.set_verbosity_info()
diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py
index ce975327da..f677aa9eb5 100644
--- a/src/transformers/models/xlm/modeling_tf_xlm.py
+++ b/src/transformers/models/xlm/modeling_tf_xlm.py
@@ -25,13 +25,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-)
 from ...modeling_tf_outputs import (
     TFBaseModelOutput,
     TFMultipleChoiceModelOutput,
@@ -52,7 +45,14 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
 from .configuration_xlm import XLMConfig
 
 
diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py
index 2b26dd3fbe..4549089dea 100755
--- a/src/transformers/models/xlm/modeling_xlm.py
+++ b/src/transformers/models/xlm/modeling_xlm.py
@@ -27,13 +27,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import gelu
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutput,
     MaskedLMOutput,
@@ -50,7 +43,14 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_xlm import XLMConfig
 
 
diff --git a/src/transformers/models/xlm_prophetnet/__init__.py b/src/transformers/models/xlm_prophetnet/__init__.py
index cd19330301..fe69b50607 100644
--- a/src/transformers/models/xlm_prophetnet/__init__.py
+++ b/src/transformers/models/xlm_prophetnet/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_sentencepiece_available, is_torch_available
+from ...utils import _LazyModule, is_sentencepiece_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/xlm_roberta/__init__.py b/src/transformers/models/xlm_roberta/__init__.py
index b854816ea7..a29a400c8b 100644
--- a/src/transformers/models/xlm_roberta/__init__.py
+++ b/src/transformers/models/xlm_roberta/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_flax_available,
     is_sentencepiece_available,
diff --git a/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py
index a6e0d47642..19c25346b6 100644
--- a/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """Flax XLM-RoBERTa model."""
 
-from ...file_utils import add_start_docstrings
-from ...utils import logging
+from ...utils import add_start_docstrings, logging
 from ..roberta.modeling_flax_roberta import (
     FlaxRobertaForMaskedLM,
     FlaxRobertaForMultipleChoice,
diff --git a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py
index c699676672..0edd4158fb 100644
--- a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """ TF 2.0 XLM-RoBERTa model."""
 
-from ...file_utils import add_start_docstrings
-from ...utils import logging
+from ...utils import add_start_docstrings, logging
 from ..roberta.modeling_tf_roberta import (
     TFRobertaForCausalLM,
     TFRobertaForMaskedLM,
diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
index 2221502ff5..fb881bb47c 100644
--- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py
@@ -15,8 +15,7 @@
 # limitations under the License.
 """PyTorch XLM-RoBERTa model."""
 
-from ...file_utils import add_start_docstrings
-from ...utils import logging
+from ...utils import add_start_docstrings, logging
 from ..roberta.modeling_roberta import (
     RobertaForCausalLM,
     RobertaForMaskedLM,
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
index 62663347f8..119d2fa080 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
@@ -19,10 +19,9 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils import AddedToken
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/xlm_roberta_xl/__init__.py b/src/transformers/models/xlm_roberta_xl/__init__.py
index 8a3988b81c..765a235f29 100644
--- a/src/transformers/models/xlm_roberta_xl/__init__.py
+++ b/src/transformers/models/xlm_roberta_xl/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
index 816c21f6a5..0ff283ae30 100644
--- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
+++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
@@ -24,12 +24,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN, gelu
-from ...file_utils import (
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
@@ -46,7 +40,13 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_xlm_roberta_xl import XLMRobertaXLConfig
 
 
diff --git a/src/transformers/models/xlnet/__init__.py b/src/transformers/models/xlnet/__init__.py
index 22303316f3..599448a271 100644
--- a/src/transformers/models/xlnet/__init__.py
+++ b/src/transformers/models/xlnet/__init__.py
@@ -18,7 +18,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...file_utils import (
+from ...utils import (
     _LazyModule,
     is_sentencepiece_available,
     is_tf_available,
diff --git a/src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py
index c2cabde0be..f6fc73ca0e 100755
--- a/src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py
@@ -27,8 +27,7 @@ from transformers import (
     XLNetLMHeadModel,
     load_tf_weights_in_xlnet,
 )
-from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME
-from transformers.utils import logging
+from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
 
 
 GLUE_TASKS_NUM_LABELS = {
diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py
index ad1a29a635..c164df4155 100644
--- a/src/transformers/models/xlnet/modeling_tf_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py
@@ -25,14 +25,6 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
-    MULTIPLE_CHOICE_DUMMY_INPUTS,
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_tf_utils import (
     TFCausalLanguageModelingLoss,
     TFModelInputType,
@@ -48,7 +40,15 @@ from ...modeling_tf_utils import (
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import logging
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_xlnet import XLNetConfig
 
 
diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py
index 44a2966148..e490365104 100755
--- a/src/transformers/models/xlnet/modeling_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_xlnet.py
@@ -25,13 +25,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
-    ModelOutput,
-    add_code_sample_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
 from ...modeling_utils import (
     PoolerAnswerClass,
     PoolerEndLogits,
@@ -40,7 +33,14 @@ from ...modeling_utils import (
     SequenceSummary,
     apply_chunking_to_forward,
 )
-from ...utils import logging
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_xlnet import XLNetConfig
 
 
diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py
index 0dc7d9e72c..920a9f5cb7 100644
--- a/src/transformers/models/xlnet/tokenization_xlnet.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet.py
@@ -22,9 +22,8 @@ from typing import Any, Dict, List, Optional, Tuple
 
 import sentencepiece as spm
 
-from ...file_utils import SPIECE_UNDERLINE
 from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...utils import logging
+from ...utils import SPIECE_UNDERLINE, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py
index 863f73a59b..c27c5262f9 100644
--- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py
@@ -19,10 +19,9 @@ import os
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
-from ...file_utils import is_sentencepiece_available
 from ...tokenization_utils import AddedToken
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
+from ...utils import is_sentencepiece_available, logging
 
 
 if is_sentencepiece_available():
diff --git a/src/transformers/models/yoso/__init__.py b/src/transformers/models/yoso/__init__.py
index 6b1c7eb5ae..5dff89595c 100644
--- a/src/transformers/models/yoso/__init__.py
+++ b/src/transformers/models/yoso/__init__.py
@@ -18,7 +18,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py
index 2b256f1c8d..5914454f5c 100644
--- a/src/transformers/models/yoso/modeling_yoso.py
+++ b/src/transformers/models/yoso/modeling_yoso.py
@@ -26,7 +26,6 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_outputs import (
     BaseModelOutputWithCrossAttentions,
     MaskedLMOutput,
@@ -41,7 +40,7 @@ from ...modeling_utils import (
     find_pruneable_heads_and_indices,
     prune_linear_layer,
 )
-from ...utils import logging
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_yoso import YosoConfig
 
 
diff --git a/src/transformers/onnx/__init__.py b/src/transformers/onnx/__init__.py
index ddd8d9a2ab..4491aefba9 100644
--- a/src/transformers/onnx/__init__.py
+++ b/src/transformers/onnx/__init__.py
@@ -15,7 +15,7 @@
 
 from typing import TYPE_CHECKING
 
-from ..file_utils import _LazyModule
+from ..utils import _LazyModule
 
 
 _import_structure = {
diff --git a/src/transformers/onnx/config.py b/src/transformers/onnx/config.py
index 91cfee0e07..8a7437c45f 100644
--- a/src/transformers/onnx/config.py
+++ b/src/transformers/onnx/config.py
@@ -21,8 +21,7 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Mapping,
 import numpy as np
 from packaging import version
 
-from ..file_utils import TensorType, is_torch_available, is_vision_available
-from ..utils import logging
+from ..utils import TensorType, is_torch_available, is_vision_available, logging
 from .utils import ParameterFormat, compute_effective_axis_dimension, compute_serialized_parameters_size
 
 
@@ -204,7 +203,7 @@ class OnnxConfig(ABC):
             `bool`: Whether the installed version of PyTorch is compatible with the model.
         """
         if is_torch_available():
-            from transformers.file_utils import torch_version
+            from transformers.utils import torch_version
 
             return torch_version >= self.torch_onnx_minimum_version
         else:
diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py
index cb646948a8..69aca2a43a 100644
--- a/src/transformers/onnx/convert.py
+++ b/src/transformers/onnx/convert.py
@@ -21,9 +21,14 @@ from typing import TYPE_CHECKING, Iterable, List, Tuple, Union
 import numpy as np
 from packaging.version import Version, parse
 
-from ..file_utils import TensorType, is_tf_available, is_torch_available, is_torch_onnx_dict_inputs_support_available
 from ..tokenization_utils_base import PreTrainedTokenizerBase
-from ..utils import logging
+from ..utils import (
+    TensorType,
+    is_tf_available,
+    is_torch_available,
+    is_torch_onnx_dict_inputs_support_available,
+    logging,
+)
 from .config import OnnxConfig
 
 
@@ -294,7 +299,7 @@ def export(
         preprocessor = tokenizer
 
     if is_torch_available():
-        from ..file_utils import torch_version
+        from ..utils import torch_version
 
         if not is_torch_onnx_dict_inputs_support_available():
             raise AssertionError(f"Unsupported PyTorch version, minimum required is 1.8.0, got: {torch_version}")
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 94d422f3ab..fef0813b22 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -25,12 +25,11 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 from ..configuration_utils import PretrainedConfig
 from ..feature_extraction_utils import PreTrainedFeatureExtractor
-from ..file_utils import http_get, is_tf_available, is_torch_available
 from ..models.auto.configuration_auto import AutoConfig
 from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
 from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
 from ..tokenization_utils import PreTrainedTokenizer
-from ..utils import logging
+from ..utils import http_get, is_tf_available, is_torch_available, logging
 from .audio_classification import AudioClassificationPipeline
 from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline
 from .base import (
diff --git a/src/transformers/pipelines/audio_classification.py b/src/transformers/pipelines/audio_classification.py
index 3957d82beb..bb96a66d0e 100644
--- a/src/transformers/pipelines/audio_classification.py
+++ b/src/transformers/pipelines/audio_classification.py
@@ -16,8 +16,7 @@ from typing import Union
 
 import numpy as np
 
-from ..file_utils import add_end_docstrings, is_torch_available
-from ..utils import logging
+from ..utils import add_end_docstrings, is_torch_available, logging
 from .base import PIPELINE_INIT_ARGS, Pipeline
 
 
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index a93569c5d2..1b92231568 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -16,8 +16,7 @@ from typing import TYPE_CHECKING, Dict, Optional, Union
 
 import numpy as np
 
-from ..file_utils import is_torch_available
-from ..utils import logging
+from ..utils import is_torch_available, logging
 from .audio_utils import ffmpeg_read
 from .base import ChunkPipeline
 
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 2f966337e6..d54a17df1e 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -30,11 +30,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 from packaging import version
 
 from ..feature_extraction_utils import PreTrainedFeatureExtractor
-from ..file_utils import ModelOutput, add_end_docstrings, is_tf_available, is_torch_available
 from ..modelcard import ModelCard
 from ..models.auto.configuration_auto import AutoConfig
 from ..tokenization_utils import PreTrainedTokenizer
-from ..utils import logging
+from ..utils import ModelOutput, add_end_docstrings, is_tf_available, is_torch_available, logging
 
 
 GenericTensor = Union[List["GenericTensor"], "torch.Tensor", "tf.Tensor"]
diff --git a/src/transformers/pipelines/conversational.py b/src/transformers/pipelines/conversational.py
index ab28bc2e65..0f8a41ebfd 100644
--- a/src/transformers/pipelines/conversational.py
+++ b/src/transformers/pipelines/conversational.py
@@ -1,8 +1,7 @@
 import uuid
 from typing import Any, Dict, List, Optional, Union
 
-from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
-from ..utils import logging
+from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
 from .base import PIPELINE_INIT_ARGS, Pipeline
 
 
diff --git a/src/transformers/pipelines/fill_mask.py b/src/transformers/pipelines/fill_mask.py
index 4a81d66fd4..4a0c155576 100644
--- a/src/transformers/pipelines/fill_mask.py
+++ b/src/transformers/pipelines/fill_mask.py
@@ -2,8 +2,7 @@ from typing import Dict
 
 import numpy as np
 
-from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
-from ..utils import logging
+from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
 from .base import PIPELINE_INIT_ARGS, GenericTensor, Pipeline, PipelineException
 
 
diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py
index 2afb084b91..d39a188168 100644
--- a/src/transformers/pipelines/image_classification.py
+++ b/src/transformers/pipelines/image_classification.py
@@ -1,13 +1,13 @@
 from typing import List, Union
 
-from ..file_utils import (
+from ..utils import (
     add_end_docstrings,
     is_tf_available,
     is_torch_available,
     is_vision_available,
+    logging,
     requires_backends,
 )
-from ..utils import logging
 from .base import PIPELINE_INIT_ARGS, Pipeline
 
 
diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index 923a99ae9c..a33095cfc2 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -2,8 +2,7 @@ from typing import Any, Dict, List, Union
 
 import numpy as np
 
-from ..file_utils import add_end_docstrings, is_torch_available, is_vision_available, requires_backends
-from ..utils import logging
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
 from .base import PIPELINE_INIT_ARGS, Pipeline
 
 
diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py
index 2d801ee252..f553d28c35 100644
--- a/src/transformers/pipelines/object_detection.py
+++ b/src/transformers/pipelines/object_detection.py
@@ -1,7 +1,6 @@
 from typing import Any, Dict, List, Union
 
-from ..file_utils import add_end_docstrings, is_torch_available, is_vision_available, requires_backends
-from ..utils import logging
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
 from .base import PIPELINE_INIT_ARGS, Pipeline
 
 
diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py
index efab83b92f..7ef33b1706 100644
--- a/src/transformers/pipelines/question_answering.py
+++ b/src/transformers/pipelines/question_answering.py
@@ -5,10 +5,9 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 import numpy as np
 
 from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features
-from ..file_utils import PaddingStrategy, add_end_docstrings, is_tf_available, is_torch_available
 from ..modelcard import ModelCard
 from ..tokenization_utils import PreTrainedTokenizer
-from ..utils import logging
+from ..utils import PaddingStrategy, add_end_docstrings, is_tf_available, is_torch_available, logging
 from .base import PIPELINE_INIT_ARGS, ArgumentHandler, ChunkPipeline
 
 
diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py
index f37eb16603..0e2df2cd8a 100644
--- a/src/transformers/pipelines/table_question_answering.py
+++ b/src/transformers/pipelines/table_question_answering.py
@@ -3,7 +3,7 @@ import types
 
 import numpy as np
 
-from ..file_utils import (
+from ..utils import (
     add_end_docstrings,
     is_tensorflow_probability_available,
     is_tf_available,
diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py
index 6b194a5cf9..ee9be76f44 100644
--- a/src/transformers/pipelines/text2text_generation.py
+++ b/src/transformers/pipelines/text2text_generation.py
@@ -1,8 +1,7 @@
 import enum
 
-from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
 from ..tokenization_utils import TruncationStrategy
-from ..utils import logging
+from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
 from .base import PIPELINE_INIT_ARGS, Pipeline
 
 
diff --git a/src/transformers/pipelines/text_classification.py b/src/transformers/pipelines/text_classification.py
index c2730304c1..3d3f4e533d 100644
--- a/src/transformers/pipelines/text_classification.py
+++ b/src/transformers/pipelines/text_classification.py
@@ -2,7 +2,7 @@ from typing import Dict
 
 import numpy as np
 
-from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
+from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
 from .base import PIPELINE_INIT_ARGS, GenericTensor, Pipeline
 
 
diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py
index 0c1d0093b7..3e7a34bfec 100644
--- a/src/transformers/pipelines/text_generation.py
+++ b/src/transformers/pipelines/text_generation.py
@@ -2,7 +2,7 @@ import enum
 
 from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, TF_MODEL_FOR_CAUSAL_LM_MAPPING
 
-from ..file_utils import add_end_docstrings, is_tf_available
+from ..utils import add_end_docstrings, is_tf_available
 from .base import PIPELINE_INIT_ARGS, Pipeline
 
 
diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py
index 56fe453dfb..4ea8d11415 100644
--- a/src/transformers/pipelines/token_classification.py
+++ b/src/transformers/pipelines/token_classification.py
@@ -4,8 +4,8 @@ from typing import List, Optional, Tuple, Union
 
 import numpy as np
 
-from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
 from ..models.bert.tokenization_bert import BasicTokenizer
+from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
 from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Dataset, Pipeline
 
 
diff --git a/src/transformers/pipelines/zero_shot_classification.py b/src/transformers/pipelines/zero_shot_classification.py
index bcc4c22f02..9d5d5bd61b 100644
--- a/src/transformers/pipelines/zero_shot_classification.py
+++ b/src/transformers/pipelines/zero_shot_classification.py
@@ -2,9 +2,8 @@ from typing import List, Union
 
 import numpy as np
 
-from ..file_utils import add_end_docstrings
 from ..tokenization_utils import TruncationStrategy
-from ..utils import logging
+from ..utils import add_end_docstrings, logging
 from .base import PIPELINE_INIT_ARGS, ArgumentHandler, ChunkPipeline
 
 
diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py
index 859d942b23..951f11ef0d 100644
--- a/src/transformers/pipelines/zero_shot_image_classification.py
+++ b/src/transformers/pipelines/zero_shot_image_classification.py
@@ -1,13 +1,13 @@
 from typing import List, Union
 
-from ..file_utils import (
+from ..utils import (
     add_end_docstrings,
     is_tf_available,
     is_torch_available,
     is_vision_available,
+    logging,
     requires_backends,
 )
-from ..utils import logging
 from .base import PIPELINE_INIT_ARGS, ChunkPipeline
 
 
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index dad3d5c7d6..f4fdad5c63 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -21,9 +21,8 @@ import os
 from pathlib import Path
 
 from .dynamic_module_utils import custom_object_save
-from .file_utils import PushToHubMixin, copy_func
 from .tokenization_utils_base import PreTrainedTokenizerBase
-from .utils import logging
+from .utils import PushToHubMixin, copy_func, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/sagemaker/training_args_sm.py b/src/transformers/sagemaker/training_args_sm.py
index 0a01c1dc0f..f2b0ee8f5e 100644
--- a/src/transformers/sagemaker/training_args_sm.py
+++ b/src/transformers/sagemaker/training_args_sm.py
@@ -20,9 +20,8 @@ from dataclasses import dataclass, field
 
 import torch
 
-from transformers.file_utils import cached_property, is_sagemaker_dp_enabled
 from transformers.training_args import TrainingArguments
-from transformers.utils import logging
+from transformers.utils import cached_property, is_sagemaker_dp_enabled, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index e06e76d896..ec681597f0 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -30,7 +30,8 @@ from unittest import mock
 from transformers import logging as transformers_logging
 
 from .deepspeed import is_deepspeed_available
-from .file_utils import (
+from .integrations import is_optuna_available, is_ray_available, is_sigopt_available, is_wandb_available
+from .utils import (
     is_detectron2_available,
     is_faiss_available,
     is_flax_available,
@@ -60,7 +61,6 @@ from .file_utils import (
     is_torchaudio_available,
     is_vision_available,
 )
-from .integrations import is_optuna_available, is_ray_available, is_sigopt_available, is_wandb_available
 
 
 SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index fef4c4a623..694b55cedd 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -23,7 +23,6 @@ import unicodedata
 from collections import OrderedDict
 from typing import Any, Dict, List, Optional, Tuple, Union, overload
 
-from .file_utils import PaddingStrategy, TensorType, add_end_docstrings
 from .tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
     ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
@@ -39,7 +38,7 @@ from .tokenization_utils_base import (
     TextInputPair,
     TruncationStrategy,
 )
-from .utils import logging
+from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index ee7722b376..c2a2b7bb6c 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -35,7 +35,7 @@ from requests import HTTPError
 
 from . import __version__
 from .dynamic_module_utils import custom_object_save
-from .file_utils import (
+from .utils import (
     EntryNotFoundError,
     ExplicitEnum,
     PaddingStrategy,
@@ -43,11 +43,6 @@ from .file_utils import (
     RepositoryNotFoundError,
     RevisionNotFoundError,
     TensorType,
-    _is_jax,
-    _is_numpy,
-    _is_tensorflow,
-    _is_torch,
-    _is_torch_device,
     add_end_docstrings,
     cached_path,
     copy_func,
@@ -59,10 +54,11 @@ from .file_utils import (
     is_tf_available,
     is_tokenizers_available,
     is_torch_available,
+    logging,
     to_py_obj,
     torch_required,
 )
-from .utils import logging
+from .utils.generic import _is_jax, _is_numpy, _is_tensorflow, _is_torch, _is_torch_device
 
 
 if TYPE_CHECKING:
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index d14b02d11f..3a47cc2651 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -27,7 +27,6 @@ from tokenizers.decoders import Decoder as DecoderFast
 from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
 
 from .convert_slow_tokenizer import convert_slow_tokenizer
-from .file_utils import PaddingStrategy, add_end_docstrings
 from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_utils_base import (
     INIT_TOKENIZER_DOCSTRING,
@@ -41,7 +40,7 @@ from .tokenization_utils_base import (
     TextInputPair,
     TruncationStrategy,
 )
-from .utils import logging
+from .utils import PaddingStrategy, add_end_docstrings, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index f03bb5f66d..1bf6fde9fc 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -64,17 +64,6 @@ from .data.data_collator import DataCollator, DataCollatorWithPadding, default_d
 from .debug_utils import DebugOption, DebugUnderflowOverflow
 from .deepspeed import deepspeed_init, deepspeed_reinit, is_deepspeed_zero3_enabled
 from .dependency_versions_check import dep_version_check
-from .file_utils import (
-    CONFIG_NAME,
-    WEIGHTS_NAME,
-    get_full_repo_name,
-    is_apex_available,
-    is_datasets_available,
-    is_in_notebook,
-    is_sagemaker_dp_enabled,
-    is_sagemaker_mp_enabled,
-    is_torch_tpu_available,
-)
 from .modelcard import TrainingSummary
 from .modeling_utils import PreTrainedModel, unwrap_model
 from .models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
@@ -131,7 +120,18 @@ from .trainer_utils import (
     speed_metrics,
 )
 from .training_args import OptimizerNames, ParallelMode, TrainingArguments
-from .utils import logging
+from .utils import (
+    CONFIG_NAME,
+    WEIGHTS_NAME,
+    get_full_repo_name,
+    is_apex_available,
+    is_datasets_available,
+    is_in_notebook,
+    is_sagemaker_dp_enabled,
+    is_sagemaker_mp_enabled,
+    is_torch_tpu_available,
+    logging,
+)
 
 
 _is_torch_generator_available = False
diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
index 58efc2e4b1..63842d4849 100644
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -34,14 +34,14 @@ from torch import nn
 from torch.utils.data import Dataset, IterableDataset, RandomSampler, Sampler
 from torch.utils.data.distributed import DistributedSampler
 
-from .file_utils import (
+from .tokenization_utils_base import BatchEncoding
+from .utils import (
     is_sagemaker_dp_enabled,
     is_sagemaker_mp_enabled,
     is_torch_tpu_available,
     is_training_run_on_sagemaker,
+    logging,
 )
-from .tokenization_utils_base import BatchEncoding
-from .utils import logging
 
 
 if is_sagemaker_dp_enabled():
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index afcabb32cc..71c2e691d2 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -19,7 +19,7 @@ import os
 import warnings
 from typing import Callable, Dict, Optional, Tuple
 
-from .file_utils import ENV_VARS_TRUE_VALUES
+from .utils import ENV_VARS_TRUE_VALUES
 
 
 # Integrations must be imported before ML frameworks:
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index e4b896edbd..04f5dc1058 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -29,7 +29,7 @@ from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
 
 import numpy as np
 
-from .file_utils import (
+from .utils import (
     ExplicitEnum,
     is_psutil_available,
     is_sagemaker_dp_enabled,
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 52497a15ce..d8d0cfa124 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -23,7 +23,8 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional
 
 from .debug_utils import DebugOption
-from .file_utils import (
+from .trainer_utils import EvaluationStrategy, HubStrategy, IntervalStrategy, SchedulerType, ShardedDDPOption
+from .utils import (
     ExplicitEnum,
     cached_property,
     get_full_repo_name,
@@ -33,10 +34,9 @@ from .file_utils import (
     is_torch_bf16_available,
     is_torch_tf32_available,
     is_torch_tpu_available,
+    logging,
     torch_required,
 )
-from .trainer_utils import EvaluationStrategy, HubStrategy, IntervalStrategy, SchedulerType, ShardedDDPOption
-from .utils import logging
 
 
 if is_torch_available():
diff --git a/src/transformers/training_args_seq2seq.py b/src/transformers/training_args_seq2seq.py
index 1b6be72654..ef3ccdf260 100644
--- a/src/transformers/training_args_seq2seq.py
+++ b/src/transformers/training_args_seq2seq.py
@@ -16,8 +16,8 @@ import logging
 from dataclasses import dataclass, field
 from typing import Optional
 
-from .file_utils import add_start_docstrings
 from .training_args import TrainingArguments
+from .utils import add_start_docstrings
 
 
 logger = logging.getLogger(__name__)
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index 397e288fca..4f3c41e2ca 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -16,9 +16,8 @@ import warnings
 from dataclasses import dataclass, field
 from typing import Tuple
 
-from .file_utils import cached_property, is_tf_available, tf_required
 from .training_args import TrainingArguments
-from .utils import logging
+from .utils import cached_property, is_tf_available, logging, tf_required
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 4c598415d5..b8c6cb65af 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -1,5 +1,10 @@
 #!/usr/bin/env python
 # coding=utf-8
+
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
 # Copyright 2021 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,6 +22,135 @@
 from packaging import version
 
 from .. import __version__
+from .doc import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    copy_func,
+    replace_return_docstrings,
+)
+from .generic import (
+    ContextManagers,
+    ExplicitEnum,
+    ModelOutput,
+    PaddingStrategy,
+    TensorType,
+    cached_property,
+    is_tensor,
+    to_numpy,
+    to_py_obj,
+)
+from .hub import (
+    CLOUDFRONT_DISTRIB_PREFIX,
+    DISABLE_TELEMETRY,
+    HF_MODULES_CACHE,
+    HUGGINGFACE_CO_PREFIX,
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    PYTORCH_PRETRAINED_BERT_CACHE,
+    PYTORCH_TRANSFORMERS_CACHE,
+    S3_BUCKET_PREFIX,
+    TRANSFORMERS_CACHE,
+    TRANSFORMERS_DYNAMIC_MODULE_NAME,
+    EntryNotFoundError,
+    PushToHubMixin,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+    cached_path,
+    default_cache_path,
+    define_sagemaker_information,
+    filename_to_url,
+    get_cached_models,
+    get_file_from_repo,
+    get_from_cache,
+    get_full_repo_name,
+    get_list_of_files,
+    has_file,
+    hf_bucket_url,
+    http_get,
+    http_user_agent,
+    is_local_clone,
+    is_offline_mode,
+    is_remote_url,
+    url_to_filename,
+)
+from .import_utils import (
+    ENV_VARS_TRUE_AND_AUTO_VALUES,
+    ENV_VARS_TRUE_VALUES,
+    TORCH_FX_REQUIRED_VERSION,
+    USE_JAX,
+    USE_TF,
+    USE_TORCH,
+    DummyObject,
+    _LazyModule,
+    is_apex_available,
+    is_coloredlogs_available,
+    is_datasets_available,
+    is_detectron2_available,
+    is_faiss_available,
+    is_flax_available,
+    is_ftfy_available,
+    is_in_notebook,
+    is_librosa_available,
+    is_onnx_available,
+    is_pandas_available,
+    is_phonemizer_available,
+    is_protobuf_available,
+    is_psutil_available,
+    is_py3nvml_available,
+    is_pyctcdecode_available,
+    is_pytesseract_available,
+    is_pytorch_quantization_available,
+    is_rjieba_available,
+    is_sagemaker_dp_enabled,
+    is_sagemaker_mp_enabled,
+    is_scatter_available,
+    is_scipy_available,
+    is_sentencepiece_available,
+    is_sklearn_available,
+    is_soundfile_availble,
+    is_spacy_available,
+    is_speech_available,
+    is_tensorflow_probability_available,
+    is_tf2onnx_available,
+    is_tf_available,
+    is_timm_available,
+    is_tokenizers_available,
+    is_torch_available,
+    is_torch_bf16_available,
+    is_torch_cuda_available,
+    is_torch_fx_available,
+    is_torch_fx_proxy,
+    is_torch_onnx_dict_inputs_support_available,
+    is_torch_tf32_available,
+    is_torch_tpu_available,
+    is_torchaudio_available,
+    is_training_run_on_sagemaker,
+    is_vision_available,
+    requires_backends,
+    tf_required,
+    torch_only_method,
+    torch_required,
+    torch_version,
+)
+
+
+WEIGHTS_NAME = "pytorch_model.bin"
+TF2_WEIGHTS_NAME = "tf_model.h5"
+TF_WEIGHTS_NAME = "model.ckpt"
+FLAX_WEIGHTS_NAME = "flax_model.msgpack"
+CONFIG_NAME = "config.json"
+FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
+MODEL_CARD_NAME = "modelcard.json"
+
+SENTENCEPIECE_UNDERLINE = "▁"
+SPIECE_UNDERLINE = SENTENCEPIECE_UNDERLINE  # Kept for backward compatibility
+
+MULTIPLE_CHOICE_DUMMY_INPUTS = [
+    [[0, 1, 0, 1], [1, 0, 0, 1]]
+] * 2  # Needs to have 0s and 1s only since XLM uses it for langs too.
+DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
 
 
 def check_min_version(min_version):
diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py
new file mode 100644
index 0000000000..fdb65d6de5
--- /dev/null
+++ b/src/transformers/utils/doc.py
@@ -0,0 +1,936 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Doc utilities: Utilities related to documentation
+"""
+
+import functools
+import re
+import types
+
+
+def add_start_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
+        return fn
+
+    return docstring_decorator
+
+
+def add_start_docstrings_to_model_forward(*docstr):
+    def docstring_decorator(fn):
+        docstring = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
+        class_name = f"[`{fn.__qualname__.split('.')[0]}`]"
+        intro = f"   The {class_name} forward method, overrides the `__call__` special method."
+        note = r"""
+
+    <Tip>
+
+    Although the recipe for forward pass needs to be defined within this function, one should call the [`Module`]
+    instance afterwards instead of this since the former takes care of running the pre and post processing steps while
+    the latter silently ignores them.
+
+    </Tip>
+"""
+
+        fn.__doc__ = intro + note + docstring
+        return fn
+
+    return docstring_decorator
+
+
+def add_end_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = (fn.__doc__ if fn.__doc__ is not None else "") + "".join(docstr)
+        return fn
+
+    return docstring_decorator
+
+
+PT_RETURN_INTRODUCTION = r"""
+    Returns:
+        [`{full_output_type}`] or `tuple(torch.FloatTensor)`: A [`{full_output_type}`] or a tuple of
+        `torch.FloatTensor` (if `return_dict=False` is passed or when `config.return_dict=False`) comprising various
+        elements depending on the configuration ([`{config_class}`]) and inputs.
+
+"""
+
+
+TF_RETURN_INTRODUCTION = r"""
+    Returns:
+        [`{full_output_type}`] or `tuple(tf.Tensor)`: A [`{full_output_type}`] or a tuple of `tf.Tensor` (if
+        `return_dict=False` is passed or when `config.return_dict=False`) comprising various elements depending on the
+        configuration ([`{config_class}`]) and inputs.
+
+"""
+
+
+def _get_indent(t):
+    """Returns the indentation in the first line of t"""
+    search = re.search(r"^(\s*)\S", t)
+    return "" if search is None else search.groups()[0]
+
+
+def _convert_output_args_doc(output_args_doc):
+    """Convert output_args_doc to display properly."""
+    # Split output_arg_doc in blocks argument/description
+    indent = _get_indent(output_args_doc)
+    blocks = []
+    current_block = ""
+    for line in output_args_doc.split("\n"):
+        # If the indent is the same as the beginning, the line is the name of new arg.
+        if _get_indent(line) == indent:
+            if len(current_block) > 0:
+                blocks.append(current_block[:-1])
+            current_block = f"{line}\n"
+        else:
+            # Otherwise it's part of the description of the current arg.
+            # We need to remove 2 spaces to the indentation.
+            current_block += f"{line[2:]}\n"
+    blocks.append(current_block[:-1])
+
+    # Format each block for proper rendering
+    for i in range(len(blocks)):
+        blocks[i] = re.sub(r"^(\s+)(\S+)(\s+)", r"\1- **\2**\3", blocks[i])
+        blocks[i] = re.sub(r":\s*\n\s*(\S)", r" -- \1", blocks[i])
+
+    return "\n".join(blocks)
+
+
+def _prepare_output_docstrings(output_type, config_class, min_indent=None):
+    """
+    Prepares the return part of the docstring using `output_type`.
+    """
+    output_docstring = output_type.__doc__
+
+    # Remove the head of the docstring to keep the list of args only
+    lines = output_docstring.split("\n")
+    i = 0
+    while i < len(lines) and re.search(r"^\s*(Args|Parameters):\s*$", lines[i]) is None:
+        i += 1
+    if i < len(lines):
+        params_docstring = "\n".join(lines[(i + 1) :])
+        params_docstring = _convert_output_args_doc(params_docstring)
+
+    # Add the return introduction
+    full_output_type = f"{output_type.__module__}.{output_type.__name__}"
+    intro = TF_RETURN_INTRODUCTION if output_type.__name__.startswith("TF") else PT_RETURN_INTRODUCTION
+    intro = intro.format(full_output_type=full_output_type, config_class=config_class)
+    result = intro + params_docstring
+
+    # Apply minimum indent if necessary
+    if min_indent is not None:
+        lines = result.split("\n")
+        # Find the indent of the first nonempty line
+        i = 0
+        while len(lines[i]) == 0:
+            i += 1
+        indent = len(_get_indent(lines[i]))
+        # If too small, add indentation to all nonempty lines
+        if indent < min_indent:
+            to_add = " " * (min_indent - indent)
+            lines = [(f"{to_add}{line}" if len(line) > 0 else line) for line in lines]
+            result = "\n".join(lines)
+
+    return result
+
+
+PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import torch
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+    >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)  # Batch size 1
+
+    >>> outputs = model(**inputs, labels=labels)
+    >>> loss = outputs.loss
+    >>> logits = outputs.logits
+    ```
+"""
+
+PT_QUESTION_ANSWERING_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import torch
+
+    >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+    >>> inputs = tokenizer(question, text, return_tensors="pt")
+    >>> start_positions = torch.tensor([1])
+    >>> end_positions = torch.tensor([3])
+
+    >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
+    >>> loss = outputs.loss
+    >>> round(loss.item(), 2)
+    {expected_loss}
+
+    >>> start_scores = outputs.start_logits
+    >>> list(start_scores.shape)
+    {expected_output}
+
+    >>> end_scores = outputs.end_logits
+    >>> list(end_scores.shape)
+    {expected_output}
+    ```
+"""
+
+PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
+    Example of single-label classification:
+
+    ```python
+    >>> import torch
+    >>> from transformers import {processor_class}, {model_class}
+
+    >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}", num_labels=2)
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+    >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+    >>> outputs = model(**inputs, labels=labels)
+    >>> loss = outputs.loss
+    >>> logits = outputs.logits
+    >>> list(logits.shape)
+    {expected_output}
+    ```
+
+    Example of multi-label classification:
+
+    ```python
+    >>> import torch
+    >>> from transformers import {processor_class}, {model_class}
+
+    >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}", problem_type="multi_label_classification", num_labels=2)
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+    >>> labels = torch.tensor([[1, 1]], dtype=torch.float)  # need dtype=float for BCEWithLogitsLoss
+    >>> outputs = model(**inputs, labels=labels)
+    >>> loss = outputs.loss
+    >>> list(logits.shape)
+    {expected_output}
+    ```
+"""
+
+
+PT_MASKED_LM_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import torch
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
+    >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
+
+    >>> outputs = model(**inputs, labels=labels)
+    >>> loss = outputs.loss
+    >>> logits = outputs.logits
+    ```
+"""
+
+PT_BASE_MODEL_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import torch
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+    >>> outputs = model(**inputs)
+
+    >>> last_hidden_states = outputs.last_hidden_state
+    ```
+"""
+
+PT_MULTIPLE_CHOICE_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import torch
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+    >>> choice0 = "It is eaten with a fork and a knife."
+    >>> choice1 = "It is eaten while held in the hand."
+    >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
+
+    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
+    >>> outputs = model(**{{k: v.unsqueeze(0) for k, v in encoding.items()}}, labels=labels)  # batch size is 1
+
+    >>> # the linear classifier still needs to be trained
+    >>> loss = outputs.loss
+    >>> logits = outputs.logits
+    ```
+"""
+
+PT_CAUSAL_LM_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> import torch
+    >>> from transformers import {processor_class}, {model_class}
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+    >>> outputs = model(**inputs, labels=inputs["input_ids"])
+    >>> loss = outputs.loss
+    >>> logits = outputs.logits
+    ```
+"""
+
+PT_SPEECH_BASE_MODEL_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import torch
+    >>> from datasets import load_dataset
+
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = dataset.sort("id")
+    >>> sampling_rate = dataset.features["audio"].sampling_rate
+
+    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> # audio file is decoded on the fly
+    >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
+    >>> with torch.no_grad():
+    ...     outputs = model(**inputs)
+
+    >>> last_hidden_states = outputs.last_hidden_state
+    >>> list(last_hidden_states.shape)
+    {expected_output}
+    ```
+"""
+
+PT_SPEECH_CTC_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> from datasets import load_dataset
+    >>> import torch
+
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = dataset.sort("id")
+    >>> sampling_rate = dataset.features["audio"].sampling_rate
+
+    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> # audio file is decoded on the fly
+    >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
+    >>> with torch.no_grad():
+    ...     logits = model(**inputs).logits
+    >>> predicted_ids = torch.argmax(logits, dim=-1)
+
+    >>> # transcribe speech
+    >>> transcription = processor.batch_decode(predicted_ids)
+    >>> transcription[0]
+    {expected_output}
+    ```
+
+    ```python
+    >>> with processor.as_target_processor():
+    ...     inputs["labels"] = processor(dataset[0]["text"], return_tensors="pt").input_ids
+
+    >>> # compute loss
+    >>> loss = model(**inputs).loss
+    >>> round(loss.item(), 2)
+    {expected_loss}
+    ```
+"""
+
+PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> from datasets import load_dataset
+    >>> import torch
+
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = dataset.sort("id")
+    >>> sampling_rate = dataset.features["audio"].sampling_rate
+
+    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> # audio file is decoded on the fly
+    >>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
+
+    >>> with torch.no_grad():
+    ...     logits = model(**inputs).logits
+
+    >>> predicted_class_ids = torch.argmax(logits, dim=-1).item()
+    >>> predicted_label = model.config.id2label[predicted_class_ids]
+    >>> predicted_label
+    {expected_output}
+    ```
+
+    ```python
+    >>> # compute loss - target_label is e.g. "down"
+    >>> target_label = model.config.id2label[0]
+    >>> inputs["labels"] = torch.tensor([model.config.label2id[target_label]])
+    >>> loss = model(**inputs).loss
+    >>> round(loss.item(), 2)
+    {expected_loss}
+    ```
+"""
+
+
+PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> from datasets import load_dataset
+    >>> import torch
+
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = dataset.sort("id")
+    >>> sampling_rate = dataset.features["audio"].sampling_rate
+
+    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> # audio file is decoded on the fly
+    >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt", sampling_rate=sampling_rate)
+    >>> with torch.no_grad():
+    ...     logits = model(**inputs).logits
+
+    >>> probabilities = torch.sigmoid(logits[0])
+    >>> # labels is a one-hot array of shape (num_frames, num_speakers)
+    >>> labels = (probabilities > 0.5).long()
+    >>> labels[0].tolist()
+    {expected_output}
+    ```
+"""
+
+
+PT_SPEECH_XVECTOR_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> from datasets import load_dataset
+    >>> import torch
+
+    >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
+    >>> dataset = dataset.sort("id")
+    >>> sampling_rate = dataset.features["audio"].sampling_rate
+
+    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> # audio file is decoded on the fly
+    >>> inputs = feature_extractor(
+    ...     [d["array"] for d in dataset[:2]["audio"]], sampling_rate=sampling_rate, return_tensors="pt", padding=True
+    ... )
+    >>> with torch.no_grad():
+    ...     embeddings = model(**inputs).embeddings
+
+    >>> embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()
+
+    >>> # the resulting embeddings can be used for cosine similarity-based retrieval
+    >>> cosine_sim = torch.nn.CosineSimilarity(dim=-1)
+    >>> similarity = cosine_sim(embeddings[0], embeddings[1])
+    >>> threshold = 0.7  # the optimal threshold is dataset-dependent
+    >>> if similarity < threshold:
+    ...     print("Speakers are not the same!")
+    >>> round(similarity.item(), 2)
+    {expected_output}
+    ```
+"""
+
+PT_VISION_BASE_MODEL_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import torch
+    >>> from datasets import load_dataset
+
+    >>> dataset = load_dataset("huggingface/cats-image")
+    >>> image = dataset["test"]["image"][0]
+
+    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = feature_extractor(image, return_tensors="pt")
+
+    >>> with torch.no_grad():
+    ...     outputs = model(**inputs)
+
+    >>> last_hidden_states = outputs.last_hidden_state
+    >>> list(last_hidden_states.shape)
+    {expected_output}
+    ```
+"""
+
+PT_VISION_SEQ_CLASS_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import torch
+    >>> from datasets import load_dataset
+
+    >>> dataset = load_dataset("huggingface/cats-image")
+    >>> image = dataset["test"]["image"][0]
+
+    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = feature_extractor(image, return_tensors="pt")
+
+    >>> with torch.no_grad():
+    ...     logits = model(**inputs).logits
+
+    >>> # model predicts one of the 1000 ImageNet classes
+    >>> predicted_label = logits.argmax(-1).item()
+    >>> print(model.config.id2label[predicted_label])
+    {expected_output}
+    ```
+"""
+
+
+PT_SAMPLE_DOCSTRINGS = {
+    "SequenceClassification": PT_SEQUENCE_CLASSIFICATION_SAMPLE,
+    "QuestionAnswering": PT_QUESTION_ANSWERING_SAMPLE,
+    "TokenClassification": PT_TOKEN_CLASSIFICATION_SAMPLE,
+    "MultipleChoice": PT_MULTIPLE_CHOICE_SAMPLE,
+    "MaskedLM": PT_MASKED_LM_SAMPLE,
+    "LMHead": PT_CAUSAL_LM_SAMPLE,
+    "BaseModel": PT_BASE_MODEL_SAMPLE,
+    "SpeechBaseModel": PT_SPEECH_BASE_MODEL_SAMPLE,
+    "CTC": PT_SPEECH_CTC_SAMPLE,
+    "AudioClassification": PT_SPEECH_SEQ_CLASS_SAMPLE,
+    "AudioFrameClassification": PT_SPEECH_FRAME_CLASS_SAMPLE,
+    "AudioXVector": PT_SPEECH_XVECTOR_SAMPLE,
+    "VisionBaseModel": PT_VISION_BASE_MODEL_SAMPLE,
+    "ImageClassification": PT_VISION_SEQ_CLASS_SAMPLE,
+}
+
+
+TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import tensorflow as tf
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+    >>> input_ids = inputs["input_ids"]
+    >>> inputs["labels"] = tf.reshape(
+    ...     tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))
+    >>> )  # Batch size 1
+
+    >>> outputs = model(inputs)
+    >>> loss = outputs.loss
+    >>> logits = outputs.logits
+    ```
+"""
+
+TF_QUESTION_ANSWERING_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import tensorflow as tf
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+    >>> input_dict = tokenizer(question, text, return_tensors="tf")
+    >>> outputs = model(input_dict)
+    >>> start_logits = outputs.start_logits
+    >>> end_logits = outputs.end_logits
+
+    >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
+    >>> answer = " ".join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0] + 1])
+    ```
+"""
+
+TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import tensorflow as tf
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+    >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1))  # Batch size 1
+
+    >>> outputs = model(inputs)
+    >>> loss = outputs.loss
+    >>> logits = outputs.logits
+    ```
+"""
+
+TF_MASKED_LM_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import tensorflow as tf
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf")
+    >>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
+
+    >>> outputs = model(inputs)
+    >>> loss = outputs.loss
+    >>> logits = outputs.logits
+    ```
+"""
+
+TF_BASE_MODEL_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import tensorflow as tf
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+    >>> outputs = model(inputs)
+
+    >>> last_hidden_states = outputs.last_hidden_state
+    ```
+"""
+
+TF_MULTIPLE_CHOICE_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import tensorflow as tf
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+    >>> choice0 = "It is eaten with a fork and a knife."
+    >>> choice1 = "It is eaten while held in the hand."
+
+    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="tf", padding=True)
+    >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
+    >>> outputs = model(inputs)  # batch size is 1
+
+    >>> # the linear classifier still needs to be trained
+    >>> logits = outputs.logits
+    ```
+"""
+
+TF_CAUSAL_LM_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+    >>> import tensorflow as tf
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
+    >>> outputs = model(inputs)
+    >>> logits = outputs.logits
+    ```
+"""
+
+TF_SAMPLE_DOCSTRINGS = {
+    "SequenceClassification": TF_SEQUENCE_CLASSIFICATION_SAMPLE,
+    "QuestionAnswering": TF_QUESTION_ANSWERING_SAMPLE,
+    "TokenClassification": TF_TOKEN_CLASSIFICATION_SAMPLE,
+    "MultipleChoice": TF_MULTIPLE_CHOICE_SAMPLE,
+    "MaskedLM": TF_MASKED_LM_SAMPLE,
+    "LMHead": TF_CAUSAL_LM_SAMPLE,
+    "BaseModel": TF_BASE_MODEL_SAMPLE,
+}
+
+
+FLAX_TOKEN_CLASSIFICATION_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
+
+    >>> outputs = model(**inputs)
+    >>> logits = outputs.logits
+    ```
+"""
+
+FLAX_QUESTION_ANSWERING_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+    >>> inputs = tokenizer(question, text, return_tensors="jax")
+
+    >>> outputs = model(**inputs)
+    >>> start_scores = outputs.start_logits
+    >>> end_scores = outputs.end_logits
+    ```
+"""
+
+FLAX_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
+
+    >>> outputs = model(**inputs)
+    >>> logits = outputs.logits
+    ```
+"""
+
+FLAX_MASKED_LM_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="jax")
+
+    >>> outputs = model(**inputs)
+    >>> logits = outputs.logits
+    ```
+"""
+
+FLAX_BASE_MODEL_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
+    >>> outputs = model(**inputs)
+
+    >>> last_hidden_states = outputs.last_hidden_state
+    ```
+"""
+
+FLAX_MULTIPLE_CHOICE_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+    >>> choice0 = "It is eaten with a fork and a knife."
+    >>> choice1 = "It is eaten while held in the hand."
+
+    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="jax", padding=True)
+    >>> outputs = model(**{{k: v[None, :] for k, v in encoding.items()}})
+
+    >>> logits = outputs.logits
+    ```
+"""
+
+FLAX_CAUSAL_LM_SAMPLE = r"""
+    Example:
+
+    ```python
+    >>> from transformers import {processor_class}, {model_class}
+
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+    >>> outputs = model(**inputs)
+
+    >>> # retrieve logts for next token
+    >>> next_token_logits = outputs.logits[:, -1]
+    ```
+"""
+
+FLAX_SAMPLE_DOCSTRINGS = {
+    "SequenceClassification": FLAX_SEQUENCE_CLASSIFICATION_SAMPLE,
+    "QuestionAnswering": FLAX_QUESTION_ANSWERING_SAMPLE,
+    "TokenClassification": FLAX_TOKEN_CLASSIFICATION_SAMPLE,
+    "MultipleChoice": FLAX_MULTIPLE_CHOICE_SAMPLE,
+    "MaskedLM": FLAX_MASKED_LM_SAMPLE,
+    "BaseModel": FLAX_BASE_MODEL_SAMPLE,
+    "LMHead": FLAX_CAUSAL_LM_SAMPLE,
+}
+
+
+def add_code_sample_docstrings(
+    *docstr,
+    processor_class=None,
+    checkpoint=None,
+    output_type=None,
+    config_class=None,
+    mask="[MASK]",
+    model_cls=None,
+    modality=None,
+    expected_output="",
+    expected_loss="",
+):
+    def docstring_decorator(fn):
+        # model_class defaults to function's class if not specified otherwise
+        model_class = fn.__qualname__.split(".")[0] if model_cls is None else model_cls
+
+        if model_class[:2] == "TF":
+            sample_docstrings = TF_SAMPLE_DOCSTRINGS
+        elif model_class[:4] == "Flax":
+            sample_docstrings = FLAX_SAMPLE_DOCSTRINGS
+        else:
+            sample_docstrings = PT_SAMPLE_DOCSTRINGS
+
+        # putting all kwargs for docstrings in a dict to be used
+        # with the `.format(**doc_kwargs)`. Note that string might
+        # be formatted with non-existing keys, which is fine.
+        doc_kwargs = dict(
+            model_class=model_class,
+            processor_class=processor_class,
+            checkpoint=checkpoint,
+            mask=mask,
+            expected_output=expected_output,
+            expected_loss=expected_loss,
+        )
+
+        if "SequenceClassification" in model_class and modality == "audio":
+            code_sample = sample_docstrings["AudioClassification"]
+        elif "SequenceClassification" in model_class:
+            code_sample = sample_docstrings["SequenceClassification"]
+        elif "QuestionAnswering" in model_class:
+            code_sample = sample_docstrings["QuestionAnswering"]
+        elif "TokenClassification" in model_class:
+            code_sample = sample_docstrings["TokenClassification"]
+        elif "MultipleChoice" in model_class:
+            code_sample = sample_docstrings["MultipleChoice"]
+        elif "MaskedLM" in model_class or model_class in ["FlaubertWithLMHeadModel", "XLMWithLMHeadModel"]:
+            code_sample = sample_docstrings["MaskedLM"]
+        elif "LMHead" in model_class or "CausalLM" in model_class:
+            code_sample = sample_docstrings["LMHead"]
+        elif "CTC" in model_class:
+            code_sample = sample_docstrings["CTC"]
+        elif "AudioFrameClassification" in model_class:
+            code_sample = sample_docstrings["AudioFrameClassification"]
+        elif "XVector" in model_class and modality == "audio":
+            code_sample = sample_docstrings["AudioXVector"]
+        elif "Model" in model_class and modality == "audio":
+            code_sample = sample_docstrings["SpeechBaseModel"]
+        elif "Model" in model_class and modality == "vision":
+            code_sample = sample_docstrings["VisionBaseModel"]
+        elif "Model" in model_class or "Encoder" in model_class:
+            code_sample = sample_docstrings["BaseModel"]
+        elif "ImageClassification" in model_class:
+            code_sample = sample_docstrings["ImageClassification"]
+        else:
+            raise ValueError(f"Docstring can't be built for model {model_class}")
+
+        func_doc = (fn.__doc__ or "") + "".join(docstr)
+        output_doc = "" if output_type is None else _prepare_output_docstrings(output_type, config_class)
+        built_doc = code_sample.format(**doc_kwargs)
+        fn.__doc__ = func_doc + output_doc + built_doc
+        return fn
+
+    return docstring_decorator
+
+
+def replace_return_docstrings(output_type=None, config_class=None):
+    def docstring_decorator(fn):
+        func_doc = fn.__doc__
+        lines = func_doc.split("\n")
+        i = 0
+        while i < len(lines) and re.search(r"^\s*Returns?:\s*$", lines[i]) is None:
+            i += 1
+        if i < len(lines):
+            indent = len(_get_indent(lines[i]))
+            lines[i] = _prepare_output_docstrings(output_type, config_class, min_indent=indent)
+            func_doc = "\n".join(lines)
+        else:
+            raise ValueError(
+                f"The function {fn} should have an empty 'Return:' or 'Returns:' in its docstring as placeholder, "
+                f"current docstring is:\n{func_doc}"
+            )
+        fn.__doc__ = func_doc
+        return fn
+
+    return docstring_decorator
+
+
+def copy_func(f):
+    """Returns a copy of a function f."""
+    # Based on http://stackoverflow.com/a/6528148/190597 (Glenn Maynard)
+    g = types.FunctionType(f.__code__, f.__globals__, name=f.__name__, argdefs=f.__defaults__, closure=f.__closure__)
+    g = functools.update_wrapper(g, f)
+    g.__kwdefaults__ = f.__kwdefaults__
+    return g
diff --git a/src/transformers/utils/dummy_detectron2_objects.py b/src/transformers/utils/dummy_detectron2_objects.py
index e1caea2314..41dfb6f81d 100644
--- a/src/transformers/utils/dummy_detectron2_objects.py
+++ b/src/transformers/utils/dummy_detectron2_objects.py
@@ -1,5 +1,5 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
-from ..file_utils import requires_backends
+from ..utils import requires_backends
 
 
 LAYOUTLM_V2_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py
index 166cecaeba..6311437b8e 100644
--- a/src/transformers/utils/dummy_flax_objects.py
+++ b/src/transformers/utils/dummy_flax_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 class FlaxForcedBOSTokenLogitsProcessor(metaclass=DummyObject):
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 9c6e994115..8b20b185a3 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 class PyTorchBenchmark(metaclass=DummyObject):
diff --git a/src/transformers/utils/dummy_scatter_objects.py b/src/transformers/utils/dummy_scatter_objects.py
index abe9be04d1..3f25018b53 100644
--- a/src/transformers/utils/dummy_scatter_objects.py
+++ b/src/transformers/utils/dummy_scatter_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py b/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py
index 53e2502dab..b9b971f1f1 100644
--- a/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_and_speech_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 class Speech2TextProcessor(metaclass=DummyObject):
diff --git a/src/transformers/utils/dummy_sentencepiece_and_tokenizers_objects.py b/src/transformers/utils/dummy_sentencepiece_and_tokenizers_objects.py
index 89efff7123..476117fea6 100644
--- a/src/transformers/utils/dummy_sentencepiece_and_tokenizers_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_and_tokenizers_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 SLOW_TO_FAST_CONVERTERS = None
diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py
index b358e6d269..37d52fc094 100644
--- a/src/transformers/utils/dummy_sentencepiece_objects.py
+++ b/src/transformers/utils/dummy_sentencepiece_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 class AlbertTokenizer(metaclass=DummyObject):
diff --git a/src/transformers/utils/dummy_speech_objects.py b/src/transformers/utils/dummy_speech_objects.py
index a1fd102aab..721fe80a79 100644
--- a/src/transformers/utils/dummy_speech_objects.py
+++ b/src/transformers/utils/dummy_speech_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 class Speech2TextFeatureExtractor(metaclass=DummyObject):
diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py
index 631b4a9a5d..39132d6c6c 100644
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 class TensorFlowBenchmarkArguments(metaclass=DummyObject):
diff --git a/src/transformers/utils/dummy_timm_and_vision_objects.py b/src/transformers/utils/dummy_timm_and_vision_objects.py
index 86badb8746..9a63196637 100644
--- a/src/transformers/utils/dummy_timm_and_vision_objects.py
+++ b/src/transformers/utils/dummy_timm_and_vision_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 DETR_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/src/transformers/utils/dummy_timm_objects.py b/src/transformers/utils/dummy_timm_objects.py
index 2893931fea..c964d40315 100644
--- a/src/transformers/utils/dummy_timm_objects.py
+++ b/src/transformers/utils/dummy_timm_objects.py
@@ -1,5 +1,5 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
-from ..file_utils import requires_backends
+from ..utils import requires_backends
 
 
 DETR_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py
index 28076c848e..5ff4d3a4a4 100644
--- a/src/transformers/utils/dummy_tokenizers_objects.py
+++ b/src/transformers/utils/dummy_tokenizers_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 class AlbertTokenizerFast(metaclass=DummyObject):
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index a5d0b4f5d0..f0e19d35f3 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -1,6 +1,6 @@
 # This file is autogenerated by the command `make fix-copies`, do not edit.
 # flake8: noqa
-from ..file_utils import DummyObject, requires_backends
+from ..utils import DummyObject, requires_backends
 
 
 class ImageFeatureExtractionMixin(metaclass=DummyObject):
diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py
index 393b6bfb75..0cfb0b10d5 100644
--- a/src/transformers/utils/fx.py
+++ b/src/transformers/utils/fx.py
@@ -45,8 +45,9 @@ from .. import (
     XLNetForQuestionAnswering,
     logging,
 )
-from ..file_utils import TORCH_FX_REQUIRED_VERSION, importlib_metadata, is_torch_fx_available
 from ..models.auto import get_values
+from ..utils import TORCH_FX_REQUIRED_VERSION, is_torch_fx_available
+from ..utils.versions import importlib_metadata
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py
new file mode 100644
index 0000000000..f0be710dd8
--- /dev/null
+++ b/src/transformers/utils/generic.py
@@ -0,0 +1,291 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Generic utilities
+"""
+
+from collections import OrderedDict, UserDict
+from contextlib import ExitStack
+from dataclasses import fields
+from enum import Enum
+from typing import Any, ContextManager, List, Tuple
+
+import numpy as np
+
+from .import_utils import is_flax_available, is_tf_available, is_torch_available, is_torch_fx_proxy
+
+
+class cached_property(property):
+    """
+    Descriptor that mimics @property but caches output in member variable.
+
+    From tensorflow_datasets
+
+    Built-in in functools from Python 3.8.
+    """
+
+    def __get__(self, obj, objtype=None):
+        # See docs.python.org/3/howto/descriptor.html#properties
+        if obj is None:
+            return self
+        if self.fget is None:
+            raise AttributeError("unreadable attribute")
+        attr = "__cached_" + self.fget.__name__
+        cached = getattr(obj, attr, None)
+        if cached is None:
+            cached = self.fget(obj)
+            setattr(obj, attr, cached)
+        return cached
+
+
+def is_tensor(x):
+    """
+    Tests if `x` is a `torch.Tensor`, `tf.Tensor`, `jaxlib.xla_extension.DeviceArray` or `np.ndarray`.
+    """
+    if is_torch_fx_proxy(x):
+        return True
+    if is_torch_available():
+        import torch
+
+        if isinstance(x, torch.Tensor):
+            return True
+    if is_tf_available():
+        import tensorflow as tf
+
+        if isinstance(x, tf.Tensor):
+            return True
+
+    if is_flax_available():
+        import jax.numpy as jnp
+        from jax.core import Tracer
+
+        if isinstance(x, (jnp.ndarray, Tracer)):
+            return True
+
+    return isinstance(x, np.ndarray)
+
+
+def _is_numpy(x):
+    return isinstance(x, np.ndarray)
+
+
+def _is_torch(x):
+    import torch
+
+    return isinstance(x, torch.Tensor)
+
+
+def _is_torch_device(x):
+    import torch
+
+    return isinstance(x, torch.device)
+
+
+def _is_tensorflow(x):
+    import tensorflow as tf
+
+    return isinstance(x, tf.Tensor)
+
+
+def _is_jax(x):
+    import jax.numpy as jnp  # noqa: F811
+
+    return isinstance(x, jnp.ndarray)
+
+
+def to_py_obj(obj):
+    """
+    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
+    """
+    if isinstance(obj, (dict, UserDict)):
+        return {k: to_py_obj(v) for k, v in obj.items()}
+    elif isinstance(obj, (list, tuple)):
+        return [to_py_obj(o) for o in obj]
+    elif is_tf_available() and _is_tensorflow(obj):
+        return obj.numpy().tolist()
+    elif is_torch_available() and _is_torch(obj):
+        return obj.detach().cpu().tolist()
+    elif is_flax_available() and _is_jax(obj):
+        return np.asarray(obj).tolist()
+    elif isinstance(obj, (np.ndarray, np.number)):  # tolist also works on 0d np arrays
+        return obj.tolist()
+    else:
+        return obj
+
+
+def to_numpy(obj):
+    """
+    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a Numpy array.
+    """
+    if isinstance(obj, (dict, UserDict)):
+        return {k: to_numpy(v) for k, v in obj.items()}
+    elif isinstance(obj, (list, tuple)):
+        return np.array(obj)
+    elif is_tf_available() and _is_tensorflow(obj):
+        return obj.numpy()
+    elif is_torch_available() and _is_torch(obj):
+        return obj.detach().cpu().numpy()
+    elif is_flax_available() and _is_jax(obj):
+        return np.asarray(obj)
+    else:
+        return obj
+
+
+class ModelOutput(OrderedDict):
+    """
+    Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a
+    tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular
+    python dictionary.
+
+    <Tip warning={true}>
+
+    You can't unpack a `ModelOutput` directly. Use the [`~file_utils.ModelOutput.to_tuple`] method to convert it to a
+    tuple before.
+
+    </Tip>
+    """
+
+    def __post_init__(self):
+        class_fields = fields(self)
+
+        # Safety and consistency checks
+        if not len(class_fields):
+            raise ValueError(f"{self.__class__.__name__} has no fields.")
+        if not all(field.default is None for field in class_fields[1:]):
+            raise ValueError(f"{self.__class__.__name__} should not have more than one required field.")
+
+        first_field = getattr(self, class_fields[0].name)
+        other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])
+
+        if other_fields_are_none and not is_tensor(first_field):
+            if isinstance(first_field, dict):
+                iterator = first_field.items()
+                first_field_iterator = True
+            else:
+                try:
+                    iterator = iter(first_field)
+                    first_field_iterator = True
+                except TypeError:
+                    first_field_iterator = False
+
+            # if we provided an iterator as first field and the iterator is a (key, value) iterator
+            # set the associated fields
+            if first_field_iterator:
+                for element in iterator:
+                    if (
+                        not isinstance(element, (list, tuple))
+                        or not len(element) == 2
+                        or not isinstance(element[0], str)
+                    ):
+                        break
+                    setattr(self, element[0], element[1])
+                    if element[1] is not None:
+                        self[element[0]] = element[1]
+            elif first_field is not None:
+                self[class_fields[0].name] = first_field
+        else:
+            for field in class_fields:
+                v = getattr(self, field.name)
+                if v is not None:
+                    self[field.name] = v
+
+    def __delitem__(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
+
+    def setdefault(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
+
+    def pop(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+
+    def update(self, *args, **kwargs):
+        raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
+
+    def __getitem__(self, k):
+        if isinstance(k, str):
+            inner_dict = {k: v for (k, v) in self.items()}
+            return inner_dict[k]
+        else:
+            return self.to_tuple()[k]
+
+    def __setattr__(self, name, value):
+        if name in self.keys() and value is not None:
+            # Don't call self.__setitem__ to avoid recursion errors
+            super().__setitem__(name, value)
+        super().__setattr__(name, value)
+
+    def __setitem__(self, key, value):
+        # Will raise a KeyException if needed
+        super().__setitem__(key, value)
+        # Don't call self.__setattr__ to avoid recursion errors
+        super().__setattr__(key, value)
+
+    def to_tuple(self) -> Tuple[Any]:
+        """
+        Convert self to a tuple containing all the attributes/keys that are not `None`.
+        """
+        return tuple(self[k] for k in self.keys())
+
+
+class ExplicitEnum(Enum):
+    """
+    Enum with more explicit error message for missing values.
+    """
+
+    @classmethod
+    def _missing_(cls, value):
+        raise ValueError(
+            f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}"
+        )
+
+
+class PaddingStrategy(ExplicitEnum):
+    """
+    Possible values for the `padding` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in an
+    IDE.
+    """
+
+    LONGEST = "longest"
+    MAX_LENGTH = "max_length"
+    DO_NOT_PAD = "do_not_pad"
+
+
+class TensorType(ExplicitEnum):
+    """
+    Possible values for the `return_tensors` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for
+    tab-completion in an IDE.
+    """
+
+    PYTORCH = "pt"
+    TENSORFLOW = "tf"
+    NUMPY = "np"
+    JAX = "jax"
+
+
+class ContextManagers:
+    """
+    Wrapper for `contextlib.ExitStack` which enters a collection of context managers. Adaptation of `ContextManagers`
+    in the `fastcore` library.
+    """
+
+    def __init__(self, context_managers: List[ContextManager]):
+        self.context_managers = context_managers
+        self.stack = ExitStack()
+
+    def __enter__(self):
+        for context_manager in self.context_managers:
+            self.stack.enter_context(context_manager)
+
+    def __exit__(self, *args, **kwargs):
+        self.stack.__exit__(*args, **kwargs)
diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
new file mode 100644
index 0000000000..0dfd6ef96b
--- /dev/null
+++ b/src/transformers/utils/hub.py
@@ -0,0 +1,1023 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Hub utilities: utilities related to download and cache models
+"""
+import copy
+import fnmatch
+import io
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tarfile
+import tempfile
+import warnings
+from contextlib import contextmanager
+from functools import partial
+from hashlib import sha256
+from pathlib import Path
+from typing import BinaryIO, Dict, List, Optional, Tuple, Union
+from urllib.parse import urlparse
+from uuid import uuid4
+from zipfile import ZipFile, is_zipfile
+
+import requests
+from filelock import FileLock
+from huggingface_hub import HfFolder, Repository, create_repo, list_repo_files, whoami
+from requests.exceptions import HTTPError
+from transformers.utils.logging import tqdm
+
+from . import __version__, logging
+from .import_utils import (
+    ENV_VARS_TRUE_VALUES,
+    _tf_version,
+    _torch_version,
+    is_tf_available,
+    is_torch_available,
+    is_training_run_on_sagemaker,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+_is_offline_mode = True if os.environ.get("TRANSFORMERS_OFFLINE", "0").upper() in ENV_VARS_TRUE_VALUES else False
+
+
+def is_offline_mode():
+    return _is_offline_mode
+
+
+torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
+old_default_cache_path = os.path.join(torch_cache_home, "transformers")
+# New default cache, shared with the Datasets library
+hf_cache_home = os.path.expanduser(
+    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
+)
+default_cache_path = os.path.join(hf_cache_home, "transformers")
+
+# Onetime move from the old location to the new one if no ENV variable has been set.
+if (
+    os.path.isdir(old_default_cache_path)
+    and not os.path.isdir(default_cache_path)
+    and "PYTORCH_PRETRAINED_BERT_CACHE" not in os.environ
+    and "PYTORCH_TRANSFORMERS_CACHE" not in os.environ
+    and "TRANSFORMERS_CACHE" not in os.environ
+):
+    logger.warning(
+        "In Transformers v4.0.0, the default path to cache downloaded models changed from "
+        "'~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden "
+        "and '~/.cache/torch/transformers' is a directory that exists, we're moving it to "
+        "'~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should "
+        "only see this message once."
+    )
+    shutil.move(old_default_cache_path, default_cache_path)
+
+PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
+PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
+TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE)
+HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules"))
+TRANSFORMERS_DYNAMIC_MODULE_NAME = "transformers_modules"
+SESSION_ID = uuid4().hex
+DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", False) in ENV_VARS_TRUE_VALUES
+
+S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
+CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co"
+
+_staging_mode = os.environ.get("HUGGINGFACE_CO_STAGING", "NO").upper() in ENV_VARS_TRUE_VALUES
+_default_endpoint = "https://moon-staging.huggingface.co" if _staging_mode else "https://huggingface.co"
+
+HUGGINGFACE_CO_RESOLVE_ENDPOINT = _default_endpoint
+if os.environ.get("HUGGINGFACE_CO_RESOLVE_ENDPOINT", None) is not None:
+    warnings.warn(
+        "Using the environment variable `HUGGINGFACE_CO_RESOLVE_ENDPOINT` is deprecated and will be removed in "
+        "Transformers v5. Use `HF_ENDPOINT` instead.",
+        FutureWarning,
+    )
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HUGGINGFACE_CO_RESOLVE_ENDPOINT", None)
+HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HF_ENDPOINT", HUGGINGFACE_CO_RESOLVE_ENDPOINT)
+HUGGINGFACE_CO_PREFIX = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/{model_id}/resolve/{revision}/{filename}"
+
+
+def is_remote_url(url_or_filename):
+    parsed = urlparse(url_or_filename)
+    return parsed.scheme in ("http", "https")
+
+
+def hf_bucket_url(
+    model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None, mirror=None
+) -> str:
+    """
+    Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co-hosted url, redirecting
+    to Cloudfront (a Content Delivery Network, or CDN) for large files.
+
+    Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our
+    bandwidth costs).
+
+    Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here
+    because we migrated to a git-based versioning system on huggingface.co, so we now store the files on S3/Cloudfront
+    in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache
+    can't ever be stale.
+
+    In terms of client-side caching from this library, we base our caching on the objects' ETag. An object' ETag is:
+    its sha1 if stored in git, or its sha256 if stored in git-lfs. Files cached locally from transformers before v3.5.0
+    are not shared with those new files, because the cached file's name contains a hash of the url (which changed).
+    """
+    if subfolder is not None:
+        filename = f"{subfolder}/{filename}"
+
+    if mirror:
+        if mirror in ["tuna", "bfsu"]:
+            raise ValueError("The Tuna and BFSU mirrors are no longer available. Try removing the mirror argument.")
+        legacy_format = "/" not in model_id
+        if legacy_format:
+            return f"{mirror}/{model_id}-{filename}"
+        else:
+            return f"{mirror}/{model_id}/{filename}"
+
+    if revision is None:
+        revision = "main"
+    return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename)
+
+
+def url_to_filename(url: str, etag: Optional[str] = None) -> str:
+    """
+    Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's,
+    delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can
+    identify it as a HDF5 file (see
+    https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
+    """
+    url_bytes = url.encode("utf-8")
+    filename = sha256(url_bytes).hexdigest()
+
+    if etag:
+        etag_bytes = etag.encode("utf-8")
+        filename += "." + sha256(etag_bytes).hexdigest()
+
+    if url.endswith(".h5"):
+        filename += ".h5"
+
+    return filename
+
+
+def filename_to_url(filename, cache_dir=None):
+    """
+    Return the url and etag (which may be `None`) stored for *filename*. Raise `EnvironmentError` if *filename* or its
+    stored metadata do not exist.
+    """
+    if cache_dir is None:
+        cache_dir = TRANSFORMERS_CACHE
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    cache_path = os.path.join(cache_dir, filename)
+    if not os.path.exists(cache_path):
+        raise EnvironmentError(f"file {cache_path} not found")
+
+    meta_path = cache_path + ".json"
+    if not os.path.exists(meta_path):
+        raise EnvironmentError(f"file {meta_path} not found")
+
+    with open(meta_path, encoding="utf-8") as meta_file:
+        metadata = json.load(meta_file)
+    url = metadata["url"]
+    etag = metadata["etag"]
+
+    return url, etag
+
+
+def get_cached_models(cache_dir: Union[str, Path] = None) -> List[Tuple]:
+    """
+    Returns a list of tuples representing model binaries that are cached locally. Each tuple has shape `(model_url,
+    etag, size_MB)`. Filenames in `cache_dir` are use to get the metadata for each model, only urls ending with *.bin*
+    are added.
+
+    Args:
+        cache_dir (`Union[str, Path]`, *optional*):
+            The cache directory to search for models within. Will default to the transformers cache if unset.
+
+    Returns:
+        List[Tuple]: List of tuples each with shape `(model_url, etag, size_MB)`
+    """
+    if cache_dir is None:
+        cache_dir = TRANSFORMERS_CACHE
+    elif isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    cached_models = []
+    for file in os.listdir(cache_dir):
+        if file.endswith(".json"):
+            meta_path = os.path.join(cache_dir, file)
+            with open(meta_path, encoding="utf-8") as meta_file:
+                metadata = json.load(meta_file)
+                url = metadata["url"]
+                etag = metadata["etag"]
+                if url.endswith(".bin"):
+                    size_MB = os.path.getsize(meta_path.strip(".json")) / 1e6
+                    cached_models.append((url, etag, size_MB))
+
+    return cached_models
+
+
+def cached_path(
+    url_or_filename,
+    cache_dir=None,
+    force_download=False,
+    proxies=None,
+    resume_download=False,
+    user_agent: Union[Dict, str, None] = None,
+    extract_compressed_file=False,
+    force_extract=False,
+    use_auth_token: Union[bool, str, None] = None,
+    local_files_only=False,
+) -> Optional[str]:
+    """
+    Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file
+    and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and
+    then return the path
+
+    Args:
+        cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
+        force_download: if True, re-download the file even if it's already cached in the cache dir.
+        resume_download: if True, resume the download if incompletely received file is found.
+        user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
+        use_auth_token: Optional string or boolean to use as Bearer token for remote files. If True,
+            will get token from ~/.huggingface.
+        extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed
+            file in a folder along the archive.
+        force_extract: if True when extract_compressed_file is True and the archive was already extracted,
+            re-extract the archive and override the folder where it was extracted.
+
+    Return:
+        Local path (string) of file or if networking is off, last version of file cached on disk.
+
+    Raises:
+        In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
+    """
+    if cache_dir is None:
+        cache_dir = TRANSFORMERS_CACHE
+    if isinstance(url_or_filename, Path):
+        url_or_filename = str(url_or_filename)
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    if is_offline_mode() and not local_files_only:
+        logger.info("Offline mode: forcing local_files_only=True")
+        local_files_only = True
+
+    if is_remote_url(url_or_filename):
+        # URL, so get it from the cache (downloading if necessary)
+        output_path = get_from_cache(
+            url_or_filename,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            resume_download=resume_download,
+            user_agent=user_agent,
+            use_auth_token=use_auth_token,
+            local_files_only=local_files_only,
+        )
+    elif os.path.exists(url_or_filename):
+        # File, and it exists.
+        output_path = url_or_filename
+    elif urlparse(url_or_filename).scheme == "":
+        # File, but it doesn't exist.
+        raise EnvironmentError(f"file {url_or_filename} not found")
+    else:
+        # Something unknown
+        raise ValueError(f"unable to parse {url_or_filename} as a URL or as a local path")
+
+    if extract_compressed_file:
+        if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
+            return output_path
+
+        # Path where we extract compressed archives
+        # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/"
+        output_dir, output_file = os.path.split(output_path)
+        output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
+        output_path_extracted = os.path.join(output_dir, output_extract_dir_name)
+
+        if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract:
+            return output_path_extracted
+
+        # Prevent parallel extractions
+        lock_path = output_path + ".lock"
+        with FileLock(lock_path):
+            shutil.rmtree(output_path_extracted, ignore_errors=True)
+            os.makedirs(output_path_extracted)
+            if is_zipfile(output_path):
+                with ZipFile(output_path, "r") as zip_file:
+                    zip_file.extractall(output_path_extracted)
+                    zip_file.close()
+            elif tarfile.is_tarfile(output_path):
+                tar_file = tarfile.open(output_path)
+                tar_file.extractall(output_path_extracted)
+                tar_file.close()
+            else:
+                raise EnvironmentError(f"Archive format of {output_path} could not be identified")
+
+        return output_path_extracted
+
+    return output_path
+
+
+def define_sagemaker_information():
+    try:
+        instance_data = requests.get(os.environ["ECS_CONTAINER_METADATA_URI"]).json()
+        dlc_container_used = instance_data["Image"]
+        dlc_tag = instance_data["Image"].split(":")[1]
+    except Exception:
+        dlc_container_used = None
+        dlc_tag = None
+
+    sagemaker_params = json.loads(os.getenv("SM_FRAMEWORK_PARAMS", "{}"))
+    runs_distributed_training = True if "sagemaker_distributed_dataparallel_enabled" in sagemaker_params else False
+    account_id = os.getenv("TRAINING_JOB_ARN").split(":")[4] if "TRAINING_JOB_ARN" in os.environ else None
+
+    sagemaker_object = {
+        "sm_framework": os.getenv("SM_FRAMEWORK_MODULE", None),
+        "sm_region": os.getenv("AWS_REGION", None),
+        "sm_number_gpu": os.getenv("SM_NUM_GPUS", 0),
+        "sm_number_cpu": os.getenv("SM_NUM_CPUS", 0),
+        "sm_distributed_training": runs_distributed_training,
+        "sm_deep_learning_container": dlc_container_used,
+        "sm_deep_learning_container_tag": dlc_tag,
+        "sm_account_id": account_id,
+    }
+    return sagemaker_object
+
+
+def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
+    """
+    Formats a user-agent string with basic info about a request.
+    """
+    ua = f"transformers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}"
+    if is_torch_available():
+        ua += f"; torch/{_torch_version}"
+    if is_tf_available():
+        ua += f"; tensorflow/{_tf_version}"
+    if DISABLE_TELEMETRY:
+        return ua + "; telemetry/off"
+    if is_training_run_on_sagemaker():
+        ua += "; " + "; ".join(f"{k}/{v}" for k, v in define_sagemaker_information().items())
+    # CI will set this value to True
+    if os.environ.get("TRANSFORMERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES:
+        ua += "; is_ci/true"
+    if isinstance(user_agent, dict):
+        ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items())
+    elif isinstance(user_agent, str):
+        ua += "; " + user_agent
+    return ua
+
+
+class RepositoryNotFoundError(HTTPError):
+    """
+    Raised when trying to access a hf.co URL with an invalid repository name, or with a private repo name the user does
+    not have access to.
+    """
+
+
+class EntryNotFoundError(HTTPError):
+    """Raised when trying to access a hf.co URL with a valid repository and revision but an invalid filename."""
+
+
+class RevisionNotFoundError(HTTPError):
+    """Raised when trying to access a hf.co URL with a valid repository but an invalid revision."""
+
+
+def _raise_for_status(request):
+    """
+    Internal version of `request.raise_for_status()` that will refine a potential HTTPError.
+    """
+    if "X-Error-Code" in request.headers:
+        error_code = request.headers["X-Error-Code"]
+        if error_code == "RepoNotFound":
+            raise RepositoryNotFoundError(f"404 Client Error: Repository Not Found for url: {request.url}")
+        elif error_code == "EntryNotFound":
+            raise EntryNotFoundError(f"404 Client Error: Entry Not Found for url: {request.url}")
+        elif error_code == "RevisionNotFound":
+            raise RevisionNotFoundError((f"404 Client Error: Revision Not Found for url: {request.url}"))
+
+    request.raise_for_status()
+
+
+def http_get(url: str, temp_file: BinaryIO, proxies=None, resume_size=0, headers: Optional[Dict[str, str]] = None):
+    """
+    Download remote file. Do not gobble up errors.
+    """
+    headers = copy.deepcopy(headers)
+    if resume_size > 0:
+        headers["Range"] = f"bytes={resume_size}-"
+    r = requests.get(url, stream=True, proxies=proxies, headers=headers)
+    _raise_for_status(r)
+    content_length = r.headers.get("Content-Length")
+    total = resume_size + int(content_length) if content_length is not None else None
+    # `tqdm` behavior is determined by `utils.logging.is_progress_bar_enabled()`
+    # and can be set using `utils.logging.enable/disable_progress_bar()`
+    progress = tqdm(
+        unit="B",
+        unit_scale=True,
+        unit_divisor=1024,
+        total=total,
+        initial=resume_size,
+        desc="Downloading",
+    )
+    for chunk in r.iter_content(chunk_size=1024):
+        if chunk:  # filter out keep-alive new chunks
+            progress.update(len(chunk))
+            temp_file.write(chunk)
+    progress.close()
+
+
+def get_from_cache(
+    url: str,
+    cache_dir=None,
+    force_download=False,
+    proxies=None,
+    etag_timeout=10,
+    resume_download=False,
+    user_agent: Union[Dict, str, None] = None,
+    use_auth_token: Union[bool, str, None] = None,
+    local_files_only=False,
+) -> Optional[str]:
+    """
+    Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the
+    path to the cached file.
+
+    Return:
+        Local path (string) of file or if networking is off, last version of file cached on disk.
+
+    Raises:
+        In case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
+    """
+    if cache_dir is None:
+        cache_dir = TRANSFORMERS_CACHE
+    if isinstance(cache_dir, Path):
+        cache_dir = str(cache_dir)
+
+    os.makedirs(cache_dir, exist_ok=True)
+
+    headers = {"user-agent": http_user_agent(user_agent)}
+    if isinstance(use_auth_token, str):
+        headers["authorization"] = f"Bearer {use_auth_token}"
+    elif use_auth_token:
+        token = HfFolder.get_token()
+        if token is None:
+            raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.")
+        headers["authorization"] = f"Bearer {token}"
+
+    url_to_download = url
+    etag = None
+    if not local_files_only:
+        try:
+            r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
+            _raise_for_status(r)
+            etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
+            # We favor a custom header indicating the etag of the linked resource, and
+            # we fallback to the regular etag header.
+            # If we don't have any of those, raise an error.
+            if etag is None:
+                raise OSError(
+                    "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
+                )
+            # In case of a redirect,
+            # save an extra redirect on the request.get call,
+            # and ensure we download the exact atomic version even if it changed
+            # between the HEAD and the GET (unlikely, but hey).
+            if 300 <= r.status_code <= 399:
+                url_to_download = r.headers["Location"]
+        except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
+            # Actually raise for those subclasses of ConnectionError
+            raise
+        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
+            # Otherwise, our Internet connection is down.
+            # etag is None
+            pass
+
+    filename = url_to_filename(url, etag)
+
+    # get cache path to put the file
+    cache_path = os.path.join(cache_dir, filename)
+
+    # etag is None == we don't have a connection or we passed local_files_only.
+    # try to get the last downloaded one
+    if etag is None:
+        if os.path.exists(cache_path):
+            return cache_path
+        else:
+            matching_files = [
+                file
+                for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*")
+                if not file.endswith(".json") and not file.endswith(".lock")
+            ]
+            if len(matching_files) > 0:
+                return os.path.join(cache_dir, matching_files[-1])
+            else:
+                # If files cannot be found and local_files_only=True,
+                # the models might've been found if local_files_only=False
+                # Notify the user about that
+                if local_files_only:
+                    raise FileNotFoundError(
+                        "Cannot find the requested files in the cached path and outgoing traffic has been"
+                        " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
+                        " to False."
+                    )
+                else:
+                    raise ValueError(
+                        "Connection error, and we cannot find the requested files in the cached path."
+                        " Please try again or make sure your Internet connection is on."
+                    )
+
+    # From now on, etag is not None.
+    if os.path.exists(cache_path) and not force_download:
+        return cache_path
+
+    # Prevent parallel downloads of the same file with a lock.
+    lock_path = cache_path + ".lock"
+    with FileLock(lock_path):
+
+        # If the download just completed while the lock was activated.
+        if os.path.exists(cache_path) and not force_download:
+            # Even if returning early like here, the lock will be released.
+            return cache_path
+
+        if resume_download:
+            incomplete_path = cache_path + ".incomplete"
+
+            @contextmanager
+            def _resumable_file_manager() -> "io.BufferedWriter":
+                with open(incomplete_path, "ab") as f:
+                    yield f
+
+            temp_file_manager = _resumable_file_manager
+            if os.path.exists(incomplete_path):
+                resume_size = os.stat(incomplete_path).st_size
+            else:
+                resume_size = 0
+        else:
+            temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False)
+            resume_size = 0
+
+        # Download to temporary file, then copy to cache dir once finished.
+        # Otherwise you get corrupt cache entries if the download gets interrupted.
+        with temp_file_manager() as temp_file:
+            logger.info(f"{url} not found in cache or force_download set to True, downloading to {temp_file.name}")
+
+            http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, headers=headers)
+
+        logger.info(f"storing {url} in cache at {cache_path}")
+        os.replace(temp_file.name, cache_path)
+
+        # NamedTemporaryFile creates a file with hardwired 0600 perms (ignoring umask), so fixing it.
+        umask = os.umask(0o666)
+        os.umask(umask)
+        os.chmod(cache_path, 0o666 & ~umask)
+
+        logger.info(f"creating metadata file for {cache_path}")
+        meta = {"url": url, "etag": etag}
+        meta_path = cache_path + ".json"
+        with open(meta_path, "w") as meta_file:
+            json.dump(meta, meta_file)
+
+    return cache_path
+
+
+def get_file_from_repo(
+    path_or_repo: Union[str, os.PathLike],
+    filename: str,
+    cache_dir: Optional[Union[str, os.PathLike]] = None,
+    force_download: bool = False,
+    resume_download: bool = False,
+    proxies: Optional[Dict[str, str]] = None,
+    use_auth_token: Optional[Union[bool, str]] = None,
+    revision: Optional[str] = None,
+    local_files_only: bool = False,
+):
+    """
+    Tries to locate a file in a local folder and repo, downloads and cache it if necessary.
+
+    Args:
+        path_or_repo (`str` or `os.PathLike`):
+            This can be either:
+
+            - a string, the *model id* of a model repo on huggingface.co.
+            - a path to a *directory* potentially containing the file.
+        filename (`str`):
+            The name of the file to locate in `path_or_repo`.
+        cache_dir (`str` or `os.PathLike`, *optional*):
+            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
+            cache should not be used.
+        force_download (`bool`, *optional*, defaults to `False`):
+            Whether or not to force to (re-)download the configuration files and override the cached versions if they
+            exist.
+        resume_download (`bool`, *optional*, defaults to `False`):
+            Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
+        proxies (`Dict[str, str]`, *optional*):
+            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+        use_auth_token (`str` or *bool*, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+            when running `transformers-cli login` (stored in `~/.huggingface`).
+        revision (`str`, *optional*, defaults to `"main"`):
+            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+            identifier allowed by git.
+        local_files_only (`bool`, *optional*, defaults to `False`):
+            If `True`, will only try to load the tokenizer configuration from local files.
+
+    <Tip>
+
+    Passing `use_auth_token=True` is required when you want to use a private model.
+
+    </Tip>
+
+    Returns:
+        `Optional[str]`: Returns the resolved file (to the cache folder if downloaded from a repo) or `None` if the
+        file does not exist.
+
+    Examples:
+
+    ```python
+    # Download a tokenizer configuration from huggingface.co and cache.
+    tokenizer_config = get_file_from_repo("bert-base-uncased", "tokenizer_config.json")
+    # This model does not have a tokenizer config so the result will be None.
+    tokenizer_config = get_file_from_repo("xlm-roberta-base", "tokenizer_config.json")
+    ```"""
+    if is_offline_mode() and not local_files_only:
+        logger.info("Offline mode: forcing local_files_only=True")
+        local_files_only = True
+
+    path_or_repo = str(path_or_repo)
+    if os.path.isdir(path_or_repo):
+        resolved_file = os.path.join(path_or_repo, filename)
+        return resolved_file if os.path.isfile(resolved_file) else None
+    else:
+        resolved_file = hf_bucket_url(path_or_repo, filename=filename, revision=revision, mirror=None)
+
+    try:
+        # Load from URL or cache if already cached
+        resolved_file = cached_path(
+            resolved_file,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            proxies=proxies,
+            resume_download=resume_download,
+            local_files_only=local_files_only,
+            use_auth_token=use_auth_token,
+        )
+
+    except RepositoryNotFoundError:
+        raise EnvironmentError(
+            f"{path_or_repo} is not a local folder and is not a valid model identifier "
+            "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to "
+            "pass a token having permission to this repo with `use_auth_token` or log in with "
+            "`huggingface-cli login` and pass `use_auth_token=True`."
+        )
+    except RevisionNotFoundError:
+        raise EnvironmentError(
+            f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists "
+            "for this model name. Check the model page at "
+            f"'https://huggingface.co/{path_or_repo}' for available revisions."
+        )
+    except EnvironmentError:
+        # The repo and revision exist, but the file does not or there was a connection error fetching it.
+        return None
+
+    return resolved_file
+
+
+def has_file(
+    path_or_repo: Union[str, os.PathLike],
+    filename: str,
+    revision: Optional[str] = None,
+    mirror: Optional[str] = None,
+    proxies: Optional[Dict[str, str]] = None,
+    use_auth_token: Optional[Union[bool, str]] = None,
+):
+    """
+    Checks if a repo contains a given file wihtout downloading it. Works for remote repos and local folders.
+
+    <Tip warning={false}>
+
+    This function will raise an error if the repository `path_or_repo` is not valid or if `revision` does not exist for
+    this repo, but will return False for regular connection errors.
+
+    </Tip>
+    """
+    if os.path.isdir(path_or_repo):
+        return os.path.isfile(os.path.join(path_or_repo, filename))
+
+    url = hf_bucket_url(path_or_repo, filename=filename, revision=revision, mirror=mirror)
+
+    headers = {"user-agent": http_user_agent()}
+    if isinstance(use_auth_token, str):
+        headers["authorization"] = f"Bearer {use_auth_token}"
+    elif use_auth_token:
+        token = HfFolder.get_token()
+        if token is None:
+            raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.")
+        headers["authorization"] = f"Bearer {token}"
+
+    r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=10)
+    try:
+        _raise_for_status(r)
+        return True
+    except RepositoryNotFoundError as e:
+        logger.error(e)
+        raise EnvironmentError(f"{path_or_repo} is not a local folder or a valid repository name on 'https://hf.co'.")
+    except RevisionNotFoundError as e:
+        logger.error(e)
+        raise EnvironmentError(
+            f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for this "
+            "model name. Check the model page at 'https://huggingface.co/{path_or_repo}' for available revisions."
+        )
+    except requests.HTTPError:
+        # We return false for EntryNotFoundError (logical) as well as any connection error.
+        return False
+
+
+def get_list_of_files(
+    path_or_repo: Union[str, os.PathLike],
+    revision: Optional[str] = None,
+    use_auth_token: Optional[Union[bool, str]] = None,
+    local_files_only: bool = False,
+) -> List[str]:
+    """
+    Gets the list of files inside `path_or_repo`.
+
+    Args:
+        path_or_repo (`str` or `os.PathLike`):
+            Can be either the id of a repo on huggingface.co or a path to a *directory*.
+        revision (`str`, *optional*, defaults to `"main"`):
+            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+            identifier allowed by git.
+        use_auth_token (`str` or *bool*, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+            when running `transformers-cli login` (stored in `~/.huggingface`).
+        local_files_only (`bool`, *optional*, defaults to `False`):
+            Whether or not to only rely on local files and not to attempt to download any files.
+
+    <Tip warning={true}>
+
+    This API is not optimized, so calling it a lot may result in connection errors.
+
+    </Tip>
+
+    Returns:
+        `List[str]`: The list of files available in `path_or_repo`.
+    """
+    path_or_repo = str(path_or_repo)
+    # If path_or_repo is a folder, we just return what is inside (subdirectories included).
+    if os.path.isdir(path_or_repo):
+        list_of_files = []
+        for path, dir_names, file_names in os.walk(path_or_repo):
+            list_of_files.extend([os.path.join(path, f) for f in file_names])
+        return list_of_files
+
+    # Can't grab the files if we are on offline mode.
+    if is_offline_mode() or local_files_only:
+        return []
+
+    # Otherwise we grab the token and use the list_repo_files method.
+    if isinstance(use_auth_token, str):
+        token = use_auth_token
+    elif use_auth_token is True:
+        token = HfFolder.get_token()
+    else:
+        token = None
+
+    try:
+        return list_repo_files(path_or_repo, revision=revision, token=token)
+    except HTTPError as e:
+        raise ValueError(
+            f"{path_or_repo} is not a local path or a model identifier on the model Hub. Did you make a typo?"
+        ) from e
+
+
+def is_local_clone(repo_path, repo_url):
+    """
+    Checks if the folder in `repo_path` is a local clone of `repo_url`.
+    """
+    # First double-check that `repo_path` is a git repo
+    if not os.path.exists(os.path.join(repo_path, ".git")):
+        return False
+    test_git = subprocess.run("git branch".split(), cwd=repo_path)
+    if test_git.returncode != 0:
+        return False
+
+    # Then look at its remotes
+    remotes = subprocess.run(
+        "git remote -v".split(),
+        stderr=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        check=True,
+        encoding="utf-8",
+        cwd=repo_path,
+    ).stdout
+
+    return repo_url in remotes.split()
+
+
+class PushToHubMixin:
+    """
+    A Mixin containing the functionality to push a model or tokenizer to the hub.
+    """
+
+    def push_to_hub(
+        self,
+        repo_path_or_name: Optional[str] = None,
+        repo_url: Optional[str] = None,
+        use_temp_dir: bool = False,
+        commit_message: Optional[str] = None,
+        organization: Optional[str] = None,
+        private: Optional[bool] = None,
+        use_auth_token: Optional[Union[bool, str]] = None,
+        **model_card_kwargs
+    ) -> str:
+        """
+        Upload the {object_files} to the 🤗 Model Hub while synchronizing a local clone of the repo in
+        `repo_path_or_name`.
+
+        Parameters:
+            repo_path_or_name (`str`, *optional*):
+                Can either be a repository name for your {object} in the Hub or a path to a local folder (in which case
+                the repository will have the name of that local folder). If not specified, will default to the name
+                given by `repo_url` and a local directory with that name will be created.
+            repo_url (`str`, *optional*):
+                Specify this in case you want to push to an existing repository in the hub. If unspecified, a new
+                repository will be created in your namespace (unless you specify an `organization`) with `repo_name`.
+            use_temp_dir (`bool`, *optional*, defaults to `False`):
+                Whether or not to clone the distant repo in a temporary directory or in `repo_path_or_name` inside the
+                current working directory. This will slow things down if you are making changes in an existing repo
+                since you will need to clone the repo before every push.
+            commit_message (`str`, *optional*):
+                Message to commit while pushing. Will default to `"add {object}"`.
+            organization (`str`, *optional*):
+                Organization in which you want to push your {object} (you must be a member of this organization).
+            private (`bool`, *optional*):
+                Whether or not the repository created should be private (requires a paying subscription).
+            use_auth_token (`bool` or `str`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+                when running `transformers-cli login` (stored in `~/.huggingface`). Will default to `True` if
+                `repo_url` is not specified.
+
+
+        Returns:
+            `str`: The url of the commit of your {object} in the given repository.
+
+        Examples:
+
+        ```python
+        from transformers import {object_class}
+
+        {object} = {object_class}.from_pretrained("bert-base-cased")
+
+        # Push the {object} to your namespace with the name "my-finetuned-bert" and have a local clone in the
+        # *my-finetuned-bert* folder.
+        {object}.push_to_hub("my-finetuned-bert")
+
+        # Push the {object} to your namespace with the name "my-finetuned-bert" with no local clone.
+        {object}.push_to_hub("my-finetuned-bert", use_temp_dir=True)
+
+        # Push the {object} to an organization with the name "my-finetuned-bert" and have a local clone in the
+        # *my-finetuned-bert* folder.
+        {object}.push_to_hub("my-finetuned-bert", organization="huggingface")
+
+        # Make a change to an existing repo that has been cloned locally in *my-finetuned-bert*.
+        {object}.push_to_hub("my-finetuned-bert", repo_url="https://huggingface.co/sgugger/my-finetuned-bert")
+        ```
+        """
+        if use_temp_dir:
+            # Make sure we use the right `repo_name` for the `repo_url` before replacing it.
+            if repo_url is None:
+                if use_auth_token is None:
+                    use_auth_token = True
+                repo_name = Path(repo_path_or_name).name
+                repo_url = self._get_repo_url_from_name(
+                    repo_name, organization=organization, private=private, use_auth_token=use_auth_token
+                )
+            repo_path_or_name = tempfile.mkdtemp()
+
+        # Create or clone the repo. If the repo is already cloned, this just retrieves the path to the repo.
+        repo = self._create_or_get_repo(
+            repo_path_or_name=repo_path_or_name,
+            repo_url=repo_url,
+            organization=organization,
+            private=private,
+            use_auth_token=use_auth_token,
+        )
+        # Save the files in the cloned repo
+        self.save_pretrained(repo_path_or_name)
+        if hasattr(self, "history") and hasattr(self, "create_model_card"):
+            # This is a Keras model and we might be able to fish out its History and make a model card out of it
+            base_model_card_args = {
+                "output_dir": repo_path_or_name,
+                "model_name": Path(repo_path_or_name).name,
+            }
+            base_model_card_args.update(model_card_kwargs)
+            self.create_model_card(**base_model_card_args)
+        # Commit and push!
+        url = self._push_to_hub(repo, commit_message=commit_message)
+
+        # Clean up! Clean up! Everybody everywhere!
+        if use_temp_dir:
+            shutil.rmtree(repo_path_or_name)
+
+        return url
+
+    @staticmethod
+    def _get_repo_url_from_name(
+        repo_name: str,
+        organization: Optional[str] = None,
+        private: bool = None,
+        use_auth_token: Optional[Union[bool, str]] = None,
+    ) -> str:
+        if isinstance(use_auth_token, str):
+            token = use_auth_token
+        elif use_auth_token:
+            token = HfFolder.get_token()
+            if token is None:
+                raise ValueError(
+                    "You must login to the Hugging Face hub on this computer by typing `transformers-cli login` and "
+                    "entering your credentials to use `use_auth_token=True`. Alternatively, you can pass your own "
+                    "token as the `use_auth_token` argument."
+                )
+        else:
+            token = None
+
+        # Special provision for the test endpoint (CI)
+        return create_repo(
+            token,
+            repo_name,
+            organization=organization,
+            private=private,
+            repo_type=None,
+            exist_ok=True,
+        )
+
+    @classmethod
+    def _create_or_get_repo(
+        cls,
+        repo_path_or_name: Optional[str] = None,
+        repo_url: Optional[str] = None,
+        organization: Optional[str] = None,
+        private: bool = None,
+        use_auth_token: Optional[Union[bool, str]] = None,
+    ) -> Repository:
+        if repo_path_or_name is None and repo_url is None:
+            raise ValueError("You need to specify a `repo_path_or_name` or a `repo_url`.")
+
+        if use_auth_token is None and repo_url is None:
+            use_auth_token = True
+
+        if repo_path_or_name is None:
+            repo_path_or_name = repo_url.split("/")[-1]
+
+        if repo_url is None and not os.path.exists(repo_path_or_name):
+            repo_name = Path(repo_path_or_name).name
+            repo_url = cls._get_repo_url_from_name(
+                repo_name, organization=organization, private=private, use_auth_token=use_auth_token
+            )
+
+        # Create a working directory if it does not exist.
+        if not os.path.exists(repo_path_or_name):
+            os.makedirs(repo_path_or_name)
+
+        repo = Repository(repo_path_or_name, clone_from=repo_url, use_auth_token=use_auth_token)
+        repo.git_pull()
+        return repo
+
+    @classmethod
+    def _push_to_hub(cls, repo: Repository, commit_message: Optional[str] = None) -> str:
+        if commit_message is None:
+            if "Tokenizer" in cls.__name__:
+                commit_message = "add tokenizer"
+            elif "Config" in cls.__name__:
+                commit_message = "add config"
+            else:
+                commit_message = "add model"
+
+        return repo.push_to_hub(commit_message=commit_message)
+
+
+def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
+    if token is None:
+        token = HfFolder.get_token()
+    if organization is None:
+        username = whoami(token)["name"]
+        return f"{username}/{model_id}"
+    else:
+        return f"{organization}/{model_id}"
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
new file mode 100644
index 0000000000..6207d0df7c
--- /dev/null
+++ b/src/transformers/utils/import_utils.py
@@ -0,0 +1,864 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Import utilities: Utilities related to imports and our lazy inits.
+"""
+
+import importlib.util
+import json
+import os
+import sys
+from collections import OrderedDict
+from functools import wraps
+from itertools import chain
+from types import ModuleType
+from typing import Any
+
+from packaging import version
+
+from transformers.utils.versions import importlib_metadata
+
+from . import logging
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
+ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
+
+USE_TF = os.environ.get("USE_TF", "AUTO").upper()
+USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
+USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper()
+
+_torch_version = "N/A"
+if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
+    _torch_available = importlib.util.find_spec("torch") is not None
+    if _torch_available:
+        try:
+            _torch_version = importlib_metadata.version("torch")
+            logger.info(f"PyTorch version {_torch_version} available.")
+        except importlib_metadata.PackageNotFoundError:
+            _torch_available = False
+else:
+    logger.info("Disabling PyTorch because USE_TF is set")
+    _torch_available = False
+
+
+_tf_version = "N/A"
+if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
+    _tf_available = importlib.util.find_spec("tensorflow") is not None
+    if _tf_available:
+        candidates = (
+            "tensorflow",
+            "tensorflow-cpu",
+            "tensorflow-gpu",
+            "tf-nightly",
+            "tf-nightly-cpu",
+            "tf-nightly-gpu",
+            "intel-tensorflow",
+            "intel-tensorflow-avx512",
+            "tensorflow-rocm",
+            "tensorflow-macos",
+        )
+        _tf_version = None
+        # For the metadata, we have to look for both tensorflow and tensorflow-cpu
+        for pkg in candidates:
+            try:
+                _tf_version = importlib_metadata.version(pkg)
+                break
+            except importlib_metadata.PackageNotFoundError:
+                pass
+        _tf_available = _tf_version is not None
+    if _tf_available:
+        if version.parse(_tf_version) < version.parse("2"):
+            logger.info(f"TensorFlow found but with version {_tf_version}. Transformers requires version 2 minimum.")
+            _tf_available = False
+        else:
+            logger.info(f"TensorFlow version {_tf_version} available.")
+else:
+    logger.info("Disabling Tensorflow because USE_TORCH is set")
+    _tf_available = False
+
+
+if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
+    _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
+    if _flax_available:
+        try:
+            _jax_version = importlib_metadata.version("jax")
+            _flax_version = importlib_metadata.version("flax")
+            logger.info(f"JAX version {_jax_version}, Flax version {_flax_version} available.")
+        except importlib_metadata.PackageNotFoundError:
+            _flax_available = False
+else:
+    _flax_available = False
+
+
+_datasets_available = importlib.util.find_spec("datasets") is not None
+try:
+    # Check we're not importing a "datasets" directory somewhere but the actual library by trying to grab the version
+    # AND checking it has an author field in the metadata that is HuggingFace.
+    _ = importlib_metadata.version("datasets")
+    _datasets_metadata = importlib_metadata.metadata("datasets")
+    if _datasets_metadata.get("author", "") != "HuggingFace Inc.":
+        _datasets_available = False
+except importlib_metadata.PackageNotFoundError:
+    _datasets_available = False
+
+
+_detectron2_available = importlib.util.find_spec("detectron2") is not None
+try:
+    _detectron2_version = importlib_metadata.version("detectron2")
+    logger.debug(f"Successfully imported detectron2 version {_detectron2_version}")
+except importlib_metadata.PackageNotFoundError:
+    _detectron2_available = False
+
+
+_faiss_available = importlib.util.find_spec("faiss") is not None
+try:
+    _faiss_version = importlib_metadata.version("faiss")
+    logger.debug(f"Successfully imported faiss version {_faiss_version}")
+except importlib_metadata.PackageNotFoundError:
+    try:
+        _faiss_version = importlib_metadata.version("faiss-cpu")
+        logger.debug(f"Successfully imported faiss version {_faiss_version}")
+    except importlib_metadata.PackageNotFoundError:
+        _faiss_available = False
+
+_ftfy_available = importlib.util.find_spec("ftfy") is not None
+try:
+    _ftfy_version = importlib_metadata.version("ftfy")
+    logger.debug(f"Successfully imported ftfy version {_ftfy_version}")
+except importlib_metadata.PackageNotFoundError:
+    _ftfy_available = False
+
+
+coloredlogs = importlib.util.find_spec("coloredlogs") is not None
+try:
+    _coloredlogs_available = importlib_metadata.version("coloredlogs")
+    logger.debug(f"Successfully imported sympy version {_coloredlogs_available}")
+except importlib_metadata.PackageNotFoundError:
+    _coloredlogs_available = False
+
+
+sympy_available = importlib.util.find_spec("sympy") is not None
+try:
+    _sympy_available = importlib_metadata.version("sympy")
+    logger.debug(f"Successfully imported sympy version {_sympy_available}")
+except importlib_metadata.PackageNotFoundError:
+    _sympy_available = False
+
+
+_tf2onnx_available = importlib.util.find_spec("tf2onnx") is not None
+try:
+    _tf2onnx_version = importlib_metadata.version("tf2onnx")
+    logger.debug(f"Successfully imported tf2onnx version {_tf2onnx_version}")
+except importlib_metadata.PackageNotFoundError:
+    _tf2onnx_available = False
+
+_onnx_available = importlib.util.find_spec("onnxruntime") is not None
+try:
+    _onxx_version = importlib_metadata.version("onnx")
+    logger.debug(f"Successfully imported onnx version {_onxx_version}")
+except importlib_metadata.PackageNotFoundError:
+    _onnx_available = False
+
+
+_scatter_available = importlib.util.find_spec("torch_scatter") is not None
+try:
+    _scatter_version = importlib_metadata.version("torch_scatter")
+    logger.debug(f"Successfully imported torch-scatter version {_scatter_version}")
+except importlib_metadata.PackageNotFoundError:
+    _scatter_available = False
+
+
+_pytorch_quantization_available = importlib.util.find_spec("pytorch_quantization") is not None
+try:
+    _pytorch_quantization_version = importlib_metadata.version("pytorch_quantization")
+    logger.debug(f"Successfully imported pytorch-quantization version {_pytorch_quantization_version}")
+except importlib_metadata.PackageNotFoundError:
+    _pytorch_quantization_available = False
+
+
+_soundfile_available = importlib.util.find_spec("soundfile") is not None
+try:
+    _soundfile_version = importlib_metadata.version("soundfile")
+    logger.debug(f"Successfully imported soundfile version {_soundfile_version}")
+except importlib_metadata.PackageNotFoundError:
+    _soundfile_available = False
+
+
+_tensorflow_probability_available = importlib.util.find_spec("tensorflow_probability") is not None
+try:
+    _tensorflow_probability_version = importlib_metadata.version("tensorflow_probability")
+    logger.debug(f"Successfully imported tensorflow-probability version {_tensorflow_probability_version}")
+except importlib_metadata.PackageNotFoundError:
+    _tensorflow_probability_available = False
+
+
+_timm_available = importlib.util.find_spec("timm") is not None
+try:
+    _timm_version = importlib_metadata.version("timm")
+    logger.debug(f"Successfully imported timm version {_timm_version}")
+except importlib_metadata.PackageNotFoundError:
+    _timm_available = False
+
+
+_torchaudio_available = importlib.util.find_spec("torchaudio") is not None
+try:
+    _torchaudio_version = importlib_metadata.version("torchaudio")
+    logger.debug(f"Successfully imported torchaudio version {_torchaudio_version}")
+except importlib_metadata.PackageNotFoundError:
+    _torchaudio_available = False
+
+
+_phonemizer_available = importlib.util.find_spec("phonemizer") is not None
+try:
+    _phonemizer_version = importlib_metadata.version("phonemizer")
+    logger.debug(f"Successfully imported phonemizer version {_phonemizer_version}")
+except importlib_metadata.PackageNotFoundError:
+    _phonemizer_available = False
+
+
+_pyctcdecode_available = importlib.util.find_spec("pyctcdecode") is not None
+try:
+    _pyctcdecode_version = importlib_metadata.version("pyctcdecode")
+    logger.debug(f"Successfully imported pyctcdecode version {_pyctcdecode_version}")
+except importlib_metadata.PackageNotFoundError:
+    _pyctcdecode_available = False
+
+
+_librosa_available = importlib.util.find_spec("librosa") is not None
+try:
+    _librosa_version = importlib_metadata.version("librosa")
+    logger.debug(f"Successfully imported librosa version {_librosa_version}")
+except importlib_metadata.PackageNotFoundError:
+    _librosa_available = False
+
+
+# This is the version of torch required to run torch.fx features and torch.onnx with dictionary inputs.
+TORCH_FX_REQUIRED_VERSION = version.parse("1.10")
+TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION = version.parse("1.8")
+
+
+def is_torch_available():
+    return _torch_available
+
+
+def is_pyctcdecode_available():
+    return _pyctcdecode_available
+
+
+def is_librosa_available():
+    return _librosa_available
+
+
+def is_torch_cuda_available():
+    if is_torch_available():
+        import torch
+
+        return torch.cuda.is_available()
+    else:
+        return False
+
+
+def is_torch_bf16_available():
+    if not is_torch_available():
+        return False
+
+    import torch
+
+    # since currently no utility function is available we build our own.
+    # some bits come from https://github.com/pytorch/pytorch/blob/2289a12f21c54da93bf5d696e3f9aea83dd9c10d/torch/testing/_internal/common_cuda.py#L51
+    # with additional check for torch version
+    # to succeed:
+    # 1. the hardware needs to support bf16 (arch >= Ampere)
+    # 2. torch >= 1.10 (1.9 should be enough for AMP API has changed in 1.10, so using 1.10 as minimal)
+    # 3. CUDA >= 11
+    # 4. torch.autocast exists
+    # XXX: one problem here is that it may give invalid results on mixed gpus setup, so it's
+    # really only correct for the 0th gpu (or currently set default device if different from 0)
+
+    if not torch.cuda.is_available() or torch.version.cuda is None:
+        return False
+    if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
+        return False
+    if int(torch.version.cuda.split(".")[0]) < 11:
+        return False
+    if version.parse(torch.__version__) < version.parse("1.10"):
+        return False
+    if not hasattr(torch, "autocast"):
+        return False
+
+    return True
+
+
+def is_torch_tf32_available():
+    if not is_torch_available():
+        return False
+
+    import torch
+
+    if not torch.cuda.is_available() or torch.version.cuda is None:
+        return False
+    if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
+        return False
+    if int(torch.version.cuda.split(".")[0]) < 11:
+        return False
+    if version.parse(torch.__version__) < version.parse("1.7"):
+        return False
+
+    return True
+
+
+torch_version = None
+_torch_fx_available = _torch_onnx_dict_inputs_support_available = False
+if _torch_available:
+    torch_version = version.parse(importlib_metadata.version("torch"))
+    _torch_fx_available = (torch_version.major, torch_version.minor) == (
+        TORCH_FX_REQUIRED_VERSION.major,
+        TORCH_FX_REQUIRED_VERSION.minor,
+    )
+
+    _torch_onnx_dict_inputs_support_available = torch_version >= TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION
+
+
+def is_torch_fx_available():
+    return _torch_fx_available
+
+
+def is_torch_onnx_dict_inputs_support_available():
+    return _torch_onnx_dict_inputs_support_available
+
+
+def is_tf_available():
+    return _tf_available
+
+
+def is_coloredlogs_available():
+    return _coloredlogs_available
+
+
+def is_tf2onnx_available():
+    return _tf2onnx_available
+
+
+def is_onnx_available():
+    return _onnx_available
+
+
+def is_flax_available():
+    return _flax_available
+
+
+def is_ftfy_available():
+    return _ftfy_available
+
+
+def is_torch_tpu_available():
+    if not _torch_available:
+        return False
+    # This test is probably enough, but just in case, we unpack a bit.
+    if importlib.util.find_spec("torch_xla") is None:
+        return False
+    if importlib.util.find_spec("torch_xla.core") is None:
+        return False
+    return importlib.util.find_spec("torch_xla.core.xla_model") is not None
+
+
+def is_datasets_available():
+    return _datasets_available
+
+
+def is_detectron2_available():
+    return _detectron2_available
+
+
+def is_rjieba_available():
+    return importlib.util.find_spec("rjieba") is not None
+
+
+def is_psutil_available():
+    return importlib.util.find_spec("psutil") is not None
+
+
+def is_py3nvml_available():
+    return importlib.util.find_spec("py3nvml") is not None
+
+
+def is_apex_available():
+    return importlib.util.find_spec("apex") is not None
+
+
+def is_faiss_available():
+    return _faiss_available
+
+
+def is_scipy_available():
+    return importlib.util.find_spec("scipy") is not None
+
+
+def is_sklearn_available():
+    if importlib.util.find_spec("sklearn") is None:
+        return False
+    return is_scipy_available() and importlib.util.find_spec("sklearn.metrics")
+
+
+def is_sentencepiece_available():
+    return importlib.util.find_spec("sentencepiece") is not None
+
+
+def is_protobuf_available():
+    if importlib.util.find_spec("google") is None:
+        return False
+    return importlib.util.find_spec("google.protobuf") is not None
+
+
+def is_tokenizers_available():
+    return importlib.util.find_spec("tokenizers") is not None
+
+
+def is_vision_available():
+    return importlib.util.find_spec("PIL") is not None
+
+
+def is_pytesseract_available():
+    return importlib.util.find_spec("pytesseract") is not None
+
+
+def is_spacy_available():
+    return importlib.util.find_spec("spacy") is not None
+
+
+def is_in_notebook():
+    try:
+        # Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py
+        get_ipython = sys.modules["IPython"].get_ipython
+        if "IPKernelApp" not in get_ipython().config:
+            raise ImportError("console")
+        if "VSCODE_PID" in os.environ:
+            raise ImportError("vscode")
+
+        return importlib.util.find_spec("IPython") is not None
+    except (AttributeError, ImportError, KeyError):
+        return False
+
+
+def is_scatter_available():
+    return _scatter_available
+
+
+def is_pytorch_quantization_available():
+    return _pytorch_quantization_available
+
+
+def is_tensorflow_probability_available():
+    return _tensorflow_probability_available
+
+
+def is_pandas_available():
+    return importlib.util.find_spec("pandas") is not None
+
+
+def is_sagemaker_dp_enabled():
+    # Get the sagemaker specific env variable.
+    sagemaker_params = os.getenv("SM_FRAMEWORK_PARAMS", "{}")
+    try:
+        # Parse it and check the field "sagemaker_distributed_dataparallel_enabled".
+        sagemaker_params = json.loads(sagemaker_params)
+        if not sagemaker_params.get("sagemaker_distributed_dataparallel_enabled", False):
+            return False
+    except json.JSONDecodeError:
+        return False
+    # Lastly, check if the `smdistributed` module is present.
+    return importlib.util.find_spec("smdistributed") is not None
+
+
+def is_sagemaker_mp_enabled():
+    # Get the sagemaker specific mp parameters from smp_options variable.
+    smp_options = os.getenv("SM_HP_MP_PARAMETERS", "{}")
+    try:
+        # Parse it and check the field "partitions" is included, it is required for model parallel.
+        smp_options = json.loads(smp_options)
+        if "partitions" not in smp_options:
+            return False
+    except json.JSONDecodeError:
+        return False
+
+    # Get the sagemaker specific framework parameters from mpi_options variable.
+    mpi_options = os.getenv("SM_FRAMEWORK_PARAMS", "{}")
+    try:
+        # Parse it and check the field "sagemaker_distributed_dataparallel_enabled".
+        mpi_options = json.loads(mpi_options)
+        if not mpi_options.get("sagemaker_mpi_enabled", False):
+            return False
+    except json.JSONDecodeError:
+        return False
+    # Lastly, check if the `smdistributed` module is present.
+    return importlib.util.find_spec("smdistributed") is not None
+
+
+def is_training_run_on_sagemaker():
+    return "SAGEMAKER_JOB_NAME" in os.environ
+
+
+def is_soundfile_availble():
+    return _soundfile_available
+
+
+def is_timm_available():
+    return _timm_available
+
+
+def is_torchaudio_available():
+    return _torchaudio_available
+
+
+def is_speech_available():
+    # For now this depends on torchaudio but the exact dependency might evolve in the future.
+    return _torchaudio_available
+
+
+def is_phonemizer_available():
+    return _phonemizer_available
+
+
+def torch_only_method(fn):
+    def wrapper(*args, **kwargs):
+        if not _torch_available:
+            raise ImportError(
+                "You need to install pytorch to use this method or class, "
+                "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
+            )
+        else:
+            return fn(*args, **kwargs)
+
+    return wrapper
+
+
+# docstyle-ignore
+DATASETS_IMPORT_ERROR = """
+{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
+```
+pip install datasets
+```
+In a notebook or a colab, you can install it by executing a cell with
+```
+!pip install datasets
+```
+then restarting your kernel.
+
+Note that if you have a local folder named `datasets` or a local python file named `datasets.py` in your current
+working directory, python may try to import this instead of the 🤗 Datasets library. You should rename this folder or
+that python file if that's the case.
+"""
+
+
+# docstyle-ignore
+TOKENIZERS_IMPORT_ERROR = """
+{0} requires the 🤗 Tokenizers library but it was not found in your environment. You can install it with:
+```
+pip install tokenizers
+```
+In a notebook or a colab, you can install it by executing a cell with
+```
+!pip install tokenizers
+```
+"""
+
+
+# docstyle-ignore
+SENTENCEPIECE_IMPORT_ERROR = """
+{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
+installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
+that match your environment.
+"""
+
+
+# docstyle-ignore
+PROTOBUF_IMPORT_ERROR = """
+{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the
+installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
+that match your environment.
+"""
+
+
+# docstyle-ignore
+FAISS_IMPORT_ERROR = """
+{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
+installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
+that match your environment.
+"""
+
+
+# docstyle-ignore
+PYTORCH_IMPORT_ERROR = """
+{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
+installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
+"""
+
+
+# docstyle-ignore
+SKLEARN_IMPORT_ERROR = """
+{0} requires the scikit-learn library but it was not found in your environment. You can install it with:
+```
+pip install -U scikit-learn
+```
+In a notebook or a colab, you can install it by executing a cell with
+```
+!pip install -U scikit-learn
+```
+"""
+
+
+# docstyle-ignore
+TENSORFLOW_IMPORT_ERROR = """
+{0} requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the
+installation page: https://www.tensorflow.org/install and follow the ones that match your environment.
+"""
+
+
+# docstyle-ignore
+DETECTRON2_IMPORT_ERROR = """
+{0} requires the detectron2 library but it was not found in your environment. Checkout the instructions on the
+installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones
+that match your environment.
+"""
+
+
+# docstyle-ignore
+FLAX_IMPORT_ERROR = """
+{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
+installation page: https://github.com/google/flax and follow the ones that match your environment.
+"""
+
+# docstyle-ignore
+FTFY_IMPORT_ERROR = """
+{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the
+installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones
+that match your environment.
+"""
+
+
+# docstyle-ignore
+SCATTER_IMPORT_ERROR = """
+{0} requires the torch-scatter library but it was not found in your environment. You can install it with pip as
+explained here: https://github.com/rusty1s/pytorch_scatter.
+"""
+
+# docstyle-ignore
+PYTORCH_QUANTIZATION_IMPORT_ERROR = """
+{0} requires the pytorch-quantization library but it was not found in your environment. You can install it with pip:
+`pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`
+"""
+
+# docstyle-ignore
+TENSORFLOW_PROBABILITY_IMPORT_ERROR = """
+{0} requires the tensorflow_probability library but it was not found in your environment. You can install it with pip as
+explained here: https://github.com/tensorflow/probability.
+"""
+
+
+# docstyle-ignore
+PANDAS_IMPORT_ERROR = """
+{0} requires the pandas library but it was not found in your environment. You can install it with pip as
+explained here: https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html.
+"""
+
+
+# docstyle-ignore
+PHONEMIZER_IMPORT_ERROR = """
+{0} requires the phonemizer library but it was not found in your environment. You can install it with pip:
+`pip install phonemizer`
+"""
+
+
+# docstyle-ignore
+SCIPY_IMPORT_ERROR = """
+{0} requires the scipy library but it was not found in your environment. You can install it with pip:
+`pip install scipy`
+"""
+
+
+# docstyle-ignore
+SPEECH_IMPORT_ERROR = """
+{0} requires the torchaudio library but it was not found in your environment. You can install it with pip:
+`pip install torchaudio`
+"""
+
+# docstyle-ignore
+TIMM_IMPORT_ERROR = """
+{0} requires the timm library but it was not found in your environment. You can install it with pip:
+`pip install timm`
+"""
+
+# docstyle-ignore
+VISION_IMPORT_ERROR = """
+{0} requires the PIL library but it was not found in your environment. You can install it with pip:
+`pip install pillow`
+"""
+
+
+# docstyle-ignore
+PYTESSERACT_IMPORT_ERROR = """
+{0} requires the PyTesseract library but it was not found in your environment. You can install it with pip:
+`pip install pytesseract`
+"""
+
+# docstyle-ignore
+PYCTCDECODE_IMPORT_ERROR = """
+{0} requires the pyctcdecode library but it was not found in your environment. You can install it with pip:
+`pip install pyctcdecode`
+"""
+
+
+BACKENDS_MAPPING = OrderedDict(
+    [
+        ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)),
+        ("detectron2", (is_detectron2_available, DETECTRON2_IMPORT_ERROR)),
+        ("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)),
+        ("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
+        ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
+        ("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
+        ("phonemizer", (is_phonemizer_available, PHONEMIZER_IMPORT_ERROR)),
+        ("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)),
+        ("pyctcdecode", (is_pyctcdecode_available, PYCTCDECODE_IMPORT_ERROR)),
+        ("pytesseract", (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR)),
+        ("scatter", (is_scatter_available, SCATTER_IMPORT_ERROR)),
+        ("pytorch_quantization", (is_pytorch_quantization_available, PYTORCH_QUANTIZATION_IMPORT_ERROR)),
+        ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)),
+        ("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)),
+        ("speech", (is_speech_available, SPEECH_IMPORT_ERROR)),
+        ("tensorflow_probability", (is_tensorflow_probability_available, TENSORFLOW_PROBABILITY_IMPORT_ERROR)),
+        ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
+        ("timm", (is_timm_available, TIMM_IMPORT_ERROR)),
+        ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
+        ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
+        ("vision", (is_vision_available, VISION_IMPORT_ERROR)),
+        ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
+    ]
+)
+
+
+def requires_backends(obj, backends):
+    if not isinstance(backends, (list, tuple)):
+        backends = [backends]
+
+    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
+    checks = (BACKENDS_MAPPING[backend] for backend in backends)
+    failed = [msg.format(name) for available, msg in checks if not available()]
+    if failed:
+        raise ImportError("".join(failed))
+
+
+class DummyObject(type):
+    """
+    Metaclass for the dummy objects. Any class inheriting from it will return the ImportError generated by
+    `requires_backend` each time a user tries to access any method of that class.
+    """
+
+    def __getattr__(cls, key):
+        if key.startswith("_"):
+            return super().__getattr__(cls, key)
+        requires_backends(cls, cls._backends)
+
+
+def torch_required(func):
+    # Chose a different decorator name than in tests so it's clear they are not the same.
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        if is_torch_available():
+            return func(*args, **kwargs)
+        else:
+            raise ImportError(f"Method `{func.__name__}` requires PyTorch.")
+
+    return wrapper
+
+
+def tf_required(func):
+    # Chose a different decorator name than in tests so it's clear they are not the same.
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        if is_tf_available():
+            return func(*args, **kwargs)
+        else:
+            raise ImportError(f"Method `{func.__name__}` requires TF.")
+
+    return wrapper
+
+
+def is_torch_fx_proxy(x):
+    if is_torch_fx_available():
+        import torch.fx
+
+        return isinstance(x, torch.fx.Proxy)
+    return False
+
+
+class _LazyModule(ModuleType):
+    """
+    Module class that surfaces all objects but only performs associated imports when the objects are requested.
+    """
+
+    # Very heavily inspired by optuna.integration._IntegrationModule
+    # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py
+    def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None):
+        super().__init__(name)
+        self._modules = set(import_structure.keys())
+        self._class_to_module = {}
+        for key, values in import_structure.items():
+            for value in values:
+                self._class_to_module[value] = key
+        # Needed for autocompletion in an IDE
+        self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values()))
+        self.__file__ = module_file
+        self.__spec__ = module_spec
+        self.__path__ = [os.path.dirname(module_file)]
+        self._objects = {} if extra_objects is None else extra_objects
+        self._name = name
+        self._import_structure = import_structure
+
+    # Needed for autocompletion in an IDE
+    def __dir__(self):
+        result = super().__dir__()
+        # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether
+        # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir.
+        for attr in self.__all__:
+            if attr not in result:
+                result.append(attr)
+        return result
+
+    def __getattr__(self, name: str) -> Any:
+        if name in self._objects:
+            return self._objects[name]
+        if name in self._modules:
+            value = self._get_module(name)
+        elif name in self._class_to_module.keys():
+            module = self._get_module(self._class_to_module[name])
+            value = getattr(module, name)
+        else:
+            raise AttributeError(f"module {self.__name__} has no attribute {name}")
+
+        setattr(self, name, value)
+        return value
+
+    def _get_module(self, module_name: str):
+        try:
+            return importlib.import_module("." + module_name, self.__name__)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}"
+            ) from e
+
+    def __reduce__(self):
+        return (self.__class__, (self._name, self.__file__, self._import_structure))
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py
index 323c3f2377..afcfeb87eb 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/__init__.py
@@ -18,15 +18,15 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...file_utils import _LazyModule, is_tokenizers_available
+from ...utils import _LazyModule, is_tokenizers_available
 {%- if "TensorFlow" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-from ...file_utils import is_tf_available
+from ...utils import is_tf_available
 {% endif %}
 {%- if "PyTorch" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-from ...file_utils import is_torch_available
+from ...utils import is_torch_available
 {% endif %}
 {%- if "Flax" in cookiecutter.generate_tensorflow_pytorch_and_flax %}
-from ...file_utils import is_flax_available
+from ...utils import is_flax_available
 {% endif %}
 
 _import_structure = {
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_flax_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_flax_{{cookiecutter.lowercase_modelname}}.py
index 7c33e637c9..500a5ccf89 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_flax_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_flax_{{cookiecutter.lowercase_modelname}}.py
@@ -27,7 +27,7 @@ from flax.core.frozen_dict import FrozenDict
 from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPooling,
@@ -1136,7 +1136,7 @@ from flax.linen.attention import dot_product_attention_weights
 from jax import lax
 from jax.random import PRNGKey
 
-from ...file_utils import add_start_docstrings, replace_return_docstrings
+from ...utils import add_start_docstrings, replace_return_docstrings
 from ...modeling_flax_outputs import (
     FlaxBaseModelOutput,
     FlaxBaseModelOutputWithPastAndCrossAttentions,
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
index 25afc22d6c..b0341d157f 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
@@ -23,7 +23,7 @@ import numpy as np
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
+from ...utils import (
     DUMMY_INPUTS,
     MULTIPLE_CHOICE_DUMMY_INPUTS,
     add_code_sample_docstrings,
@@ -1782,7 +1782,7 @@ from typing import Optional, Tuple, Union
 import tensorflow as tf
 
 from ...activations_tf import get_tf_activation
-from ...file_utils import (
+from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
index 0dd72ae8a2..a36c2f068b 100755
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
@@ -28,7 +28,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from typing import Optional, Tuple, Union
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -1579,7 +1579,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...file_utils import (
+from ...utils import (
     add_code_sample_docstrings,
     add_end_docstrings,
     add_start_docstrings,
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
index 1f4826c103..e15adc91e3 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
@@ -486,7 +486,7 @@ import tempfile
 import unittest
 
 from transformers import is_torch_available
-from transformers.file_utils import cached_property
+from transformers.utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
 
 from ..test_configuration_common import ConfigTester
diff --git a/tests/auto/test_processor_auto.py b/tests/auto/test_processor_auto.py
index 3aa3409ee3..02fa8696c4 100644
--- a/tests/auto/test_processor_auto.py
+++ b/tests/auto/test_processor_auto.py
@@ -36,9 +36,9 @@ from transformers import (
     Wav2Vec2FeatureExtractor,
     Wav2Vec2Processor,
 )
-from transformers.file_utils import FEATURE_EXTRACTOR_NAME, is_tokenizers_available
 from transformers.testing_utils import PASS, USER, is_staging_test
 from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
+from transformers.utils import FEATURE_EXTRACTOR_NAME, is_tokenizers_available
 
 
 sys.path.append(str(Path(__file__).parent.parent.parent / "utils"))
diff --git a/tests/bart/test_modeling_bart.py b/tests/bart/test_modeling_bart.py
index 1cee64c790..db443164c5 100644
--- a/tests/bart/test_modeling_bart.py
+++ b/tests/bart/test_modeling_bart.py
@@ -22,8 +22,8 @@ import unittest
 import timeout_decorator  # noqa
 
 from transformers import BartConfig, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/bart/test_modeling_tf_bart.py b/tests/bart/test_modeling_tf_bart.py
index 417f6edcaf..070d16e56b 100644
--- a/tests/bart/test_modeling_tf_bart.py
+++ b/tests/bart/test_modeling_tf_bart.py
@@ -18,8 +18,8 @@ import unittest
 import numpy as np
 
 from transformers import BartConfig, BartTokenizer, is_tf_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_tf, slow
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, ids_tensor
diff --git a/tests/bart/test_tokenization_bart.py b/tests/bart/test_tokenization_bart.py
index a4cdff3119..66e5e0b9e3 100644
--- a/tests/bart/test_tokenization_bart.py
+++ b/tests/bart/test_tokenization_bart.py
@@ -16,9 +16,9 @@ import os
 import unittest
 
 from transformers import BartTokenizer, BartTokenizerFast, BatchEncoding
-from transformers.file_utils import cached_property
 from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_tokenizers, require_torch
+from transformers.utils import cached_property
 
 from ..test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors
 
diff --git a/tests/beit/test_feature_extraction_beit.py b/tests/beit/test_feature_extraction_beit.py
index aef4de9fbf..71d78a2660 100644
--- a/tests/beit/test_feature_extraction_beit.py
+++ b/tests/beit/test_feature_extraction_beit.py
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 from datasets import load_dataset
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/beit/test_modeling_beit.py b/tests/beit/test_modeling_beit.py
index e7eec83fcd..fb6e7a40d4 100644
--- a/tests/beit/test_modeling_beit.py
+++ b/tests/beit/test_modeling_beit.py
@@ -22,9 +22,9 @@ from datasets import load_dataset
 from packaging import version
 
 from transformers import BeitConfig
-from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
diff --git a/tests/beit/test_modeling_flax_beit.py b/tests/beit/test_modeling_flax_beit.py
index 12ff97ecab..a1d1fe093b 100644
--- a/tests/beit/test_modeling_flax_beit.py
+++ b/tests/beit/test_modeling_flax_beit.py
@@ -18,8 +18,8 @@ import unittest
 import numpy as np
 
 from transformers import BeitConfig
-from transformers.file_utils import cached_property, is_flax_available, is_vision_available
 from transformers.testing_utils import require_flax, require_vision, slow
+from transformers.utils import cached_property, is_flax_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_flax_common import FlaxModelTesterMixin, floats_tensor, ids_tensor
diff --git a/tests/bert_generation/test_tokenization_bert_generation.py b/tests/bert_generation/test_tokenization_bert_generation.py
index 20db7c01ca..d21589526b 100644
--- a/tests/bert_generation/test_tokenization_bert_generation.py
+++ b/tests/bert_generation/test_tokenization_bert_generation.py
@@ -18,8 +18,8 @@ import unittest
 from os.path import dirname
 
 from transformers import BertGenerationTokenizer
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_torch, slow
+from transformers.utils import cached_property
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/big_bird/test_tokenization_big_bird.py b/tests/big_bird/test_tokenization_big_bird.py
index bbeb009572..d1d0daeda7 100644
--- a/tests/big_bird/test_tokenization_big_bird.py
+++ b/tests/big_bird/test_tokenization_big_bird.py
@@ -18,8 +18,8 @@ import unittest
 from os.path import dirname
 
 from transformers import BigBirdTokenizer, BigBirdTokenizerFast
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow
+from transformers.utils import cached_property
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/blenderbot/test_modeling_blenderbot.py b/tests/blenderbot/test_modeling_blenderbot.py
index c0b2ef01f2..4a8070bfa7 100644
--- a/tests/blenderbot/test_modeling_blenderbot.py
+++ b/tests/blenderbot/test_modeling_blenderbot.py
@@ -18,8 +18,8 @@ import tempfile
 import unittest
 
 from transformers import BlenderbotConfig, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/blenderbot/test_modeling_tf_blenderbot.py b/tests/blenderbot/test_modeling_tf_blenderbot.py
index 3d0e8fc436..e9e9816b17 100644
--- a/tests/blenderbot/test_modeling_tf_blenderbot.py
+++ b/tests/blenderbot/test_modeling_tf_blenderbot.py
@@ -17,8 +17,8 @@
 import unittest
 
 from transformers import BlenderbotConfig, BlenderbotTokenizer, is_tf_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_tf, require_tokenizers, slow
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, ids_tensor
diff --git a/tests/blenderbot/test_tokenization_blenderbot.py b/tests/blenderbot/test_tokenization_blenderbot.py
index 93f29634ef..3a9c95027b 100644
--- a/tests/blenderbot/test_tokenization_blenderbot.py
+++ b/tests/blenderbot/test_tokenization_blenderbot.py
@@ -17,7 +17,7 @@
 import unittest
 
 from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast
-from transformers.file_utils import cached_property
+from transformers.utils import cached_property
 
 
 class Blenderbot3BTokenizerTests(unittest.TestCase):
diff --git a/tests/blenderbot_small/test_modeling_blenderbot_small.py b/tests/blenderbot_small/test_modeling_blenderbot_small.py
index fa6207ef54..7e8ce4b624 100644
--- a/tests/blenderbot_small/test_modeling_blenderbot_small.py
+++ b/tests/blenderbot_small/test_modeling_blenderbot_small.py
@@ -18,8 +18,8 @@ import tempfile
 import unittest
 
 from transformers import BlenderbotSmallConfig, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/blenderbot_small/test_modeling_tf_blenderbot_small.py b/tests/blenderbot_small/test_modeling_tf_blenderbot_small.py
index 6a3eeb826d..33c5b286c9 100644
--- a/tests/blenderbot_small/test_modeling_tf_blenderbot_small.py
+++ b/tests/blenderbot_small/test_modeling_tf_blenderbot_small.py
@@ -17,8 +17,8 @@
 import unittest
 
 from transformers import BlenderbotSmallConfig, BlenderbotSmallTokenizer, is_tf_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_tf, require_tokenizers, slow
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, ids_tensor
diff --git a/tests/byt5/test_tokenization_byt5.py b/tests/byt5/test_tokenization_byt5.py
index f49912e4c5..afdcae0ee3 100644
--- a/tests/byt5/test_tokenization_byt5.py
+++ b/tests/byt5/test_tokenization_byt5.py
@@ -22,7 +22,7 @@ import unittest
 from typing import Tuple
 
 from transformers import AddedToken, BatchEncoding, ByT5Tokenizer
-from transformers.file_utils import cached_property, is_tf_available, is_torch_available
+from transformers.utils import cached_property, is_tf_available, is_torch_available
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/camembert/test_tokenization_camembert.py b/tests/camembert/test_tokenization_camembert.py
index 1f45b21a20..a2274e3b05 100644
--- a/tests/camembert/test_tokenization_camembert.py
+++ b/tests/camembert/test_tokenization_camembert.py
@@ -18,8 +18,8 @@ import unittest
 from os.path import dirname
 
 from transformers import CamembertTokenizer, CamembertTokenizerFast
-from transformers.file_utils import is_torch_available
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
+from transformers.utils import is_torch_available
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/canine/test_tokenization_canine.py b/tests/canine/test_tokenization_canine.py
index 2171013280..d894237ff5 100644
--- a/tests/canine/test_tokenization_canine.py
+++ b/tests/canine/test_tokenization_canine.py
@@ -20,9 +20,9 @@ import tempfile
 import unittest
 
 from transformers import BatchEncoding, CanineTokenizer
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_tokenizers, require_torch
 from transformers.tokenization_utils import AddedToken
+from transformers.utils import cached_property
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/clip/test_feature_extraction_clip.py b/tests/clip/test_feature_extraction_clip.py
index 915600cb30..85cb54510c 100644
--- a/tests/clip/test_feature_extraction_clip.py
+++ b/tests/clip/test_feature_extraction_clip.py
@@ -18,8 +18,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin
 
diff --git a/tests/clip/test_modeling_clip.py b/tests/clip/test_modeling_clip.py
index aab17d3f75..09cc6cd61a 100644
--- a/tests/clip/test_modeling_clip.py
+++ b/tests/clip/test_modeling_clip.py
@@ -25,7 +25,6 @@ import numpy as np
 import requests
 import transformers
 from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import (
     is_flax_available,
     is_pt_flax_cross_test,
@@ -35,6 +34,7 @@ from transformers.testing_utils import (
     slow,
     torch_device,
 )
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import (
diff --git a/tests/clip/test_modeling_tf_clip.py b/tests/clip/test_modeling_tf_clip.py
index 564543dc67..02e289cd5b 100644
--- a/tests/clip/test_modeling_tf_clip.py
+++ b/tests/clip/test_modeling_tf_clip.py
@@ -23,8 +23,8 @@ from importlib import import_module
 
 import requests
 from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
-from transformers.file_utils import is_tf_available, is_vision_available
 from transformers.testing_utils import is_pt_tf_cross_test, require_tf, require_vision, slow
+from transformers.utils import is_tf_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
diff --git a/tests/clip/test_processor_clip.py b/tests/clip/test_processor_clip.py
index 17520a4ad9..51a0236b90 100644
--- a/tests/clip/test_processor_clip.py
+++ b/tests/clip/test_processor_clip.py
@@ -22,9 +22,9 @@ import numpy as np
 import pytest
 
 from transformers import CLIPTokenizer, CLIPTokenizerFast
-from transformers.file_utils import FEATURE_EXTRACTOR_NAME, is_vision_available
 from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_vision
+from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
 
 
 if is_vision_available():
diff --git a/tests/convnext/test_feature_extraction_convnext.py b/tests/convnext/test_feature_extraction_convnext.py
index 47d8298dbb..8127d468b3 100644
--- a/tests/convnext/test_feature_extraction_convnext.py
+++ b/tests/convnext/test_feature_extraction_convnext.py
@@ -18,8 +18,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/convnext/test_modeling_convnext.py b/tests/convnext/test_modeling_convnext.py
index 00f23c23db..ab01d83907 100644
--- a/tests/convnext/test_modeling_convnext.py
+++ b/tests/convnext/test_modeling_convnext.py
@@ -19,8 +19,8 @@ import inspect
 import unittest
 
 from transformers import ConvNextConfig
-from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
diff --git a/tests/convnext/test_modeling_tf_convnext.py b/tests/convnext/test_modeling_tf_convnext.py
index 880e006f1a..edab09fb69 100644
--- a/tests/convnext/test_modeling_tf_convnext.py
+++ b/tests/convnext/test_modeling_tf_convnext.py
@@ -19,8 +19,8 @@ import unittest
 from typing import List, Tuple
 
 from transformers import ConvNextConfig
-from transformers.file_utils import cached_property, is_tf_available, is_vision_available
 from transformers.testing_utils import require_tf, require_vision, slow
+from transformers.utils import cached_property, is_tf_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 7ff1c395b1..acb72886bb 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -24,7 +24,6 @@ from parameterized import parameterized
 from tests.trainer.test_trainer import TrainerIntegrationCommon  # noqa
 from transformers import AutoModel, TrainingArguments, is_torch_available, logging
 from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available
-from transformers.file_utils import WEIGHTS_NAME, is_torch_bf16_available
 from transformers.testing_utils import (
     CaptureLogger,
     CaptureStd,
@@ -40,6 +39,7 @@ from transformers.testing_utils import (
     slow,
 )
 from transformers.trainer_utils import get_last_checkpoint, set_seed
+from transformers.utils import WEIGHTS_NAME, is_torch_bf16_available
 
 
 if is_torch_available():
diff --git a/tests/deit/test_feature_extraction_deit.py b/tests/deit/test_feature_extraction_deit.py
index 94cf791389..8eaa831349 100644
--- a/tests/deit/test_feature_extraction_deit.py
+++ b/tests/deit/test_feature_extraction_deit.py
@@ -18,8 +18,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/deit/test_modeling_deit.py b/tests/deit/test_modeling_deit.py
index fa89bf231d..7558b71565 100644
--- a/tests/deit/test_modeling_deit.py
+++ b/tests/deit/test_modeling_deit.py
@@ -20,9 +20,9 @@ import unittest
 import warnings
 
 from transformers import DeiTConfig
-from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
diff --git a/tests/detr/test_feature_extraction_detr.py b/tests/detr/test_feature_extraction_detr.py
index 372ef06a5c..0f88d57c73 100644
--- a/tests/detr/test_feature_extraction_detr.py
+++ b/tests/detr/test_feature_extraction_detr.py
@@ -20,8 +20,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/detr/test_modeling_detr.py b/tests/detr/test_modeling_detr.py
index 50cd1d5bca..45fb370199 100644
--- a/tests/detr/test_modeling_detr.py
+++ b/tests/detr/test_modeling_detr.py
@@ -20,8 +20,8 @@ import math
 import unittest
 
 from transformers import DetrConfig, is_timm_available, is_vision_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_timm, require_vision, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py
index 3a65f16580..f13753bdcf 100644
--- a/tests/extended/test_trainer_ext.py
+++ b/tests/extended/test_trainer_ext.py
@@ -20,7 +20,6 @@ import unittest
 from unittest.mock import patch
 
 from parameterized import parameterized
-from transformers.file_utils import is_apex_available
 from transformers.integrations import is_fairscale_available
 from transformers.testing_utils import (
     CaptureStderr,
@@ -37,6 +36,7 @@ from transformers.testing_utils import (
 )
 from transformers.trainer_callback import TrainerState
 from transformers.trainer_utils import set_seed
+from transformers.utils import is_apex_available
 
 
 bindir = os.path.abspath(os.path.dirname(__file__))
diff --git a/tests/fsmt/test_modeling_fsmt.py b/tests/fsmt/test_modeling_fsmt.py
index 3ac2b9e3b2..c452a5729b 100644
--- a/tests/fsmt/test_modeling_fsmt.py
+++ b/tests/fsmt/test_modeling_fsmt.py
@@ -20,8 +20,8 @@ import timeout_decorator  # noqa
 
 from parameterized import parameterized
 from transformers import FSMTConfig, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/fsmt/test_tokenization_fsmt.py b/tests/fsmt/test_tokenization_fsmt.py
index b28df8a589..f5e3d4b6cf 100644
--- a/tests/fsmt/test_tokenization_fsmt.py
+++ b/tests/fsmt/test_tokenization_fsmt.py
@@ -18,9 +18,9 @@ import json
 import os
 import unittest
 
-from transformers.file_utils import cached_property
 from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES, FSMTTokenizer
 from transformers.testing_utils import slow
+from transformers.utils import cached_property
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/glpn/test_feature_extraction_glpn.py b/tests/glpn/test_feature_extraction_glpn.py
index 1b21282e64..c903491ce1 100644
--- a/tests/glpn/test_feature_extraction_glpn.py
+++ b/tests/glpn/test_feature_extraction_glpn.py
@@ -18,8 +18,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/gpt_neo/test_modeling_gpt_neo.py b/tests/gpt_neo/test_modeling_gpt_neo.py
index ffb6352baf..4135d7857c 100644
--- a/tests/gpt_neo/test_modeling_gpt_neo.py
+++ b/tests/gpt_neo/test_modeling_gpt_neo.py
@@ -18,8 +18,8 @@
 import unittest
 
 from transformers import GPTNeoConfig, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/imagegpt/test_feature_extraction_imagegpt.py b/tests/imagegpt/test_feature_extraction_imagegpt.py
index ea5859ecc9..dd0fdfa89a 100644
--- a/tests/imagegpt/test_feature_extraction_imagegpt.py
+++ b/tests/imagegpt/test_feature_extraction_imagegpt.py
@@ -22,8 +22,8 @@ import unittest
 import numpy as np
 from datasets import load_dataset
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin
 
diff --git a/tests/imagegpt/test_modeling_imagegpt.py b/tests/imagegpt/test_modeling_imagegpt.py
index 597ccd4137..c570e61927 100644
--- a/tests/imagegpt/test_modeling_imagegpt.py
+++ b/tests/imagegpt/test_modeling_imagegpt.py
@@ -21,8 +21,8 @@ import tempfile
 import unittest
 
 from transformers import ImageGPTConfig
-from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/layoutlmv2/test_feature_extraction_layoutlmv2.py b/tests/layoutlmv2/test_feature_extraction_layoutlmv2.py
index e849a752a9..ca3bbf1cdc 100644
--- a/tests/layoutlmv2/test_feature_extraction_layoutlmv2.py
+++ b/tests/layoutlmv2/test_feature_extraction_layoutlmv2.py
@@ -18,8 +18,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_pytesseract_available, is_torch_available
 from transformers.testing_utils import require_pytesseract, require_torch
+from transformers.utils import is_pytesseract_available, is_torch_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/layoutlmv2/test_modeling_layoutlmv2.py b/tests/layoutlmv2/test_modeling_layoutlmv2.py
index fbf96c2031..708a433989 100644
--- a/tests/layoutlmv2/test_modeling_layoutlmv2.py
+++ b/tests/layoutlmv2/test_modeling_layoutlmv2.py
@@ -20,8 +20,8 @@ import random
 import tempfile
 import unittest
 
-from transformers.file_utils import is_detectron2_available, is_torch_available
 from transformers.testing_utils import require_detectron2, require_torch, slow, torch_device
+from transformers.utils import is_detectron2_available, is_torch_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask
diff --git a/tests/layoutlmv2/test_processor_layoutlmv2.py b/tests/layoutlmv2/test_processor_layoutlmv2.py
index 0098713036..e822d177ca 100644
--- a/tests/layoutlmv2/test_processor_layoutlmv2.py
+++ b/tests/layoutlmv2/test_processor_layoutlmv2.py
@@ -20,10 +20,10 @@ import unittest
 from typing import List
 
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
-from transformers.file_utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available
 from transformers.models.layoutlmv2 import LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast
 from transformers.models.layoutlmv2.tokenization_layoutlmv2 import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_pytesseract, require_tokenizers, require_torch, slow
+from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available
 
 
 if is_pytesseract_available():
diff --git a/tests/layoutxlm/test_processor_layoutxlm.py b/tests/layoutxlm/test_processor_layoutxlm.py
index 82f2daa1df..3964765e32 100644
--- a/tests/layoutxlm/test_processor_layoutxlm.py
+++ b/tests/layoutxlm/test_processor_layoutxlm.py
@@ -21,7 +21,6 @@ from os.path import dirname
 from typing import List
 
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
-from transformers.file_utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available
 from transformers.models.layoutxlm import LayoutXLMTokenizer, LayoutXLMTokenizerFast
 from transformers.testing_utils import (
     require_pytesseract,
@@ -30,6 +29,7 @@ from transformers.testing_utils import (
     require_torch,
     slow,
 )
+from transformers.utils import FEATURE_EXTRACTOR_NAME, cached_property, is_pytesseract_available
 
 
 if is_pytesseract_available():
diff --git a/tests/led/test_modeling_led.py b/tests/led/test_modeling_led.py
index 5312dfc7a4..758834d7fe 100644
--- a/tests/led/test_modeling_led.py
+++ b/tests/led/test_modeling_led.py
@@ -20,9 +20,9 @@ import tempfile
 import unittest
 
 from transformers import LEDConfig, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/m2m_100/test_modeling_m2m_100.py b/tests/m2m_100/test_modeling_m2m_100.py
index 3393764429..52a677618a 100644
--- a/tests/m2m_100/test_modeling_m2m_100.py
+++ b/tests/m2m_100/test_modeling_m2m_100.py
@@ -20,8 +20,8 @@ import tempfile
 import unittest
 
 from transformers import M2M100Config, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/m2m_100/test_tokenization_m2m_100.py b/tests/m2m_100/test_tokenization_m2m_100.py
index 55c2ec11d8..35652f0cb3 100644
--- a/tests/m2m_100/test_tokenization_m2m_100.py
+++ b/tests/m2m_100/test_tokenization_m2m_100.py
@@ -20,8 +20,8 @@ from pathlib import Path
 from shutil import copyfile
 
 from transformers import M2M100Tokenizer, is_torch_available
-from transformers.file_utils import is_sentencepiece_available
 from transformers.testing_utils import nested_simplify, require_sentencepiece, require_tokenizers, require_torch, slow
+from transformers.utils import is_sentencepiece_available
 
 
 if is_sentencepiece_available():
diff --git a/tests/marian/test_modeling_flax_marian.py b/tests/marian/test_modeling_flax_marian.py
index ecd1906f38..bfb0d273ad 100644
--- a/tests/marian/test_modeling_flax_marian.py
+++ b/tests/marian/test_modeling_flax_marian.py
@@ -18,8 +18,8 @@ import numpy as np
 import timeout_decorator  # noqa
 
 from transformers import MarianConfig, is_flax_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_flax, require_sentencepiece, require_tokenizers, slow
+from transformers.utils import cached_property
 
 from ..generation.test_generation_flax_utils import FlaxGenerationTesterMixin
 from ..test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
diff --git a/tests/marian/test_modeling_marian.py b/tests/marian/test_modeling_marian.py
index c2382dd132..067144d8cd 100644
--- a/tests/marian/test_modeling_marian.py
+++ b/tests/marian/test_modeling_marian.py
@@ -19,8 +19,8 @@ import unittest
 
 from huggingface_hub.hf_api import list_models
 from transformers import MarianConfig, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/marian/test_modeling_tf_marian.py b/tests/marian/test_modeling_tf_marian.py
index 23bd9be1fc..eb4f24700b 100644
--- a/tests/marian/test_modeling_tf_marian.py
+++ b/tests/marian/test_modeling_tf_marian.py
@@ -19,8 +19,8 @@ import unittest
 import warnings
 
 from transformers import AutoTokenizer, MarianConfig, MarianTokenizer, TranslationPipeline, is_tf_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, ids_tensor
diff --git a/tests/marian/test_tokenization_marian.py b/tests/marian/test_tokenization_marian.py
index 0bc3c3d214..6b6ee6c966 100644
--- a/tests/marian/test_tokenization_marian.py
+++ b/tests/marian/test_tokenization_marian.py
@@ -21,8 +21,8 @@ from pathlib import Path
 from shutil import copyfile
 
 from transformers import BatchEncoding, MarianTokenizer
-from transformers.file_utils import is_sentencepiece_available, is_tf_available, is_torch_available
 from transformers.testing_utils import require_sentencepiece, slow
+from transformers.utils import is_sentencepiece_available, is_tf_available, is_torch_available
 
 
 if is_sentencepiece_available():
diff --git a/tests/maskformer/test_feature_extraction_maskformer.py b/tests/maskformer/test_feature_extraction_maskformer.py
index ad4b5d6b0c..c44899ebf5 100644
--- a/tests/maskformer/test_feature_extraction_maskformer.py
+++ b/tests/maskformer/test_feature_extraction_maskformer.py
@@ -18,8 +18,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/maskformer/test_modeling_maskformer.py b/tests/maskformer/test_modeling_maskformer.py
index 4f8d47af85..a27fe41a69 100644
--- a/tests/maskformer/test_modeling_maskformer.py
+++ b/tests/maskformer/test_modeling_maskformer.py
@@ -21,8 +21,8 @@ import numpy as np
 
 from tests.test_modeling_common import floats_tensor
 from transformers import DetrConfig, MaskFormerConfig, SwinConfig, is_torch_available, is_vision_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin
diff --git a/tests/mbart/test_modeling_flax_mbart.py b/tests/mbart/test_modeling_flax_mbart.py
index 64e062784d..bd235487d7 100644
--- a/tests/mbart/test_modeling_flax_mbart.py
+++ b/tests/mbart/test_modeling_flax_mbart.py
@@ -18,8 +18,8 @@ import numpy as np
 import timeout_decorator  # noqa
 
 from transformers import MBartConfig, is_flax_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_flax, require_sentencepiece, require_tokenizers, slow
+from transformers.utils import cached_property
 
 from ..generation.test_generation_flax_utils import FlaxGenerationTesterMixin
 from ..test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
diff --git a/tests/mbart/test_modeling_mbart.py b/tests/mbart/test_modeling_mbart.py
index fb297457d9..2037ee79ef 100644
--- a/tests/mbart/test_modeling_mbart.py
+++ b/tests/mbart/test_modeling_mbart.py
@@ -20,8 +20,8 @@ import tempfile
 import unittest
 
 from transformers import MBartConfig, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/mbart/test_modeling_tf_mbart.py b/tests/mbart/test_modeling_tf_mbart.py
index ae1cfe040d..eec59fe1b5 100644
--- a/tests/mbart/test_modeling_tf_mbart.py
+++ b/tests/mbart/test_modeling_tf_mbart.py
@@ -17,8 +17,8 @@ import tempfile
 import unittest
 
 from transformers import AutoTokenizer, MBartConfig, is_tf_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, ids_tensor
diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py
index e6076dd076..aea67d920d 100644
--- a/tests/onnx/test_onnx_v2.py
+++ b/tests/onnx/test_onnx_v2.py
@@ -242,7 +242,7 @@ class OnnxExportTestCaseV2(TestCase):
         onnx_config = onnx_config_class_constructor(model.config)
 
         if is_torch_available():
-            from transformers.file_utils import torch_version
+            from transformers.utils import torch_version
 
             if torch_version < onnx_config.torch_onnx_minimum_version:
                 pytest.skip(
diff --git a/tests/pegasus/test_modeling_pegasus.py b/tests/pegasus/test_modeling_pegasus.py
index 3f8cdcca10..090300de0c 100644
--- a/tests/pegasus/test_modeling_pegasus.py
+++ b/tests/pegasus/test_modeling_pegasus.py
@@ -18,8 +18,8 @@ import tempfile
 import unittest
 
 from transformers import PegasusConfig, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..mbart.test_modeling_mbart import AbstractSeq2SeqIntegrationTest
diff --git a/tests/pegasus/test_modeling_tf_pegasus.py b/tests/pegasus/test_modeling_tf_pegasus.py
index ca0d525267..dd2b8c7d61 100644
--- a/tests/pegasus/test_modeling_tf_pegasus.py
+++ b/tests/pegasus/test_modeling_tf_pegasus.py
@@ -17,8 +17,8 @@ import tempfile
 import unittest
 
 from transformers import AutoTokenizer, PegasusConfig, is_tf_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, ids_tensor
diff --git a/tests/pegasus/test_tokenization_pegasus.py b/tests/pegasus/test_tokenization_pegasus.py
index c189ffdabb..7634902b58 100644
--- a/tests/pegasus/test_tokenization_pegasus.py
+++ b/tests/pegasus/test_tokenization_pegasus.py
@@ -15,8 +15,8 @@
 import unittest
 
 from transformers import PegasusTokenizer, PegasusTokenizerFast
-from transformers.file_utils import cached_property
 from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow
+from transformers.utils import cached_property
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/perceiver/test_modeling_perceiver.py b/tests/perceiver/test_modeling_perceiver.py
index 4d50e41e87..5fd75ab649 100644
--- a/tests/perceiver/test_modeling_perceiver.py
+++ b/tests/perceiver/test_modeling_perceiver.py
@@ -26,9 +26,9 @@ import numpy as np
 from datasets import load_dataset
 
 from transformers import PerceiverConfig
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
diff --git a/tests/perceiver/test_tokenization_perceiver.py b/tests/perceiver/test_tokenization_perceiver.py
index f92add4885..214e6aff38 100644
--- a/tests/perceiver/test_tokenization_perceiver.py
+++ b/tests/perceiver/test_tokenization_perceiver.py
@@ -22,7 +22,7 @@ import unittest
 from typing import Tuple
 
 from transformers import AddedToken, BatchEncoding, PerceiverTokenizer
-from transformers.file_utils import cached_property, is_tf_available, is_torch_available
+from transformers.utils import cached_property, is_tf_available, is_torch_available
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/plbart/test_modeling_plbart.py b/tests/plbart/test_modeling_plbart.py
index 3353b09b5a..6a307e244e 100644
--- a/tests/plbart/test_modeling_plbart.py
+++ b/tests/plbart/test_modeling_plbart.py
@@ -20,8 +20,8 @@ import tempfile
 import unittest
 
 from transformers import PLBartConfig, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/poolformer/test_feature_extraction_poolformer.py b/tests/poolformer/test_feature_extraction_poolformer.py
index 5c61642830..5fd830a7ff 100644
--- a/tests/poolformer/test_feature_extraction_poolformer.py
+++ b/tests/poolformer/test_feature_extraction_poolformer.py
@@ -17,8 +17,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/rag/test_modeling_rag.py b/tests/rag/test_modeling_rag.py
index ecd431a5a1..a7d1288f51 100644
--- a/tests/rag/test_modeling_rag.py
+++ b/tests/rag/test_modeling_rag.py
@@ -25,7 +25,6 @@ from unittest.mock import patch
 import numpy as np
 
 from transformers import BartTokenizer, T5Tokenizer
-from transformers.file_utils import cached_property, is_datasets_available, is_faiss_available, is_torch_available
 from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
 from transformers.models.dpr.tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer
 from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
@@ -37,6 +36,7 @@ from transformers.testing_utils import (
     slow,
     torch_device,
 )
+from transformers.utils import cached_property, is_datasets_available, is_faiss_available, is_torch_available
 
 from ..bart.test_modeling_bart import BartModelTester
 from ..dpr.test_modeling_dpr import DPRModelTester
diff --git a/tests/rag/test_modeling_tf_rag.py b/tests/rag/test_modeling_tf_rag.py
index 7f201e7638..d9050acb63 100644
--- a/tests/rag/test_modeling_tf_rag.py
+++ b/tests/rag/test_modeling_tf_rag.py
@@ -8,11 +8,11 @@ from unittest.mock import patch
 import numpy as np
 
 from transformers import BartTokenizer
-from transformers.file_utils import cached_property, is_datasets_available, is_faiss_available, is_tf_available
 from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
 from transformers.models.dpr.tokenization_dpr import DPRQuestionEncoderTokenizer
 from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
 from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
+from transformers.utils import cached_property, is_datasets_available, is_faiss_available, is_tf_available
 
 
 if is_tf_available() and is_datasets_available() and is_faiss_available():
diff --git a/tests/rag/test_tokenization_rag.py b/tests/rag/test_tokenization_rag.py
index 3e19a9636b..ae99092484 100644
--- a/tests/rag/test_tokenization_rag.py
+++ b/tests/rag/test_tokenization_rag.py
@@ -19,12 +19,12 @@ import tempfile
 from unittest import TestCase
 
 from transformers import BartTokenizer, BartTokenizerFast, DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast
-from transformers.file_utils import is_datasets_available, is_faiss_available, is_torch_available
 from transformers.models.bart.configuration_bart import BartConfig
 from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
 from transformers.models.dpr.configuration_dpr import DPRConfig
 from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
 from transformers.testing_utils import require_faiss, require_tokenizers, require_torch, slow
+from transformers.utils import is_datasets_available, is_faiss_available, is_torch_available
 
 
 if is_torch_available() and is_datasets_available() and is_faiss_available():
diff --git a/tests/reformer/test_tokenization_reformer.py b/tests/reformer/test_tokenization_reformer.py
index 3ca818d1cd..22e6e455e6 100644
--- a/tests/reformer/test_tokenization_reformer.py
+++ b/tests/reformer/test_tokenization_reformer.py
@@ -17,8 +17,8 @@ import unittest
 from os.path import dirname
 
 from transformers import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow
+from transformers.utils import cached_property
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/resnet/test_modeling_resnet.py b/tests/resnet/test_modeling_resnet.py
index 85367155f2..5880caa5e4 100644
--- a/tests/resnet/test_modeling_resnet.py
+++ b/tests/resnet/test_modeling_resnet.py
@@ -19,8 +19,8 @@ import inspect
 import unittest
 
 from transformers import ResNetConfig
-from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
diff --git a/tests/segformer/test_feature_extraction_segformer.py b/tests/segformer/test_feature_extraction_segformer.py
index 6aa29e935d..c34cb6fc0d 100644
--- a/tests/segformer/test_feature_extraction_segformer.py
+++ b/tests/segformer/test_feature_extraction_segformer.py
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 from datasets import load_dataset
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/speech_to_text/test_modeling_speech_to_text.py b/tests/speech_to_text/test_modeling_speech_to_text.py
index 24b03177fb..03f2d81166 100644
--- a/tests/speech_to_text/test_modeling_speech_to_text.py
+++ b/tests/speech_to_text/test_modeling_speech_to_text.py
@@ -21,7 +21,6 @@ import tempfile
 import unittest
 
 from transformers import Speech2TextConfig
-from transformers.file_utils import cached_property
 from transformers.testing_utils import (
     is_torch_available,
     require_sentencepiece,
@@ -31,6 +30,7 @@ from transformers.testing_utils import (
     slow,
     torch_device,
 )
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/speech_to_text/test_modeling_tf_speech_to_text.py b/tests/speech_to_text/test_modeling_tf_speech_to_text.py
index 0bbb0fe4ae..d8d538fe19 100644
--- a/tests/speech_to_text/test_modeling_tf_speech_to_text.py
+++ b/tests/speech_to_text/test_modeling_tf_speech_to_text.py
@@ -18,8 +18,8 @@ import inspect
 import unittest
 
 from transformers import Speech2TextConfig
-from transformers.file_utils import cached_property, is_tf_available
 from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
+from transformers.utils import cached_property, is_tf_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
diff --git a/tests/speech_to_text/test_processor_speech_to_text.py b/tests/speech_to_text/test_processor_speech_to_text.py
index 93275d3a56..05871a2bb0 100644
--- a/tests/speech_to_text/test_processor_speech_to_text.py
+++ b/tests/speech_to_text/test_processor_speech_to_text.py
@@ -21,9 +21,9 @@ from pathlib import Path
 from shutil import copyfile
 
 from transformers import Speech2TextTokenizer, is_speech_available
-from transformers.file_utils import FEATURE_EXTRACTOR_NAME
 from transformers.models.speech_to_text.tokenization_speech_to_text import VOCAB_FILES_NAMES, save_json
 from transformers.testing_utils import require_sentencepiece, require_torch, require_torchaudio
+from transformers.utils import FEATURE_EXTRACTOR_NAME
 
 from .test_feature_extraction_speech_to_text import floats_list
 
diff --git a/tests/swin/test_modeling_swin.py b/tests/swin/test_modeling_swin.py
index 373e812d5f..414335ea4e 100644
--- a/tests/swin/test_modeling_swin.py
+++ b/tests/swin/test_modeling_swin.py
@@ -19,8 +19,8 @@ import inspect
 import unittest
 
 from transformers import SwinConfig
-from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
diff --git a/tests/t5/test_modeling_t5.py b/tests/t5/test_modeling_t5.py
index c48a0efa4c..e4bdf8ad30 100644
--- a/tests/t5/test_modeling_t5.py
+++ b/tests/t5/test_modeling_t5.py
@@ -19,8 +19,8 @@ import tempfile
 import unittest
 
 from transformers import T5Config, is_torch_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..generation.test_generation_utils import GenerationTesterMixin
 from ..test_configuration_common import ConfigTester
diff --git a/tests/t5/test_modeling_tf_t5.py b/tests/t5/test_modeling_tf_t5.py
index f6cead8150..a2ea255fac 100644
--- a/tests/t5/test_modeling_tf_t5.py
+++ b/tests/t5/test_modeling_tf_t5.py
@@ -16,8 +16,8 @@
 import unittest
 
 from transformers import T5Config, is_tf_available
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, ids_tensor
diff --git a/tests/t5/test_tokenization_t5.py b/tests/t5/test_tokenization_t5.py
index 2683e5cbf1..2deaa21f3a 100644
--- a/tests/t5/test_tokenization_t5.py
+++ b/tests/t5/test_tokenization_t5.py
@@ -18,8 +18,8 @@ import tempfile
 import unittest
 
 from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
-from transformers.file_utils import cached_property, is_tf_available, is_torch_available
 from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
+from transformers.utils import cached_property, is_tf_available, is_torch_available
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/tapas/test_modeling_tapas.py b/tests/tapas/test_modeling_tapas.py
index 17f9a9096a..84fb0b4b3e 100644
--- a/tests/tapas/test_modeling_tapas.py
+++ b/tests/tapas/test_modeling_tapas.py
@@ -31,9 +31,9 @@ from transformers import (
     TapasConfig,
     is_torch_available,
 )
-from transformers.file_utils import cached_property
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_scatter, require_torch, slow, torch_device
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
diff --git a/tests/tapas/test_modeling_tf_tapas.py b/tests/tapas/test_modeling_tf_tapas.py
index 9481c24857..936273a6ca 100644
--- a/tests/tapas/test_modeling_tf_tapas.py
+++ b/tests/tapas/test_modeling_tf_tapas.py
@@ -33,9 +33,9 @@ from transformers import (
     TapasTokenizer,
     is_tf_available,
 )
-from transformers.file_utils import cached_property
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_tensorflow_probability, require_tf, slow
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, ids_tensor
diff --git a/tests/test_feature_extraction_common.py b/tests/test_feature_extraction_common.py
index 098d982b14..3551d34d61 100644
--- a/tests/test_feature_extraction_common.py
+++ b/tests/test_feature_extraction_common.py
@@ -24,8 +24,8 @@ from pathlib import Path
 from huggingface_hub import Repository, delete_repo, login
 from requests.exceptions import HTTPError
 from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import PASS, USER, is_staging_test
+from transformers.utils import is_torch_available, is_vision_available
 
 
 sys.path.append(str(Path(__file__).parent.parent / "utils"))
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index d3e00d7fbf..7cc7114e79 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -40,7 +40,6 @@ from transformers import (
     is_torch_available,
     logging,
 )
-from transformers.file_utils import WEIGHTS_NAME, is_flax_available, is_torch_fx_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import (
     PASS,
@@ -55,6 +54,7 @@ from transformers.testing_utils import (
     slow,
     torch_device,
 )
+from transformers.utils import WEIGHTS_NAME, is_flax_available, is_torch_fx_available
 
 
 sys.path.append(str(Path(__file__).parent.parent / "utils"))
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index a1a8a88c70..ec044ea1da 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -39,7 +39,6 @@ from transformers import (
     is_torch_available,
     logging,
 )
-from transformers.file_utils import WEIGHTS_NAME, is_apex_available
 from transformers.testing_utils import (
     ENDPOINT_STAGING,
     PASS,
@@ -66,6 +65,7 @@ from transformers.testing_utils import (
 )
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
 from transformers.training_args import OptimizerNames
+from transformers.utils import WEIGHTS_NAME, is_apex_available
 from transformers.utils.hp_naming import TrialShortNamer
 
 
diff --git a/tests/trainer/test_trainer_seq2seq.py b/tests/trainer/test_trainer_seq2seq.py
index dc4a11260d..6212dea1f7 100644
--- a/tests/trainer/test_trainer_seq2seq.py
+++ b/tests/trainer/test_trainer_seq2seq.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 
 from transformers import BertTokenizer, EncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments
-from transformers.file_utils import is_datasets_available
 from transformers.testing_utils import TestCasePlus, require_torch, slow
+from transformers.utils import is_datasets_available
 
 
 if is_datasets_available():
diff --git a/tests/trainer/test_trainer_utils.py b/tests/trainer/test_trainer_utils.py
index 8fe8d2e1d2..db060641cb 100644
--- a/tests/trainer/test_trainer_utils.py
+++ b/tests/trainer/test_trainer_utils.py
@@ -18,8 +18,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_torch_available
 from transformers.testing_utils import require_torch
+from transformers.utils import is_torch_available
 
 
 if is_torch_available():
diff --git a/tests/utils/test_add_new_model_like.py b/tests/utils/test_add_new_model_like.py
index edd84dfe98..3d88552a6c 100644
--- a/tests/utils/test_add_new_model_like.py
+++ b/tests/utils/test_add_new_model_like.py
@@ -874,7 +874,7 @@ NEW_BERT_CONSTANT = "value"
         test_init = """
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
 
 _import_structure = {
     "configuration_gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2OnnxConfig"],
@@ -918,7 +918,7 @@ else:
         init_no_tokenizer = """
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
 
 _import_structure = {
     "configuration_gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2OnnxConfig"],
@@ -954,7 +954,7 @@ else:
         init_pt_only = """
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
+from ...utils import _LazyModule, is_tokenizers_available, is_torch_available
 
 _import_structure = {
     "configuration_gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2OnnxConfig"],
@@ -986,7 +986,7 @@ else:
         init_pt_only_no_tokenizer = """
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 _import_structure = {
     "configuration_gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2OnnxConfig"],
@@ -1026,7 +1026,7 @@ else:
         test_init = """
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vision_available
 
 _import_structure = {
     "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
@@ -1068,7 +1068,7 @@ else:
         init_no_feature_extractor = """
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
+from ...utils import _LazyModule, is_flax_available, is_tf_available, is_torch_available
 
 _import_structure = {
     "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
@@ -1104,7 +1104,7 @@ else:
         init_pt_only = """
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available, is_vision_available
+from ...utils import _LazyModule, is_torch_available, is_vision_available
 
 _import_structure = {
     "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
@@ -1134,7 +1134,7 @@ else:
         init_pt_only_no_feature_extractor = """
 from typing import TYPE_CHECKING
 
-from ...file_utils import _LazyModule, is_torch_available
+from ...utils import _LazyModule, is_torch_available
 
 _import_structure = {
     "configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
diff --git a/tests/utils/test_file_utils.py b/tests/utils/test_file_utils.py
index 768dda263d..decc7fd17c 100644
--- a/tests/utils/test_file_utils.py
+++ b/tests/utils/test_file_utils.py
@@ -24,7 +24,8 @@ import transformers
 
 # Try to import everything from transformers to ensure every object can be loaded.
 from transformers import *  # noqa F406
-from transformers.file_utils import (
+from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER
+from transformers.utils import (
     CONFIG_NAME,
     FLAX_WEIGHTS_NAME,
     TF2_WEIGHTS_NAME,
@@ -39,7 +40,6 @@ from transformers.file_utils import (
     has_file,
     hf_bucket_url,
 )
-from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER
 
 
 MODEL_ID = DUMMY_UNKNOWN_IDENTIFIER
diff --git a/tests/utils/test_model_output.py b/tests/utils/test_model_output.py
index 381f9760a5..9fe3e32a99 100644
--- a/tests/utils/test_model_output.py
+++ b/tests/utils/test_model_output.py
@@ -17,7 +17,7 @@ import unittest
 from dataclasses import dataclass
 from typing import Optional
 
-from transformers.file_utils import ModelOutput
+from transformers.utils import ModelOutput
 
 
 @dataclass
diff --git a/tests/van/test_modeling_van.py b/tests/van/test_modeling_van.py
index 2b550fd325..fe21c83c20 100644
--- a/tests/van/test_modeling_van.py
+++ b/tests/van/test_modeling_van.py
@@ -20,8 +20,8 @@ import math
 import unittest
 
 from transformers import VanConfig
-from transformers.file_utils import cached_property, is_scipy_available, is_torch_available, is_vision_available
 from transformers.testing_utils import require_scipy, require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property, is_scipy_available, is_torch_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
diff --git a/tests/vilt/test_feature_extraction_vilt.py b/tests/vilt/test_feature_extraction_vilt.py
index ed22ebbb07..7c82e63eaf 100644
--- a/tests/vilt/test_feature_extraction_vilt.py
+++ b/tests/vilt/test_feature_extraction_vilt.py
@@ -18,8 +18,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/vilt/test_modeling_vilt.py b/tests/vilt/test_modeling_vilt.py
index 354b694c66..2ddf9c3455 100644
--- a/tests/vilt/test_modeling_vilt.py
+++ b/tests/vilt/test_modeling_vilt.py
@@ -20,9 +20,9 @@ from datasets import load_dataset
 from packaging import version
 
 from transformers import ViltConfig, is_torch_available, is_vision_available
-from transformers.file_utils import cached_property
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
diff --git a/tests/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
index 39967a271b..90c61ab185 100644
--- a/tests/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
+++ b/tests/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
@@ -20,8 +20,8 @@ import unittest
 from datasets import load_dataset
 from packaging import version
 
-from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ..bart.test_modeling_bart import BartModelTester
 from ..bert.test_modeling_bert import BertModelTester
diff --git a/tests/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py b/tests/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py
index f176678670..27893a3d3b 100644
--- a/tests/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py
+++ b/tests/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py
@@ -21,7 +21,6 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_flax_available, is_torch_available, is_vision_available
 from transformers.testing_utils import (
     is_pt_flax_cross_test,
     require_flax,
@@ -30,6 +29,7 @@ from transformers.testing_utils import (
     slow,
     torch_device,
 )
+from transformers.utils import is_flax_available, is_torch_available, is_vision_available
 
 from ..bert.test_modeling_flax_bert import FlaxBertModelTester
 from ..clip.test_modeling_flax_clip import FlaxCLIPVisionModelTester
diff --git a/tests/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py b/tests/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py
index a84e4d6171..a5fd8eb911 100644
--- a/tests/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py
+++ b/tests/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py
@@ -21,8 +21,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_flax_available, is_torch_available, is_vision_available
 from transformers.testing_utils import is_pt_flax_cross_test, require_torch, require_vision, slow, torch_device
+from transformers.utils import is_flax_available, is_torch_available, is_vision_available
 
 from ..bert.test_modeling_bert import BertModelTester
 from ..clip.test_modeling_clip import CLIPVisionModelTester
diff --git a/tests/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py b/tests/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py
index ed23e1659a..fa06181bdf 100644
--- a/tests/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py
+++ b/tests/vision_text_dual_encoder/test_processor_vision_text_dual_encoder.py
@@ -21,9 +21,9 @@ import unittest
 import numpy as np
 
 from transformers import BertTokenizerFast
-from transformers.file_utils import FEATURE_EXTRACTOR_NAME, is_vision_available
 from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES, BertTokenizer
 from transformers.testing_utils import require_tokenizers, require_vision
+from transformers.utils import FEATURE_EXTRACTOR_NAME, is_vision_available
 
 
 if is_vision_available():
diff --git a/tests/vit/test_feature_extraction_vit.py b/tests/vit/test_feature_extraction_vit.py
index 67135d4d2e..df722d74bb 100644
--- a/tests/vit/test_feature_extraction_vit.py
+++ b/tests/vit/test_feature_extraction_vit.py
@@ -18,8 +18,8 @@ import unittest
 
 import numpy as np
 
-from transformers.file_utils import is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
 
 from ..test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_image_inputs
 
diff --git a/tests/vit/test_modeling_tf_vit.py b/tests/vit/test_modeling_tf_vit.py
index 74408fea36..f40580d733 100644
--- a/tests/vit/test_modeling_tf_vit.py
+++ b/tests/vit/test_modeling_tf_vit.py
@@ -21,8 +21,8 @@ import tempfile
 import unittest
 
 from transformers import ViTConfig
-from transformers.file_utils import cached_property, is_tf_available, is_vision_available
 from transformers.testing_utils import require_tf, require_vision, slow, tooslow
+from transformers.utils import cached_property, is_tf_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor
diff --git a/tests/vit/test_modeling_vit.py b/tests/vit/test_modeling_vit.py
index f6e0a314f2..a6f167b48b 100644
--- a/tests/vit/test_modeling_vit.py
+++ b/tests/vit/test_modeling_vit.py
@@ -19,8 +19,8 @@ import inspect
 import unittest
 
 from transformers import ViTConfig
-from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
diff --git a/tests/vit_mae/test_modeling_vit_mae.py b/tests/vit_mae/test_modeling_vit_mae.py
index 6a218b5022..5efb22c3c3 100644
--- a/tests/vit_mae/test_modeling_vit_mae.py
+++ b/tests/vit_mae/test_modeling_vit_mae.py
@@ -23,8 +23,8 @@ import unittest
 import numpy as np
 
 from transformers import ViTMAEConfig
-from transformers.file_utils import cached_property, is_torch_available, is_vision_available
 from transformers.testing_utils import require_torch, require_vision, slow, torch_device
+from transformers.utils import cached_property, is_torch_available, is_vision_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
diff --git a/tests/wav2vec2/test_modeling_tf_wav2vec2.py b/tests/wav2vec2/test_modeling_tf_wav2vec2.py
index 110e0717b4..6da315e789 100644
--- a/tests/wav2vec2/test_modeling_tf_wav2vec2.py
+++ b/tests/wav2vec2/test_modeling_tf_wav2vec2.py
@@ -26,8 +26,8 @@ from datasets import load_dataset
 
 from huggingface_hub import snapshot_download
 from transformers import Wav2Vec2Config, is_tf_available
-from transformers.file_utils import is_librosa_available, is_pyctcdecode_available
 from transformers.testing_utils import require_librosa, require_pyctcdecode, require_tf, slow
+from transformers.utils import is_librosa_available, is_pyctcdecode_available
 
 from ..test_configuration_common import ConfigTester
 from ..test_modeling_tf_common import TFModelTesterMixin, ids_tensor
diff --git a/tests/wav2vec2/test_processor_wav2vec2.py b/tests/wav2vec2/test_processor_wav2vec2.py
index 924d64bee9..8b7188f8eb 100644
--- a/tests/wav2vec2/test_processor_wav2vec2.py
+++ b/tests/wav2vec2/test_processor_wav2vec2.py
@@ -18,9 +18,9 @@ import shutil
 import tempfile
 import unittest
 
-from transformers.file_utils import FEATURE_EXTRACTOR_NAME
 from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
 from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
+from transformers.utils import FEATURE_EXTRACTOR_NAME
 
 from .test_feature_extraction_wav2vec2 import floats_list
 
diff --git a/tests/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
index ae1159dc9b..c2a19cc39a 100644
--- a/tests/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
+++ b/tests/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
@@ -25,10 +25,10 @@ import numpy as np
 from datasets import load_dataset
 
 from transformers import AutoProcessor
-from transformers.file_utils import FEATURE_EXTRACTOR_NAME, is_pyctcdecode_available, is_torch_available
 from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
 from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_pyctcdecode, require_torch, require_torchaudio, slow
+from transformers.utils import FEATURE_EXTRACTOR_NAME, is_pyctcdecode_available, is_torch_available
 
 from ..wav2vec2.test_feature_extraction_wav2vec2 import floats_list
 
diff --git a/tests/xglm/test_tokenization_xglm.py b/tests/xglm/test_tokenization_xglm.py
index a86de6a2ff..f7b2708582 100644
--- a/tests/xglm/test_tokenization_xglm.py
+++ b/tests/xglm/test_tokenization_xglm.py
@@ -20,8 +20,8 @@ import tempfile
 import unittest
 
 from transformers import SPIECE_UNDERLINE, XGLMTokenizer, XGLMTokenizerFast
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
+from transformers.utils import cached_property
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/xlm_prophetnet/test_tokenization_xlm_prophetnet.py b/tests/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
index 317fde014c..c8f7568763 100644
--- a/tests/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
+++ b/tests/xlm_prophetnet/test_tokenization_xlm_prophetnet.py
@@ -17,9 +17,9 @@ import os
 import unittest
 from os.path import dirname
 
-from transformers.file_utils import cached_property
 from transformers.models.xlm_prophetnet.tokenization_xlm_prophetnet import SPIECE_UNDERLINE, XLMProphetNetTokenizer
 from transformers.testing_utils import require_sentencepiece, slow
+from transformers.utils import cached_property
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/tests/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/xlm_roberta/test_tokenization_xlm_roberta.py
index 5266a2a28a..0ba1492efc 100644
--- a/tests/xlm_roberta/test_tokenization_xlm_roberta.py
+++ b/tests/xlm_roberta/test_tokenization_xlm_roberta.py
@@ -21,8 +21,8 @@ import unittest
 from os.path import dirname
 
 from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
-from transformers.file_utils import cached_property
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow
+from transformers.utils import cached_property
 
 from ..test_tokenization_common import TokenizerTesterMixin
 
diff --git a/utils/check_dummies.py b/utils/check_dummies.py
index e082b4b859..c1625036c4 100644
--- a/utils/check_dummies.py
+++ b/utils/check_dummies.py
@@ -113,7 +113,7 @@ def create_dummy_files():
         backend_name = "[" + ", ".join(f'"{b}"' for b in backend.split("_and_")) + "]"
         dummy_file = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n"
         dummy_file += "# flake8: noqa\n"
-        dummy_file += "from ..file_utils import DummyObject, requires_backends\n\n"
+        dummy_file += "from ..utils import DummyObject, requires_backends\n\n"
         dummy_file += "\n".join([create_dummy_object(o, backend_name) for o in objects])
         dummy_files[backend] = dummy_file
 
diff --git a/utils/check_repo.py b/utils/check_repo.py
index b20cf7c323..482937bddd 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -23,8 +23,8 @@ from difflib import get_close_matches
 from pathlib import Path
 
 from transformers import is_flax_available, is_tf_available, is_torch_available
-from transformers.file_utils import ENV_VARS_TRUE_VALUES
 from transformers.models.auto import get_values
+from transformers.utils import ENV_VARS_TRUE_VALUES
 
 
 # All paths are set with the intent you should run this script from the root of the repo with the command