Compare commits

..

6 Commits

Author SHA1 Message Date
Patrick von Platen
a5fc34437d [Wav2Vec2] Fix dtype 64 bug (#13517)
Some checks failed
Release - Conda / build_and_package (push) Has been cancelled
* fix

* 2nd fix
2021-09-10 18:20:57 +02:00
Patrick von Platen
2c51442fef Release: v4.10.2 2021-09-10 18:20:33 +02:00
patrickvonplaten
28e278728d Release: 4.10.1
Some checks failed
Release - Conda / build_and_package (push) Has been cancelled
2021-09-10 16:12:44 +02:00
Nicolas Patry
e5e0452c29 Fixing #13381 (#13400)
* Fixing #13381

* Enabling automatic LED models.
2021-09-10 16:11:33 +02:00
Nicolas Patry
4afbd7ebf3 Fixing backward compatiblity for non prefixed tokens (B-, I-). (#13493) 2021-09-10 16:11:25 +02:00
Patrick von Platen
60eb416a13 [Wav2Vec2] Fix normalization for non-padded tensors (#13512)
* finalize

* Apply suggestions from code review

* finish cleaner implementation

* more tests

* small fix

* finish

* up
2021-09-10 16:11:13 +02:00
396 changed files with 3743 additions and 28288 deletions

View File

@@ -557,55 +557,24 @@ jobs:
keys:
- v0.4-torch_examples-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
- run: pip install .[sklearn,torch,sentencepiece,testing]
- run: pip install -r examples/pytorch/_tests_requirements.txt
- save_cache:
key: v0.4-torch_examples-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
- run: python utils/tests_fetcher.py | tee test_preparation.txt
- store_artifacts:
path: ~/transformers/test_preparation.txt
- run: |
if [ -f test_list.txt ]; then
python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee tests_output.txt
TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
fi
- store_artifacts:
path: ~/transformers/examples_output.txt
- store_artifacts:
path: ~/transformers/reports
run_examples_torch_all:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
environment:
OMP_NUM_THREADS: 1
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
- checkout
- restore_cache:
keys:
- v0.4-torch_examples-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
- run: pip install -r examples/pytorch/_tests_requirements.txt
- save_cache:
key: v0.4-torch_examples-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: |
TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
- store_artifacts:
path: ~/transformers/examples_output.txt
- store_artifacts:
path: ~/transformers/reports
run_tests_hub:
working_directory: ~/transformers
@@ -742,7 +711,7 @@ jobs:
build_doc:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
- image: circleci/python:3.7.11
resource_class: large
steps:
- checkout
@@ -764,7 +733,7 @@ jobs:
deploy_doc:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
- image: circleci/python:3.7.11
resource_class: large
steps:
- add_ssh_keys:
@@ -932,7 +901,6 @@ workflows:
only:
- master
jobs:
- run_examples_torch_all
- run_tests_torch_and_tf_all
- run_tests_torch_and_flax_all
- run_tests_torch_all

View File

@@ -70,6 +70,4 @@ deploy_doc "1366172" v4.8.1
deploy_doc "96d1cfb" v4.8.2
deploy_doc "72aee83" v4.9.0
deploy_doc "bff1c71" v4.9.1
deploy_doc "41981a2" v4.9.2
deploy_doc "39cb6f5" v4.10.0
deploy_doc "28e2787" # v4.10.1 Latest stable release
deploy_doc "41981a2" # v4.9.2 Latest stable release

View File

@@ -47,8 +47,6 @@ jobs:
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
make style
python utils/check_table.py --fix_and_overwrite
python utils/check_dummies.py --fix_and_overwrite

View File

@@ -1,257 +0,0 @@
name: Self-hosted runner; Nightly (scheduled)
on:
push:
branches:
- nightly_ci*
repository_dispatch:
schedule:
- cron: "0 0 */3 * *"
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
RUN_SLOW: yes
OMP_NUM_THREADS: 16
MKL_NUM_THREADS: 16
PYTEST_TIMEOUT: 600
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
jobs:
run_all_tests_torch_gpu:
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Launcher docker
uses: actions/checkout@v2
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Install dependencies
run: |
apt -y update && apt install -y libsndfile1-dev git
pip install --upgrade pip
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
- name: Are GPUs recognized by our DL frameworks
run: |
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all tests on GPU
run: |
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_gpu_failures_short.txt
- name: Run examples tests on GPU
if: ${{ always() }}
env:
OMP_NUM_THREADS: 16
MKL_NUM_THREADS: 16
RUN_SLOW: yes
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
run: |
pip install -r examples/pytorch/_tests_requirements.txt
python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples
- name: Failure short reports
if: ${{ always() }}
run: cat reports/examples_torch_gpu_failures_short.txt
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
RUN_PIPELINE_TESTS: yes
run: |
python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_torch_gpu_test_reports
path: reports
run_all_tests_torch_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Launcher docker
uses: actions/checkout@v2
- name: NVIDIA-SMI
continue-on-error: true
run: |
nvidia-smi
- name: Install dependencies
run: |
apt -y update && apt install -y libsndfile1-dev git
pip install --upgrade pip
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
- name: Are GPUs recognized by our DL frameworks
run: |
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all tests on GPU
env:
MKL_SERVICE_FORCE_INTEL: 1
run: |
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_multi_gpu_failures_short.txt
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
RUN_PIPELINE_TESTS: yes
run: |
python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_torch_multi_gpu_test_reports
path: reports
run_all_tests_torch_cuda_extensions_gpu:
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: nvcr.io/nvidia/pytorch:21.03-py3
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Launcher docker
uses: actions/checkout@v2
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Install dependencies
run: |
apt -y update && apt install -y libaio-dev
pip install --upgrade pip
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
pip install .[testing,deepspeed]
pip install git+https://github.com/microsoft/DeepSpeed
- name: Are GPUs recognized by our DL frameworks
run: |
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all tests on GPU
run: |
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_tests_torch_cuda_extensions_gpu_test_reports
path: reports
run_all_tests_torch_cuda_extensions_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: nvcr.io/nvidia/pytorch:21.03-py3
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Launcher docker
uses: actions/checkout@v2
- name: NVIDIA-SMI
continue-on-error: true
run: |
nvidia-smi
- name: Install dependencies
run: |
apt -y update && apt install -y libaio-dev
pip install --upgrade pip
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
pip install .[testing,deepspeed,fairscale]
pip install git+https://github.com/microsoft/DeepSpeed
- name: Are GPUs recognized by our DL frameworks
run: |
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all tests on GPU
run: |
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
path: reports
send_results:
name: Send results to webhook
runs-on: ubuntu-latest
if: always()
needs: [
run_all_tests_torch_gpu,
run_all_tests_torch_multi_gpu,
run_all_tests_torch_cuda_extensions_gpu,
run_all_tests_torch_cuda_extensions_multi_gpu
]
steps:
- uses: actions/checkout@v2
- uses: actions/download-artifact@v2
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
run: |
pip install slack_sdk
python utils/notification_service.py scheduled nightly-torch

View File

@@ -79,7 +79,7 @@ jobs:
path: reports
run_tests_flax_gpu:
runs-on: [self-hosted, docker-gpu-test, single-gpu]
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

View File

@@ -15,7 +15,6 @@ env:
OMP_NUM_THREADS: 16
MKL_NUM_THREADS: 16
PYTEST_TIMEOUT: 600
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
jobs:
run_all_tests_torch_gpu:
@@ -141,7 +140,7 @@ jobs:
- name: Install dependencies
run: |
apt -y update && apt install -y libsndfile1-dev git
apt -y update && apt install -y git
pip install --upgrade pip
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]
@@ -252,7 +251,7 @@ jobs:
- name: Install dependencies
run: |
apt -y update && apt install -y libsndfile1-dev git
apt -y update && apt install -y git
pip install --upgrade pip
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

View File

@@ -235,17 +235,14 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FNet](https://huggingface.co/transformers/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
@@ -253,7 +250,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
@@ -261,19 +258,16 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/model_doc/speechencoderdecoder.html)**
1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/model_doc/speech_to_text_2.html)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.

View File

@@ -235,11 +235,10 @@ conda install -c huggingface transformers
1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。
1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。
1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。
1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (来自 Microsoft) 伴随论文 [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) 由 Hangbo Bao, Li Dong, Furu Wei 发布。
1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (来自 Google) 伴随论文 [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) 由 Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova 发布。
1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (来自 Google) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。
@@ -256,21 +255,18 @@ conda install -c huggingface transformers
1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。
1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。
1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) 和德语版 DistilBERT。
1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。
1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval
for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon
Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。
1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。
1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
1. **[FNet](https://huggingface.co/transformers/master/model_doc/fnet.html)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。
1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。
1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。
1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。
1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) 由 Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei 发布。
1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。
@@ -283,19 +279,14 @@ conda install -c huggingface transformers
1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。
1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。
1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。
1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。
1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/master/model_doc/speechencoderdecoder.html)**
1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (来自 Facebook), 伴随论文 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 发布。
1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/master/model_doc/speech_to_text_2.html)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。
1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。
1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。
1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。

View File

@@ -247,11 +247,10 @@ conda install -c huggingface transformers
1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
@@ -268,26 +267,23 @@ conda install -c huggingface transformers
1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FNet](https://huggingface.co/transformers/master/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
@@ -295,19 +291,14 @@ conda install -c huggingface transformers
1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/master/model_doc/speechencoderdecoder.html)**
1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/master/model_doc/speech_to_text_2.html)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.

View File

@@ -1,11 +1,10 @@
// These two things need to be updated at each release for the version selector.
// Last stable version
const stableVersion = "v4.10.1"
const stableVersion = "v4.9.2"
// Dictionary doc folder to label. The last stable version should have an empty key.
const versionMapping = {
"master": "master",
"": "v4.10.0/v4.10.1 (stable)",
"v4.9.2": "v4.9.0/v4.9.1/v4.9.2",
"": "v4.9.0/v4.9.1/v4.9.2 (stable)",
"v4.8.2": "v4.8.0/v4.8.1/v4.8.2",
"v4.7.0": "v4.7.0",
"v4.6.0": "v4.6.0",

View File

@@ -1,143 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
How to add a pipeline to 🤗 Transformers?
=======================================================================================================================
First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
dictionnaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as
possible as it makes compatibility easier (even through other languages via JSON). Those will be the :obj:`inputs` of
the pipeline (:obj:`preprocess`).
Then define the :obj:`outputs`. Same policy as the :obj:`inputs`. The simpler, the better. Those will be the outputs of
:obj:`postprocess` method.
Start by inheriting the base class :obj:`Pipeline`. with the 4 methods needed to implement :obj:`preprocess`,
:obj:`_forward`, :obj:`postprocess` and :obj:`_sanitize_parameters`.
.. code-block::
from transformers import Pipeline
class MyPipeline(Pipeline):
def _sanitize_parameters(self, **kwargs)
preprocess_kwargs = {}
if "maybe_arg" in kwargs:
preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
return preprocess_kwargs, {}, {}
def preprocess(self, inputs, maybe_arg=2)
model_input = Tensor(....)
return {"model_input": model_input}
def _forward(self, model_inputs)
# model_inputs == {"model_input": model_input}
oututs = self.model(**model_inputs)
# Maybe {"logits": Tensor(...)}
return outputs
def postprocess(self, model_outputs)
best_class = model_outputs["logits"].softmax(-1)
return best_class
The structure of this breakdown is to support relatively seemless support for CPU/GPU, while supporting doing
pre/postprocessing on the CPU on different threads
:obj:`preprocess` will take the original defined inputs, and turn them something feedable to the model. It might
contain more information and is usally a :obj:`Dict`.
:obj:`_forward` is the implementation detail and is not meant to be called directly :obj:`forward` is the preferred
called method as it contains safeguards to make sure everything is working on the expected device. If anything is
linked to a real model it belongs in the :obj:`_forward` method, anything else is in the preprocess/postrocess.
:obj:`postprocess` methods will take the output of :obj:`_forward` and turn it into the final output that were decided
earlier.
:obj:`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization
time ``pipeline(...., maybe_arg=4)`` or at call time ``pipe = pipeline(...); output = pipe(...., maybe_arg=4)``.
The returns of :obj:`_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to :obj:`preprocess`,
:obj:`_forward` and :obj:`postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That
allows to keep the default arguments in the function definition which is always more "natural".
A classic example would be a :obj:`top_k` argument in the post processing in classification tasks.
.. code-block::
>>> pipe = pipeline("my-new-task")
>>> pipe("This is a test")
[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05}
{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
>>> pipe("This is a test", top_k=2)
[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
In order to achieve that, we'll update our :obj:`postprocess` method with a default parameter to :obj:`5`. and edit
:obj:`_sanitize_parameters` to allow this new parameter.
.. code-block::
def postprocess(self, model_outputs, top_k=5)
best_class = model_outputs["logits"].softmax(-1)
# Add logic to handle top_k
return best_class
def _sanitize_parameters(self, **kwargs)
preprocess_kwargs = {}
if "maybe_arg" in kwargs:
preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
postprocess_kwargs = {}
if "top_k" in kwargs:
preprocess_kwargs["top_k"] = kwargs["top_k"]
return preprocess_kwargs, {}, postprocess_kwargs
Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy
without requiring users to understand new kind of objects. It's also relatively common to support many different types
of arguments for ease of use (audio files, can be filenames, URLs or pure bytes)
Adding it to the list of supported tasks
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Go to ``src/transformers/pipelines/__init__.py`` and fill in :obj:`SUPPORTED_TASKS` with your newly created pipeline.
If possible it should provide a default model.
Adding tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Create a new file ``tests/test_pipelines_MY_PIPELINE.py`` with example with the other tests.
The :obj:`run_pipeline_test` function will be very generic and run on small random models on every possible
architecture as defined by :obj:`model_mapping` and :obj:`tf_model_mapping`.
This is very important to test future compatibilty, meaning if someone adds a new model for
:obj:`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's
impossible to check for actual values, that's why There is a helper :obj:`ANY` that will simply attempt to match the
output of the pipeline TYPE.
You also *need* to implement 2 (ideally 4) tests.
- :obj:`test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
and test the pipeline outputs. The results should be the same as :obj:`test_small_model_tf`.
- :obj:`test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
and test the pipeline outputs. The results should be the same as :obj:`test_small_model_pt`.
- :obj:`test_large_model_pt` (:obj:`optional`): Tests the pipeline on a real pipeline where the results are supposed to
make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
sure there is no drift in future releases
- :obj:`test_large_model_tf` (:obj:`optional`): Tests the pipeline on a real pipeline where the results are supposed to
make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
sure there is no drift in future releases

View File

@@ -27,10 +27,7 @@ author = "huggingface"
# The short X.Y version
version = ""
# The full version, including alpha/beta/rc tags
release = "4.11.3"
release = "4.10.2"

View File

@@ -176,148 +176,131 @@ Supported models
25. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
26. :doc:`EncoderDecoder <model_doc/encoderdecoder>` (from Google Research) released with the paper `Leveraging
Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
Narayan, Aliaksei Severyn.
27. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
26. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
28. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
27. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
29. :doc:`FNet <model_doc/fnet>` (from Google Research) released with the paper `FNet: Mixing Tokens with Fourier
Transforms <https://arxiv.org/abs/2105.03824>`__ by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago
Ontanon.
30. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
28. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
31. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
29. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
and Ilya Sutskever.
32. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
30. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
Luan, Dario Amodei** and Ilya Sutskever**.
33. :doc:`GPT-J <model_doc/gptj>` (from EleutherAI) released in the repository `kingoflolz/mesh-transformer-jax
<https://github.com/kingoflolz/mesh-transformer-jax/>`__ by Ben Wang and Aran Komatsuzaki.
34. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
31. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
<https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
35. :doc:`Hubert <model_doc/hubert>` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech
32. :doc:`Hubert <model_doc/hubert>` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech
Representation Learning by Masked Prediction of Hidden Units <https://arxiv.org/abs/2106.07447>`__ by Wei-Ning Hsu,
Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
36. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
<https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
37. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
33. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
<https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
34. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
38. :doc:`LayoutLMv2 <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutLMv2:
35. :doc:`LayoutLMv2 <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutLMv2:
Multi-modal Pre-training for Visually-Rich Document Understanding <https://arxiv.org/abs/2012.14740>`__ by Yang Xu,
Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min
Zhang, Lidong Zhou.
39. :doc:`LayoutXLM <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutXLM:
36. :doc:`LayoutXLM <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutXLM:
Multimodal Pre-training for Multilingual Visually-rich Document Understanding <https://arxiv.org/abs/2104.08836>`__
by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
40. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
37. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
<https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
41. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
38. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
42. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
39. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
Representations with Entity-aware Self-attention <https://arxiv.org/abs/2010.01057>`__ by Ikuya Yamada, Akari Asai,
Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
43. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
40. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
by Hao Tan and Mohit Bansal.
44. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
Machine Translation <https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma,
Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal,
Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
45. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
41. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
Machine Translation <https://arxiv.org/abs/2010.11125>`__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
42. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
Translator Team.
46. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
43. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
47. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
44. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
48. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
45. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
49. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
46. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
50. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
47. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
Jianfeng Lu, Tie-Yan Liu.
51. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
48. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
52. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
49. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
Mohammad Saleh and Peter J. Liu.
53. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
50. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
54. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
51. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
55. :doc:`RemBERT <model_doc/rembert>` (from Google Research) released with the paper `Rethinking embedding coupling in
52. :doc:`RemBERT <model_doc/rembert>` (from Google Research) released with the paper `Rethinking embedding coupling in
pre-trained language models <https://arxiv.org/pdf/2010.12821.pdf>`__ by Hyung Won Chung, Thibault Févry, Henry
Tsai, M. Johnson, Sebastian Ruder.
56. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
53. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
57. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
54. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
Enhanced Transformer with Rotary Position Embedding <https://arxiv.org/pdf/2104.09864v1.pdf>`__ by Jianlin Su and
Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
58. :doc:`SpeechEncoderDecoder <model_doc/speechencoderdecoder>`
59. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
55. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
`fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
60. :doc:`SpeechToTextTransformer2 <model_doc/speech_to_text_2>` (from Facebook), released together with the paper
`Large-Scale Self- and Semi-Supervised Learning for Speech Translation <https://arxiv.org/abs/2104.06678>`__ by
Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
61. :doc:`Splinter <model_doc/splinter>` (from Tel Aviv University), released together with the paper `Few-Shot
56. :doc:`Splinter <model_doc/splinter>` (from Tel Aviv University), released together with the paper `Few-Shot
Question Answering by Pretraining Span Selection <https://arxiv.org/abs/2101.00438>`__ by Ori Ram, Yuval Kirstain,
Jonathan Berant, Amir Globerson, Omer Levy.
62. :doc:`SqueezeBert <model_doc/squeezebert>` (from Berkeley) released with the paper `SqueezeBERT: What can computer
vision teach NLP about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola,
Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
63. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
57. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
Krishna, and Kurt W. Keutzer.
58. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
64. :doc:`T5v1.1 <model_doc/t5v1.1>` (from Google AI) released in the repository
`google-research/text-to-text-transfer-transformer
<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__ by
Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi
Zhou and Wei Li and Peter J. Liu.
65. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
59. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
Francesco Piccinno and Julian Martin Eisenschlos.
66. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
60. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
67. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
61. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
68. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
62. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
Performant Baseline for Vision and Language <https://arxiv.org/pdf/1908.03557>`__ by Liunian Harold Li, Mark
Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
69. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
63. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
Zhou, Abdelrahman Mohamed, Michael Auli.
70. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
64. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
71. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
65. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
72. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
66. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
Zettlemoyer and Veselin Stoyanov.
73. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
67. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
74. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
68. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
@@ -341,7 +324,7 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BART | ✅ | ✅ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BeiT | ❌ | ❌ | ✅ | ❌ | |
| BeiT | ❌ | ❌ | ✅ | ❌ | |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BERT | ✅ | ✅ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -353,7 +336,7 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BlenderbotSmall | ✅ | | ✅ | ✅ | ❌ |
| BlenderbotSmall | ✅ | | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -385,14 +368,10 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| FNet | ✅ | ✅ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| GPT-J | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
@@ -427,7 +406,7 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Pegasus | ✅ | ✅ | ✅ | ✅ | |
| Pegasus | ✅ | ✅ | ✅ | ✅ | |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -443,12 +422,8 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| RoFormer | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Speech2Text | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -510,7 +485,6 @@ Flax), PyTorch, and/or TensorFlow.
migration
contributing
add_new_model
add_new_pipeline
fast_tokenizers
performance
parallelism
@@ -579,7 +553,6 @@ Flax), PyTorch, and/or TensorFlow.
model_doc/electra
model_doc/encoderdecoder
model_doc/flaubert
model_doc/fnet
model_doc/fsmt
model_doc/funnel
model_doc/herbert
@@ -601,7 +574,6 @@ Flax), PyTorch, and/or TensorFlow.
model_doc/mt5
model_doc/gpt
model_doc/gpt2
model_doc/gptj
model_doc/gpt_neo
model_doc/hubert
model_doc/pegasus
@@ -613,13 +585,10 @@ Flax), PyTorch, and/or TensorFlow.
model_doc/retribert
model_doc/roberta
model_doc/roformer
model_doc/speechencoderdecoder
model_doc/speech_to_text
model_doc/speech_to_text_2
model_doc/splinter
model_doc/squeezebert
model_doc/t5
model_doc/t5v1.1
model_doc/tapas
model_doc/transformerxl
model_doc/vit

View File

@@ -17,11 +17,6 @@ The base class :class:`~transformers.PretrainedConfig` implements the common met
either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
from HuggingFace's AWS S3 repository).
Each derived config class implements model specific attributes. Common attributes present in all config classes are:
:obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further implement:
:obj:`vocab_size`.
PretrainedConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -1728,7 +1728,7 @@ For example for a pretrained model:
.. code-block:: python
from transformers.deepspeed import HfDeepSpeedConfig
from transformers import AutoModel, deepspeed
from transformers import AugoModel
ds_config = { ... } # deepspeed config object or path to the file
# must run before instantiating the model
@@ -1741,7 +1741,7 @@ or for non-pretrained model:
.. code-block:: python
from transformers.deepspeed import HfDeepSpeedConfig
from transformers import AutoModel, AutoConfig, deepspeed
from transformers import AugoModel, AutoConfig
ds_config = { ... } # deepspeed config object or path to the file
# must run before instantiating the model

View File

@@ -23,22 +23,20 @@ There are two categories of pipeline abstractions to be aware about:
- The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines.
- The other task-specific pipelines:
- :class:`~transformers.AudioClassificationPipeline`
- :class:`~transformers.AutomaticSpeechRecognitionPipeline`
- :class:`~transformers.ConversationalPipeline`
- :class:`~transformers.FeatureExtractionPipeline`
- :class:`~transformers.FillMaskPipeline`
- :class:`~transformers.ImageClassificationPipeline`
- :class:`~transformers.ObjectDetectionPipeline`
- :class:`~transformers.QuestionAnsweringPipeline`
- :class:`~transformers.SummarizationPipeline`
- :class:`~transformers.TableQuestionAnsweringPipeline`
- :class:`~transformers.TextClassificationPipeline`
- :class:`~transformers.TextGenerationPipeline`
- :class:`~transformers.Text2TextGenerationPipeline`
- :class:`~transformers.TokenClassificationPipeline`
- :class:`~transformers.TranslationPipeline`
- :class:`~transformers.ZeroShotClassificationPipeline`
- :class:`~transformers.Text2TextGenerationPipeline`
- :class:`~transformers.TableQuestionAnsweringPipeline`
The pipeline abstraction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -46,60 +44,12 @@ The pipeline abstraction
The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any other
pipeline but requires an additional argument which is the `task`.
Simple call on one item:
.. code-block::
>>> pipe = pipeline("text-classification")
>>> pipe("This restaurant is awesome")
[{'label': 'POSITIVE', 'score': 0.9998743534088135}]
To call a pipeline on many items, you can either call with a `list`.
.. code-block::
>>> pipe = pipeline("text-classification")
>>> pipe(["This restaurant is awesome", "This restaurant is aweful"])
[{'label': 'POSITIVE', 'score': 0.9998743534088135},
{'label': 'NEGATIVE', 'score': 0.9996669292449951}]
To iterate of full datasets it is recommended to use a :obj:`dataset` directly. This means you don't need to allocate
the whole dataset at once, nor do you need to do batching yourself. This should work just as fast as custom loops on
GPU. If it doesn't don't hesitate to create an issue.
.. code-block::
pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
dataset = datasets.load_dataset("superb", name="asr", split="test")
# KeyDataset (only `pt`) will simply return the item in the dict returned by the dataset item
# as we're not interested in the `target` part of the dataset.
for out in tqdm.tqdm(pipe(KeyDataset(dataset, "file"))):
print(out)
# {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
# {"text": ....}
# ....
.. autofunction:: transformers.pipeline
Implementing a pipeline
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:doc:`Implementing a new pipeline <../add_new_pipeline>`
The task specific pipelines
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AudioClassificationPipeline
=======================================================================================================================
.. autoclass:: transformers.AudioClassificationPipeline
:special-members: __call__
:members:
AutomaticSpeechRecognitionPipeline
=======================================================================================================================
@@ -144,13 +94,6 @@ NerPipeline
See :class:`~transformers.TokenClassificationPipeline` for all details.
ObjectDetectionPipeline
=======================================================================================================================
.. autoclass:: transformers.ObjectDetectionPipeline
:special-members: __call__
:members:
QuestionAnsweringPipeline
=======================================================================================================================

View File

@@ -64,9 +64,9 @@ classification:
class MultilabelTrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.get("labels")
labels = inputs.pop("labels")
outputs = model(**inputs)
logits = outputs.get('logits')
logits = outputs.logits
loss_fct = nn.BCEWithLogitsLoss()
loss = loss_fct(logits.view(-1, self.model.config.num_labels),
labels.float().view(-1, self.model.config.num_labels))
@@ -119,29 +119,6 @@ TFTrainingArguments
:members:
Checkpoints
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
By default, :class:`~transformers.Trainer` will save all checkpoints in the :obj:`output_dir` you set in the
:class:`~transformers.TrainingArguments` you are using. Those will go in subfolder named :obj:`checkpoint-xxx` with xxx
being the step at which the training was at.
Resuming training from a checkpoint can be done when calling :meth:`~transformers.Trainer.train` with either:
- :obj:`resume_from_checkpoint=True` which will resume training from the latest checkpoint
- :obj:`resume_from_checkpoint=checkpoint_dir` which will resume training from the specific checkpoint in the directory
passed.
In addition, you can easily save your checkpoints on the Model Hub when using :obj:`push_to_hub=True`. By default, all
the models saved in intermediate checkpoints are saved in different commits, but not the optimizer state. You can adapt
the :obj:`hub-strategy` value of your :class:`~transformers.TrainingArguments` to either:
- :obj:`"checkpoint"`: the latest checkpoint is also pushed in a subfolder named last-checkpoint, allowing you to
resume training easily with :obj:`trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")`.
- :obj:`"all_checkpoints"`: all checkpoints are pushed like they appear in the output folder (so you will get one
checkpoint folder per folder in your final repository)
Logging
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -135,34 +135,6 @@ AutoModelForImageClassification
:members:
AutoModelForAudioClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForAudioClassification
:members:
AutoModelForCTC
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForCTC
:members:
AutoModelForSpeechSeq2Seq
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForSpeechSeq2Seq
:members:
AutoModelForObjectDetection
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForObjectDetection
:members:
TFAutoModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -59,8 +59,7 @@ Tips:
:obj:`use_relative_position_bias` attribute of :class:`~transformers.BeitConfig` to :obj:`True` in order to add
position embeddings.
This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The JAX/FLAX version of this model was
contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
<https://github.com/microsoft/unilm/tree/master/beit>`__.
BeitConfig
@@ -96,24 +95,3 @@ BeitForImageClassification
.. autoclass:: transformers.BeitForImageClassification
:members: forward
FlaxBeitModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBeitModel
:members: __call__
FlaxBeitForMaskedImageModeling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBeitForMaskedImageModeling
:members: __call__
FlaxBeitForImageClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBeitForImageClassification
:members: __call__

View File

@@ -57,13 +57,6 @@ BlenderbotSmallTokenizer
create_token_type_ids_from_sequences, save_vocabulary
BlenderbotSmallTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BlenderbotSmallTokenizerFast
:members:
BlenderbotSmallModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -39,11 +39,8 @@ experiments.*
This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
found `here <https://github.com/google-research/byt5>`__.
ByT5's architecture is based on the T5v1.1 model, so one can refer to :doc:`T5v1.1's documentation page <t5v1.1>`. They
only differ in how inputs should be prepared for the model, see the code examples below.
Since ByT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
ByT5's architecture is based on the T5 model, so one can refer to :doc:`T5's documentation page <t5>`.
Example

View File

@@ -46,7 +46,7 @@ Tips:
This model was contributed by `victorsanh <https://huggingface.co/victorsanh>`__. This model jax version was
contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found :prefix_link:`here
<examples/research_projects/distillation>`.
<examples/research-projects/distillation>`.
DistilBertConfig

View File

@@ -41,13 +41,6 @@ DPRConfig
:members:
DPRPreTrainedModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRPreTrainedModel
:members:
DPRContextEncoderTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -1,121 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
FNet
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The FNet model was proposed in `FNet: Mixing Tokens with Fourier Transforms <https://arxiv.org/abs/2105.03824>`__ by
James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. The model replaces the self-attention layer in a BERT
model with a fourier transform which returns only the real parts of the transform. The model is significantly faster
than the BERT model because it has fewer parameters and is more memory efficient. The model achieves about 92-97%
accuracy of BERT counterparts on GLUE benchmark, and trains much faster than the BERT model. The abstract from the
paper is the following:
*We show that Transformer encoder architectures can be sped up, with limited accuracy costs, by replacing the
self-attention sublayers with simple linear transformations that "mix" input tokens. These linear mixers, along with
standard nonlinearities in feed-forward layers, prove competent at modeling semantic relationships in several text
classification tasks. Most surprisingly, we find that replacing the self-attention sublayer in a Transformer encoder
with a standard, unparameterized Fourier Transform achieves 92-97% of the accuracy of BERT counterparts on the GLUE
benchmark, but trains 80% faster on GPUs and 70% faster on TPUs at standard 512 input lengths. At longer input lengths,
our FNet model is significantly faster: when compared to the "efficient" Transformers on the Long Range Arena
benchmark, FNet matches the accuracy of the most accurate models, while outpacing the fastest models across all
sequence lengths on GPUs (and across relatively shorter lengths on TPUs). Finally, FNet has a light memory footprint
and is particularly efficient at smaller model sizes; for a fixed speed and accuracy budget, small FNet models
outperform Transformer counterparts.*
Tips on usage:
- The model was trained without an attention mask as it is based on Fourier Transform. The model was trained with
maximum sequence length 512 which includes pad tokens. Hence, it is highly recommended to use the same maximum
sequence length for fine-tuning and inference.
This model was contributed by `gchhablani <https://huggingface.co/gchhablani>`__. The original code can be found `here
<https://github.com/google-research/google-research/tree/master/f_net>`__.
FNetConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetConfig
:members:
FNetTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
FNetTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetTokenizerFast
:members:
FNetModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetModel
:members: forward
FNetForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetForPreTraining
:members: forward
FNetForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetForMaskedLM
:members: forward
FNetForNextSentencePrediction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetForNextSentencePrediction
:members: forward
FNetForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetForSequenceClassification
:members: forward
FNetForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetForMultipleChoice
:members: forward
FNetForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetForTokenClassification
:members: forward
FNetForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FNetForQuestionAnswering
:members: forward

View File

@@ -36,11 +36,10 @@ Tips:
- GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as it can be
observed in the `run_generation.py` example script.
- The model can take the `past_key_values` (for PyTorch) or `past` (for TF) as input, which is the previously computed
key/value attention pairs. Using this (`past_key_values` or `past`) value prevents the model from re-computing
pre-computed values in the context of text generation. For PyTorch, see `past_key_values` argument of the
:meth:`~transformers.GPT2Model.forward` method, or for TF the `past` argument of the
:meth:`~transformers.TFGPT2Model.call` method for more information on its usage.
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See
`reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of
this argument.
`Write With Transformer <https://transformer.huggingface.co/doc/gpt2-large>`__ is a webapp created and hosted by
Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five

View File

@@ -1,121 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
GPT-J
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The GPT-J model was released in the `kingoflolz/mesh-transformer-jax
<https://github.com/kingoflolz/mesh-transformer-jax>`__ repository by Ben Wang and Aran Komatsuzaki. It is a GPT-2-like
causal language model trained on `the Pile <https://pile.eleuther.ai/>`__ dataset.
This model was contributed by `Stella Biderman <https://huggingface.co/stellaathena>`__.
Tips:
- To load `GPT-J <https://huggingface.co/EleutherAI/gpt-j-6B>`__ in float32 one would need at least 2x model size CPU
RAM: 1x for initial weights and another 1x to load the checkpoint. So for GPT-J it would take at least 48GB of CPU
RAM to just load the model. To reduce the CPU RAM usage there are a few options. The ``torch_dtype`` argument can be
used to initialize the model in half-precision. And the ``low_cpu_mem_usage`` argument can be used to keep the RAM
usage to 1x. There is also a `fp16 branch <https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16>`__ which stores
the fp16 weights, which could be used to further minimize the RAM usage. Combining all this it should take roughly
12.1GB of CPU RAM to load the model.
.. code-block::
>>> from transformers import GPTJForCausalLM
>>> import torch
>>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True)
- The model should fit on 16GB GPU for inference. For training/fine-tuning it would take much more GPU RAM. Adam
optimizer for example makes four copies of the model: model, gradients, average and squared average of the gradients.
So it would need at least 4x model size GPU memory, even with mixed precision as gradient updates are in fp32. This
is not including the activations and data batches, which would again require some more GPU RAM. So one should explore
solutions such as DeepSpeed, to train/fine-tune the model. Another option is to use the original codebase to
train/fine-tune the model on TPU and then convert the model to Transformers format for inference. Instructions for
that could be found `here <https://github.com/kingoflolz/mesh-transformer-jax/blob/master/howto_finetune.md>`__
- Although the embedding matrix has a size of 50400, only 50257 entries are used by the GPT-2 tokenizer. These extra
tokens are added for the sake of efficiency on TPUs. To avoid the mis-match between embedding matrix size and vocab
size, the tokenizer for `GPT-J <https://huggingface.co/EleutherAI/gpt-j-6B>`__ contains 143 extra tokens
``<|extratoken_1|>... <|extratoken_143|>``, so the ``vocab_size`` of tokenizer also becomes 50400.
Generation
_______________________________________________________________________________________________________________________
The :meth:`~transformers.generation_utils.GenerationMixin.generate` method can be used to generate text using GPT-J
model.
.. code-block::
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
>>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
>>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
... "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
... "researchers was the fact that the unicorns spoke perfect English."
>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
>>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
>>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
...or in float16 precision:
.. code-block::
>>> from transformers import GPTJForCausalLM, AutoTokenizer
>>> import torch
>>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
>>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
>>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
... "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
... "researchers was the fact that the unicorns spoke perfect English."
>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
>>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
>>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
GPTJConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPTJConfig
:members:
GPTJModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPTJModel
:members: forward
GPTJForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPTJForCausalLM
:members: forward
GPTJForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPTJForSequenceClassification
:members: forward

View File

@@ -40,15 +40,6 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like
model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')
Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can
initialize it as follows:
.. code-block::
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base')
As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
<layoutlmv2>` for all tips, code examples and notebooks.

View File

@@ -46,8 +46,8 @@ Tips:
- LED makes use of *global attention* by means of the ``global_attention_mask`` (see
:class:`~transformers.LongformerModel`). For summarization, it is advised to put *global attention* only on the first
``<s>`` token. For question answering, it is advised to put *global attention* on all tokens of the question.
- To fine-tune LED on all 16384, it is necessary to enable *gradient checkpointing* by executing
``model.gradient_checkpointing_enable()``.
- To fine-tune LED on all 16384, it is necessary to enable *gradient checkpointing* by setting
``config.gradient_checkpointing = True``.
- A notebook showing how to evaluate LED, can be accessed `here
<https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing>`__.
- A notebook showing how to fine-tune LED, can be accessed `here

View File

@@ -49,11 +49,11 @@ inside the context manager :meth:`~transformers.MBartTokenizer.as_target_tokeniz
>>> from transformers import MBartForConditionalGeneration, MBartTokenizer
>>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
>>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
>>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(expected_translation_romanian, return_tensors="pt")

View File

@@ -10,7 +10,7 @@
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
mT5
MT5
-----------------------------------------------------------------------------------------------------------------------
Overview
@@ -24,28 +24,9 @@ The abstract from the paper is the following:
*The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail
multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We describe
the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
benchmarks. We also describe a simple technique to prevent "accidental translation" in the zero-shot setting, where a
generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model
checkpoints used in this work are publicly available.*
Note: mT5 was only pre-trained on `mC4 <https://huggingface.co/datasets/mc4>`__ excluding any supervised training.
Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5 model.
Since mT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
Google has released the following variants:
- `google/mt5-small <https://huggingface.co/google/mt5-small>`__
- `google/mt5-base <https://huggingface.co/google/mt5-base>`__
- `google/mt5-large <https://huggingface.co/google/mt5-large>`__
- `google/mt5-xl <https://huggingface.co/google/mt5-xl>`__
- `google/mt5-xxl <https://huggingface.co/google/mt5-xxl>`__.
benchmarks. All of the code and model checkpoints*
This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
found `here <https://github.com/google-research/multilingual-t5>`__.

View File

@@ -152,17 +152,3 @@ TFPegasusForConditionalGeneration
.. autoclass:: transformers.TFPegasusForConditionalGeneration
:members: call
FlaxPegasusModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxPegasusModel
:members: __call__, encode, decode
FlaxPegasusForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxPegasusForConditionalGeneration
:members: __call__, encode, decode

View File

@@ -1,123 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Speech2Text2
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Speech2Text2 model is used together with :doc:`Wav2Vec2 <wav2vec2>` for Speech Translation models proposed in
`Large-Scale Self- and Semi-Supervised Learning for Speech Translation <https://arxiv.org/abs/2104.06678>`__ by
Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only*, such as
:doc:`Wav2Vec2 <wav2vec2>` or :doc:`HuBERT <hubert>` for Speech-to-Text tasks. Please refer to the
:doc:`SpeechEncoderDecoder <speechencoderdecoder>` class on how to combine Speech2Text2 with any speech *encoder-only*
model.
This model was contributed by `Patrick von Platen <https://huggingface.co/patrickvonplaten>`__.
The original code can be found `here
<https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266>`__.
Tips:
- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
the `official models <https://huggingface.co/models?other=speech2text2>`__ .
- Speech2Text2 is always used within the :doc:`SpeechEncoderDecoder <speechencoderdecoder>` framework.
- Speech2Text2's tokenizer currently only supports inference, but not training.
Inference
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Speech2Text2's :class:`~transformers.SpeechEncoderDecoderModel` model accepts raw waveform input values from speech and
makes use of :func:`~transformers.generation_utils.GenerationMixin.generate` to translate the input speech
autoregressively to the target language.
The :class:`~transformers.Wav2Vec2FeatureExtractor` class is responsible for preprocessing the input speech and
:class:`~transformers.Speech2Text2Tokenizer` decodes the generated target tokens to the target string. The
:class:`~transformers.Speech2Text2Processor` wraps :class:`~transformers.Wav2Vec2FeatureExtractor` and
:class:`~transformers.Speech2Text2Tokenizer` into a single instance to both extract the input features and decode the
predicted token ids.
- Step-by-step Speech Translation
.. code-block::
>>> import torch
>>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
>>> generated_ids = model.generate(input_ids=inputs["input_values"], attention_mask=inputs["attention_mask"])
>>> transcription = processor.batch_decode(generated_ids)
- Speech Translation via Pipelines
The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code
.. code-block::
>>> from datasets import load_dataset
>>> from transformers import pipeline
>>> librispeech_en = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
>>> asr = pipeline("automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de")
>>> translation_de = asr(librispeech_en[0]["file"])
See `model hub <https://huggingface.co/models?filter=speech2text2>`__ to look for Speech2Text2 checkpoints.
Speech2Text2Config
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Speech2Text2Config
:members:
Speech2TextTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Speech2Text2Tokenizer
:members: batch_decode, decode, save_vocabulary
Speech2Text2Processor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Speech2Text2Processor
:members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
Speech2Text2ForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Speech2Text2ForCausalLM
:members: forward

View File

@@ -1,40 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Speech Encoder Decoder Models
-----------------------------------------------------------------------------------------------------------------------
The :class:`~transformers.SpeechEncoderDecoderModel` can be used to initialize a speech-sequence-to-text-sequence model
with any pretrained speech autoencoding model as the encoder (*e.g.* :doc:`Wav2Vec2 <wav2vec2>`, :doc:`Hubert
<hubert>`) and any pretrained autoregressive model as the decoder.
The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech
recognition and speech translation has *e.g.* been shown in `Large-Scale Self- and Semi-Supervised Learning for Speech
Translation <https://arxiv.org/abs/2104.06678>`__ by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli,
Alexis Conneau.
An example of how to use a :class:`~transformers.SpeechEncoderDecoderModel` for inference can be seen in
:doc:`Speech2Text2 <speech_to_text_2>`.
SpeechEncoderDecoderConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.SpeechEncoderDecoderConfig
:members:
SpeechEncoderDecoderModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.SpeechEncoderDecoderModel
:members: forward, from_encoder_decoder_pretrained

View File

@@ -13,6 +13,9 @@
T5
-----------------------------------------------------------------------------------------------------------------------
**DISCLAIMER:** This model is still a work in progress, if you see something strange, file a `Github Issue
<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -39,56 +42,28 @@ Tips:
different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*,
for summarization: *summarize: ...*.
- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
- See the :ref:`training`, :ref:`inference` and :ref:`scripts` sections below for all details regarding usage.
T5 comes in different sizes:
- `t5-small <https://huggingface.co/t5-small>`__
- `t5-base <https://huggingface.co/t5-base>`__
- `t5-large <https://huggingface.co/t5-large>`__
- `t5-3b <https://huggingface.co/t5-3b>`__
- `t5-11b <https://huggingface.co/t5-11b>`__.
Based on the original T5 model, Google has released some follow-up works:
- **T5v1.1**: T5v1.1 is an improved version of T5 with some architectural tweaks, and is pre-trained on C4 only without
mixing in the supervised tasks. Refer to the documentation of T5v1.1 which can be found :doc:`here <t5v1.1>`.
- **mT5**: mT5 is a multilingual T5 model. It is pre-trained on the mC4 corpus, which includes 101 languages. Refer to
the documentation of mT5 which can be found :doc:`here <mt5>`.
- **byT5**: byT5 is a T5 model pre-trained on byte sequences rather than SentencePiece subword token sequences. Refer
to the documentation of byT5 which can be found :doc:`here <byt5>`.
All checkpoints can be found on the `hub <https://huggingface.co/models?search=t5>`__.
For more information about which prefix to use, it is easiest to look into Appendix D of the `paper
<https://arxiv.org/pdf/1910.10683.pdf>`__. - For sequence-to-sequence generation, it is recommended to use
:meth:`~transformers.generation_utils.GenerationMixin.generate`. This method takes care of feeding the encoded input
via cross-attention layers to the decoder and auto-regressively generates the decoder output. - T5 uses relative
scalar embeddings. Encoder input padding can be done on the left and on the right.
This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
<https://github.com/google-research/text-to-text-transfer-transformer>`__.
.. _training:
Training
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher
forcing. This means that for training, we always need an input sequence and a corresponding target sequence. The input
sequence is fed to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a
start-sequence token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target
sequence is then appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the
start-sequence token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
One can use :class:`~transformers.T5ForConditionalGeneration` (or the Tensorflow/Flax variant), which includes the
language modeling head on top of the decoder.
forcing. This means that for training we always need an input sequence and a target sequence. The input sequence is fed
to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a start-sequence
token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target sequence is then
appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the start-sequence
token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
- Unsupervised denoising training
In this setup, spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each
sentinel token represents a unique mask token for this sentence and should start with :obj:`<extra_id_0>`,
:obj:`<extra_id_1>`, ... up to :obj:`<extra_id_99>`. As a default, 100 sentinel tokens are available in
@@ -97,201 +72,34 @@ language modeling head on top of the decoder.
For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be
processed as follows:
.. code-block::
.. code-block::
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import T5ForConditionalGeneration, T5Tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
If you're interested in pre-training T5 on a new corpus, check out the `run_t5_mlm_flax.py
<https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling>`__ script in the Examples
directory.
input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
- Supervised training
In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping.
Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input
sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar.", then they should be prepared for
the model as follows:
.. code-block::
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
As you can see, only 2 inputs are required for the model in order to compute a loss: :obj:`input_ids` (which are the
:obj:`input_ids` of the encoded input sequence) and :obj:`labels` (which are the :obj:`input_ids` of the encoded
target sequence). The model will automatically create the :obj:`decoder_input_ids` based on the :obj:`labels`, by
shifting them one position to the right and prepending the :obj:`config.decoder_start_token_id`, which for T5 is
equal to 0 (i.e. the id of the pad token). Also note the task prefix: we prepend the input sequence with 'translate
English to German: ' before encoding it. This will help in improving the performance, as this task prefix was used
during T5's pre-training.
However, the example above only shows a single training example. In practice, one trains deep learning models in
batches. This entails that we must pad/truncate examples to the same length. For encoder-decoder models, one
typically defines a :obj:`max_source_length` and :obj:`max_target_length`, which determine the maximum length of the
input and output sequences respectively (otherwise they are truncated). These should be carefully set depending on
the task.
In addition, we must make sure that padding token id's of the :obj:`labels` are not taken into account by the loss
function. In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the :obj:`ignore_index`
of the :obj:`CrossEntropyLoss`. In Flax, one can use the :obj:`decoder_attention_mask` to ignore padded tokens from
the loss (see the `Flax summarization script
<https://github.com/huggingface/transformers/tree/master/examples/flax/summarization>`__ for details). We also pass
:obj:`attention_mask` as additional input to the model, which makes sure that padding tokens of the inputs are
ignored. The code example below illustrates all of this.
.. code-block::
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
# the following 2 hyperparameters are task-specific
max_source_length = 512
max_target_length = 128
# Suppose we have the following 2 training examples:
input_sequence_1 = "Welcome to NYC"
output_sequence_1 = "Bienvenue à NYC"
input_sequence_2 = "HuggingFace is a company"
output_sequence_2 = "HuggingFace est une entreprise"
# encode the inputs
task_prefix = "translate English to French: "
input_sequences = [input_sequence_1, input_sequence_2]
encoding = tokenizer([task_prefix + sequence for sequence in input_sequences],
padding='longest',
max_length=max_source_length,
truncation=True,
return_tensors="pt")
input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
# encode the targets
target_encoding = tokenizer([output_sequence_1, output_sequence_2],
padding='longest',
max_length=max_target_length,
truncation=True)
labels = target_encoding.input_ids
# replace padding token id's of the labels by -100
labels = [
[(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels
]
labels = torch.tensor(labels)
# forward pass
loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
Additional training tips:
- T5 models need a slightly higher learning rate than the default one set in the :obj:`Trainer` when using the AdamW
optimizer. Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question
answering, question generation). Note that T5 was pre-trained using the AdaFactor optimizer.
- According to `this forum post <https://discuss.huggingface.co/t/t5-finetuning-tips/684>`__, task prefixes matter when
(1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's
pre-training mixture (see Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`__ for the task prefixes
used).
- If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of
`pad_to_multiple_of` to have a small number of predefined bucket sizes to fit all examples in. Dynamically padding
batches to the longest example is not recommended on TPU as it triggers a recompilation for every batch shape that is
encountered during training thus significantly slowing down the training. only padding up to the longest example in a
batch) leads to very slow training on TPU.
.. _inference:
Inference
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
At inference time, it is recommended to use :meth:`~transformers.generation_utils.GenerationMixin.generate`. This
method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder
and auto-regressively generates the decoder output. Check out `this blog post
<https://huggingface.co/blog/how-to-generate>`__ to know all the details about generating text with Transformers.
There's also `this blog post <https://huggingface.co/blog/encoder-decoder#encoder-decoder>`__ which explains how
generation works in general in encoder-decoder models.
In this setup the input sequence and output sequence are standard sequence-to-sequence input output mapping. In
translation, for instance with the input sequence "The house is wonderful." and output sequence "Das Haus ist
wunderbar.", the sentences should be processed as follows:
.. code-block::
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import T5ForConditionalGeneration, T5Tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# Das Haus ist wunderbar.
Note that T5 uses the :obj:`pad_token_id` as the :obj:`decoder_start_token_id`, so when doing generation without using
:meth:`~transformers.generation_utils.GenerationMixin.generate`, make sure you start it with the :obj:`pad_token_id`.
The example above only shows a single example. You can also do batched inference, like so:
.. code-block::
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
# when generating, we will use the logits of right-most token to predict the next token
# so the padding should be on the left
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token # to avoid an error
task_prefix = 'translate English to German: '
sentences = ['The house is wonderful.', 'I like to work in NYC.'] # use different length sentences to test batching
inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
output_sequences = model.generate(
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
do_sample=False, # disable sampling to test if batching affects output
)
print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
# ['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.']
.. _scripts:
Example scripts
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
T5 is supported by several example scripts, both for pre-training and fine-tuning.
* pre-training: the `run_t5_mlm_flax.py
<https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/run_t5_mlm_flax.py>`__
script allows you to further pre-train T5 or pre-train T5 from scratch on your own data. The `t5_tokenizer_model.py
<https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/t5_tokenizer_model.py>`__
script allows you to further train a T5 tokenizer or train a T5 Tokenizer from scratch on your own data. Note that
Flax (a neural network library on top of JAX) is particularly useful to train on TPU hardware.
* fine-tuning: T5 is supported by the official summarization scripts (`PyTorch
<https://github.com/huggingface/transformers/tree/master/examples/pytorch/summarization>`__, `Tensorflow
<https://github.com/huggingface/transformers/tree/master/examples/tensorflow/summarization>`__, and `Flax
<https://github.com/huggingface/transformers/tree/master/examples/flax/summarization>`__) and translation scripts
(`PyTorch <https://github.com/huggingface/transformers/tree/master/examples/pytorch/translation>`__ and `Tensorflow
<https://github.com/huggingface/transformers/tree/master/examples/tensorflow/translation>`__). These scripts allow
you to easily fine-tune T5 on custom data for summarization/translation.
T5Config
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -1,66 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
T5v1.1
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
T5v1.1 was released in the `google-research/text-to-text-transfer-transformer
<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__
repository by Colin Raffel et al. It's an improved version of the original T5 model.
One can directly plug in the weights of T5v1.1 into a T5 model, like so:
.. code-block::
from transformers import T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base')
T5 Version 1.1 includes the following improvements compared to the original T5 model:
- GEGLU activation in the feed-forward hidden layer, rather than ReLU. See `this paper
<https://arxiv.org/abs/2002.05202>`__.
- Dropout was turned off in pre-training (quality win). Dropout should be re-enabled during fine-tuning.
- Pre-trained on C4 only without mixing in the downstream tasks.
- No parameter sharing between the embedding and classifier layer.
- "xl" and "xxl" replace "3B" and "11B". The model shapes are a bit different - larger :obj:`d_model` and smaller
:obj:`num_heads` and :obj:`d_ff`.
Note: T5 Version 1.1 was only pre-trained on `C4 <https://huggingface.co/datasets/c4>`__ excluding any supervised
training. Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5
model. Since t5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
Google has released the following variants:
- `google/t5-v1_1-small <https://huggingface.co/google/t5-v1_1-small>`__
- `google/t5-v1_1-base <https://huggingface.co/google/t5-v1_1-base>`__
- `google/t5-v1_1-large <https://huggingface.co/google/t5-v1_1-large>`__
- `google/t5-v1_1-xl <https://huggingface.co/google/t5-v1_1-xl>`__
- `google/t5-v1_1-xxl <https://huggingface.co/google/t5-v1_1-xxl>`__.
One can refer to :doc:`T5's documentation page <t5>` for all tips, code examples and notebooks.
This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
found `here
<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__.

View File

@@ -341,8 +341,8 @@ Add a model card
To make sure everyone knows what your model can do, what its limitations, potential bias or ethical considerations are,
please add a README.md model card to your model repo. You can just create it, or there's also a convenient button
titled "Add a README.md" on your model page. A model card documentation can be found `here
<https://huggingface.co/docs/hub/model-repos>`__ (meta-suggestions are welcome). model card template (meta-suggestions
titled "Add a README.md" on your model page. A model card template can be found `here
<https://github.com/huggingface/model_card>`__ (meta-suggestions are welcome). model card template (meta-suggestions
are welcome).
.. note::

View File

@@ -53,7 +53,6 @@ Software:
- Tensor Parallelism
- Low-memory Optimizers
- fp16/bf16 (smaller data)
- Gradient checkpointing
@@ -227,21 +226,6 @@ pytorch `autocast` which performs AMP include a caching feature, which speed thi
Autocast maintains a cache of the FP16 casts of model params (leaves). This helps streamline parameter reuse: if the same FP32 param is used in several different FP16list ops, like several matmuls, instead of re-casting the param to FP16 on entering each matmul, the cast will occur on the first matmul, the casted FP16 copy will be cached, and for all later matmuls the FP16 copy will be reused. The cache is maintained only within a particular outermost autocast context. When you exit the autocast context the cache is dropped. For recommended usage, in which autocast wraps the forward pass, and then you exit the context before calling backward(), this means the cache only lasts the duration of the forward pass each iteration, and will be rebuilt next iteration. (The cache of FP16-casted copies MUST be rebuilt each iteration. The FP32 params get updated by the optimizer, so the FP16 copies must be recreated, otherwise the FP16 values will be stale.)
### Gradient Checkpointing
One way to use significantly less GPU memory is to enabled "Gradient Checkpointing" (also known as "activation checkpointing"). When enabled, a lot of memory can be freed at the cost of small decrease in the training speed due to recomputing parts of the graph during back-propagation.
This technique was first shared in the paper: [Training Deep Nets with Sublinear Memory Cost](https://arxiv.org/abs/1604.06174). The paper will also give you the exact details on the savings, but it's in the ballpark of `O(sqrt(n))`, where `n` is the number of feed-forward layers.
To activate this feature in 🤗 Transformers for models that support it, use:
```python
model.gradient_checkpointing_enable()
```
or add `--gradient_checkpointing` to the Trainer arguments.
### Batch sizes
One gets the most efficient performance when batch sizes and input/output neuron counts are divisible by a certain number, which typically starts at 8, but can be much higher as well. That number varies a lot depending on the specific hardware being used and the dtype of the model.

View File

@@ -100,7 +100,7 @@ dataset in memory.
test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')
With 🤗 Transformers, we can simply pass the ``input_ids`` as the ``labels`` to our model, and the average negative
With 🤗 Transformers, we can simply pass the ``input_ids`` as the ``labels`` to our model, and the average
log-likelihood for each token is returned as the loss. With our sliding window approach, however, there is overlap in
the tokens we pass to the model at each iteration. We don't want the log-likelihood for the tokens we're just treating
as context to be included in our loss, so we can set these targets to ``-100`` so that they are ignored. The following
@@ -110,13 +110,10 @@ available to condition on).
.. code-block:: python
import torch
from tqdm import tqdm
max_length = model.config.n_positions
stride = 512
nlls = []
lls = []
for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
begin_loc = max(i + stride - max_length, 0)
end_loc = min(i + stride, encodings.input_ids.size(1))
@@ -127,11 +124,11 @@ available to condition on).
with torch.no_grad():
outputs = model(input_ids, labels=target_ids)
neg_log_likelihood = outputs[0] * trg_len
log_likelihood = outputs[0] * trg_len
nlls.append(neg_log_likelihood)
lls.append(log_likelihood)
ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
ppl = torch.exp(torch.stack(lls).sum() / end_loc)
Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window
strategy we discussed above. The smaller the stride, the more context the model will have in making each prediction,

View File

@@ -67,8 +67,8 @@ make them readable. For instance:
>>> classifier('We are very happy to show you the 🤗 Transformers library.')
[{'label': 'POSITIVE', 'score': 0.9998}]
That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model, returning
a list of dictionaries like this one:
That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a
`batch`, returning a list of dictionaries like this one:
.. code-block::
@@ -79,8 +79,6 @@ a list of dictionaries like this one:
label: POSITIVE, with score: 0.9998
label: NEGATIVE, with score: 0.5309
To use with a large dataset, look at :doc:`iterating over a pipeline <./main_classes/pipelines>`
You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is
fairly neutral.

View File

@@ -42,7 +42,6 @@ Ready-made configurations include the following models:
- BERT
- DistilBERT
- GPT-2
- LayoutLM
- RoBERTa
- T5
- XLM-RoBERTa

View File

@@ -33,7 +33,7 @@ Preparing the datasets
frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
picture-in-picture" allowfullscreen></iframe>
We will use the `🤗 Datasets <https://github.com/huggingface/datasets/>`__ library to download and preprocess the IMDB
We will use the `🤗 Datasets <https:/github.com/huggingface/datasets/>`__ library to download and preprocess the IMDB
datasets. We will go over this part pretty quickly. Since the focus of this tutorial is on training, you should refer
to the 🤗 Datasets `documentation <https://huggingface.co/docs/datasets/>`__ or the :doc:`preprocessing` tutorial for
more information.

View File

@@ -46,8 +46,6 @@ module abstraction using Python dataclasses that leads to concise and explicit c
All of our JAX/Flax models are designed to run efficiently on Google
Cloud TPUs. Here is [a guide for running JAX on Google Cloud TPU](https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm).
Consider applying for the [Google TPU Research Cloud project](https://sites.research.google/trc/) for free TPU compute.
Each example README contains more details on the specific model and training
procedure.

View File

@@ -92,7 +92,7 @@ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=
])
# Save files to disk
tokenizer.save("./tokenizer.json")
tokenizer.save("./")
```
### Create configuration
@@ -241,7 +241,7 @@ Finally, we can run the example script to pretrain the model:
```bash
./run_clm_flax.py \
--output_dir="./" \
--output_dir="./l" \
--model_type="gpt2" \
--config_name="./" \
--tokenizer_name="./" \

View File

@@ -1,128 +0,0 @@
<!---
Copyright 2021 The Google Flax Team Authors and HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Question Answering examples
Based on the script [`run_qa.py`](https://github.com/huggingface/transformers/blob/master/examples/flax/question-answering/run_qa.py).
**Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it
uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in
[this table](https://huggingface.co/transformers/index.html#supported-frameworks), if it doesn't you can still use the old version
of the script.
The following example fine-tunes BERT on SQuAD:
To begin with it is recommended to create a model repository to save the trained model and logs.
Here we call the model `"bert-qa-squad-test"`, but you can change the model name as you like.
You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
you are logged in) or via the command line:
```
huggingface-cli repo create bert-qa-squad-test
```
Next we clone the model repository to add the tokenizer and model files.
```
git clone https://huggingface.co/<your-username>/bert-qa-squad-test
```
Great, we have set up our model repository. During training, we will automatically
push the training logs and model weights to the repo.
Next, let's add a symbolic link to the `run_qa.py`.
```bash
export MODEL_DIR="./bert-qa-squad-test"
ln -s ~/transformers/examples/flax/question-answering/run_qa.py run_qa.py
```
```bash
python run_qa.py \
--model_name_or_path bert-base-uncased \
--dataset_name squad \
--do_train \
--do_eval \
--max_seq_length 384 \
--doc_stride 128 \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--per_device_train_batch_size 12 \
--output_dir ${MODEL_DIR} \
--eval_steps 1000 \
--push_to_hub
```
Using the command above, the script will train for 2 epochs and run eval after each epoch.
Metrics and hyperparameters are stored in Tensorflow event files in `--output_dir`.
You can see the results by running `tensorboard` in that directory:
```bash
$ tensorboard --logdir .
```
or directly on the hub under *Training metrics*.
Training with the previously defined hyper-parameters yields the following results:
```bash
f1 = 88.62
exact_match = 81.34
```
sample Metrics - [tfhub.dev](https://tensorboard.dev/experiment/6gU75Hx8TGCnc6tr4ZgI9Q)
Here is an example training on 4 TITAN RTX GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.1:
```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3
python run_qa.py \
--model_name_or_path bert-large-uncased-whole-word-masking \
--dataset_name squad \
--do_train \
--do_eval \
--per_device_train_batch_size 6 \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir /tmp/wwm_uncased_finetuned_squad/ \
--eval_steps 1000
```
Training with the previously defined hyper-parameters yields the following results:
```bash
f1 = 93.31
exact_match = 87.04
```
### Usage notes
Note that when contexts are long they may be split into multiple training cases, not all of which may contain
the answer span.
As-is, the example script will train on SQuAD or any other question-answering dataset formatted the same way, and can handle user
inputs as well.
### Memory usage and data loading
One thing to note is that all data is loaded into memory in this script. Most question answering datasets are small
enough that this is not an issue, but if you have a very large dataset you will need to modify the script to handle
data streaming.

View File

@@ -1,5 +0,0 @@
datasets >= 1.8.0
jax>=0.2.17
jaxlib>=0.1.68
flax>=0.3.4
optax>=0.0.8

View File

@@ -1,905 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for question answering.
"""
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
import logging
import os
import random
import sys
import time
from dataclasses import dataclass, field
from itertools import chain
from typing import Any, Callable, Dict, Optional, Tuple
import datasets
import numpy as np
from datasets import load_dataset, load_metric
from tqdm import tqdm
import jax
import jax.numpy as jnp
import optax
import transformers
from flax import struct, traverse_util
from flax.jax_utils import replicate, unreplicate
from flax.metrics import tensorboard
from flax.training import train_state
from flax.training.common_utils import get_metrics, onehot, shard
from transformers import (
AutoConfig,
AutoTokenizer,
EvalPrediction,
FlaxAutoModelForQuestionAnswering,
HfArgumentParser,
PreTrainedTokenizerFast,
TrainingArguments,
)
from transformers.utils import check_min_version
from utils_qa import postprocess_qa_predictions
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset
PRNGKey = Any
# region Arguments
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
model_name_or_path: str = field(
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
default=None,
metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"},
)
model_revision: str = field(
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
use_auth_token: bool = field(
default=False,
metadata={
"help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
"with private models)."
},
)
dtype: Optional[str] = field(
default="float32",
metadata={
"help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
},
)
@dataclass
class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
"""
dataset_name: Optional[str] = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
validation_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
)
test_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: int = field(
default=384,
metadata={
"help": "The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded."
},
)
pad_to_max_length: bool = field(
default=False,
metadata={
"help": "Whether to pad all samples to `max_seq_length`. "
"If False, will pad the samples dynamically when batching to the maximum length in the batch (which can "
"be faster on GPU but will be slower on TPU)."
},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
"value if set."
},
)
max_predict_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
"value if set."
},
)
version_2_with_negative: bool = field(
default=False, metadata={"help": "If true, some of the examples do not have an answer."}
)
null_score_diff_threshold: float = field(
default=0.0,
metadata={
"help": "The threshold used to select the null answer: if the best answer has a score that is less than "
"the score of the null answer minus this threshold, the null answer is selected for this example. "
"Only useful when `version_2_with_negative=True`."
},
)
doc_stride: int = field(
default=128,
metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
)
n_best_size: int = field(
default=20,
metadata={"help": "The total number of n-best predictions to generate when looking for an answer."},
)
max_answer_length: int = field(
default=30,
metadata={
"help": "The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another."
},
)
def __post_init__(self):
if (
self.dataset_name is None
and self.train_file is None
and self.validation_file is None
and self.test_file is None
):
raise ValueError("Need either a dataset name or a training/validation file/test_file.")
else:
if self.train_file is not None:
extension = self.train_file.split(".")[-1]
assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
if self.validation_file is not None:
extension = self.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
if self.test_file is not None:
extension = self.test_file.split(".")[-1]
assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
# endregion
# region Create a train state
def create_train_state(
model: FlaxAutoModelForQuestionAnswering,
learning_rate_fn: Callable[[int], float],
num_labels: int,
training_args: TrainingArguments,
) -> train_state.TrainState:
"""Create initial training state."""
class TrainState(train_state.TrainState):
"""Train state with an Optax optimizer.
The two functions below differ depending on whether the task is classification
or regression.
Args:
logits_fn: Applied to last layer to obtain the logits.
loss_fn: Function to compute the loss.
"""
logits_fn: Callable = struct.field(pytree_node=False)
loss_fn: Callable = struct.field(pytree_node=False)
# We use Optax's "masking" functionality to not apply weight decay
# to bias and LayerNorm scale parameters. decay_mask_fn returns a
# mask boolean with the same structure as the parameters.
# The mask is True for parameters that should be decayed.
# Note that this mask is specifically adapted for FlaxBERT-like models.
# For other models, one should correct the layer norm parameter naming
# accordingly.
def decay_mask_fn(params):
flat_params = traverse_util.flatten_dict(params)
flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
return traverse_util.unflatten_dict(flat_mask)
tx = optax.adamw(
learning_rate=learning_rate_fn,
b1=training_args.adam_beta1,
b2=training_args.adam_beta2,
eps=training_args.adam_epsilon,
weight_decay=training_args.weight_decay,
mask=decay_mask_fn,
)
def cross_entropy_loss(logits, labels):
start_loss = optax.softmax_cross_entropy(logits[0], onehot(labels[0], num_classes=num_labels))
end_loss = optax.softmax_cross_entropy(logits[1], onehot(labels[1], num_classes=num_labels))
xentropy = (start_loss + end_loss) / 2.0
return jnp.mean(xentropy)
return TrainState.create(
apply_fn=model.__call__,
params=model.params,
tx=tx,
logits_fn=lambda logits: logits,
loss_fn=cross_entropy_loss,
)
# endregion
# region Create learning rate function
def create_learning_rate_fn(
train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
) -> Callable[[int], jnp.array]:
"""Returns a linear warmup, linear_decay learning rate function."""
steps_per_epoch = train_ds_size // train_batch_size
num_train_steps = steps_per_epoch * num_train_epochs
warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
decay_fn = optax.linear_schedule(
init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
)
schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
return schedule_fn
# endregion
# region train data iterator
def train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
"""Returns shuffled batches of size `batch_size` from truncated `train dataset`, sharded over all local devices."""
steps_per_epoch = len(dataset) // batch_size
perms = jax.random.permutation(rng, len(dataset))
perms = perms[: steps_per_epoch * batch_size] # Skip incomplete batch.
perms = perms.reshape((steps_per_epoch, batch_size))
for perm in perms:
batch = dataset[perm]
batch = {k: np.array(v) for k, v in batch.items()}
batch = shard(batch)
yield batch
# endregion
# region eval data iterator
def eval_data_collator(dataset: Dataset, batch_size: int):
"""Returns batches of size `batch_size` from `eval dataset`, sharded over all local devices."""
for i in range(len(dataset) // batch_size):
batch = dataset[i * batch_size : (i + 1) * batch_size]
batch = {k: np.array(v) for k, v in batch.items()}
batch = shard(batch)
yield batch
# endregion
def main():
# region Argument parsing
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# endregion
# region Logging
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
# Setup logging, we only want one process per machine to log things on the screen.
logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
if jax.process_index() == 0:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
# endregion
# region Load Data
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
#
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
# 'text' is found. You can easily tweak this behavior (see below).
#
# In distributed training, the load_dataset function guarantee that only one local process can concurrently
# download the dataset.
if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
else:
# Loading the dataset from local csv or json file.
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.validation_file.split(".")[-1]
if data_args.test_file is not None:
data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
# endregion
# region Load pretrained model and tokenizer
#
# Load pretrained model and tokenizer
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=True,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# endregion
# region Tokenizer check: this script requires a fast tokenizer.
if not isinstance(tokenizer, PreTrainedTokenizerFast):
raise ValueError(
"This example script only works for models that have a fast tokenizer. Checkout the big table of models "
"at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this "
"requirement"
)
# endregion
# region Preprocessing the datasets
# Preprocessing is slightly different for training and evaluation.
if training_args.do_train:
column_names = raw_datasets["train"].column_names
elif training_args.do_eval:
column_names = raw_datasets["validation"].column_names
else:
column_names = raw_datasets["test"].column_names
question_column_name = "question" if "question" in column_names else column_names[0]
context_column_name = "context" if "context" in column_names else column_names[1]
answer_column_name = "answers" if "answers" in column_names else column_names[2]
# Padding side determines if we do (question|context) or (context|question).
pad_on_right = tokenizer.padding_side == "right"
if data_args.max_seq_length > tokenizer.model_max_length:
logger.warning(
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
# Training preprocessing
def prepare_train_features(examples):
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
# truncation of the context fail (the tokenized question will take a lots of space). So we remove that
# left whitespace
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
# in one example possible giving several features when a context is long, each of those features having a
# context that overlaps a bit the context of the previous feature.
tokenized_examples = tokenizer(
examples[question_column_name if pad_on_right else context_column_name],
examples[context_column_name if pad_on_right else question_column_name],
truncation="only_second" if pad_on_right else "only_first",
max_length=max_seq_length,
stride=data_args.doc_stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
# Since one example might give us several features if it has a long context, we need a map from a feature to
# its corresponding example. This key gives us just that.
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
# The offset mappings will give us a map from token to character position in the original context. This will
# help us compute the start_positions and end_positions.
offset_mapping = tokenized_examples.pop("offset_mapping")
# Let's label those examples!
tokenized_examples["start_positions"] = []
tokenized_examples["end_positions"] = []
for i, offsets in enumerate(offset_mapping):
# We will label impossible answers with the index of the CLS token.
input_ids = tokenized_examples["input_ids"][i]
cls_index = input_ids.index(tokenizer.cls_token_id)
# Grab the sequence corresponding to that example (to know what is the context and what is the question).
sequence_ids = tokenized_examples.sequence_ids(i)
# One example can give several spans, this is the index of the example containing this span of text.
sample_index = sample_mapping[i]
answers = examples[answer_column_name][sample_index]
# If no answers are given, set the cls_index as answer.
if len(answers["answer_start"]) == 0:
tokenized_examples["start_positions"].append(cls_index)
tokenized_examples["end_positions"].append(cls_index)
else:
# Start/end character index of the answer in the text.
start_char = answers["answer_start"][0]
end_char = start_char + len(answers["text"][0])
# Start token index of the current span in the text.
token_start_index = 0
while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
token_start_index += 1
# End token index of the current span in the text.
token_end_index = len(input_ids) - 1
while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
token_end_index -= 1
# Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
tokenized_examples["start_positions"].append(cls_index)
tokenized_examples["end_positions"].append(cls_index)
else:
# Otherwise move the token_start_index and token_end_index to the two ends of the answer.
# Note: we could go after the last offset if the answer is the last word (edge case).
while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
token_start_index += 1
tokenized_examples["start_positions"].append(token_start_index - 1)
while offsets[token_end_index][1] >= end_char:
token_end_index -= 1
tokenized_examples["end_positions"].append(token_end_index + 1)
return tokenized_examples
processed_raw_datasets = dict()
if training_args.do_train:
if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset")
train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None:
# We will select sample from whole data if agument is specified
train_dataset = train_dataset.select(range(data_args.max_train_samples))
# Create train feature from dataset
train_dataset = train_dataset.map(
prepare_train_features,
batched=True,
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
)
if data_args.max_train_samples is not None:
# Number of samples might increase during Feature Creation, We select only specified max samples
train_dataset = train_dataset.select(range(data_args.max_train_samples))
processed_raw_datasets["train"] = train_dataset
# Validation preprocessing
def prepare_validation_features(examples):
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
# truncation of the context fail (the tokenized question will take a lots of space). So we remove that
# left whitespace
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
# in one example possible giving several features when a context is long, each of those features having a
# context that overlaps a bit the context of the previous feature.
tokenized_examples = tokenizer(
examples[question_column_name if pad_on_right else context_column_name],
examples[context_column_name if pad_on_right else question_column_name],
truncation="only_second" if pad_on_right else "only_first",
max_length=max_seq_length,
stride=data_args.doc_stride,
return_overflowing_tokens=True,
return_offsets_mapping=True,
padding="max_length",
)
# Since one example might give us several features if it has a long context, we need a map from a feature to
# its corresponding example. This key gives us just that.
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
# For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
# corresponding example_id and we will store the offset mappings.
tokenized_examples["example_id"] = []
for i in range(len(tokenized_examples["input_ids"])):
# Grab the sequence corresponding to that example (to know what is the context and what is the question).
sequence_ids = tokenized_examples.sequence_ids(i)
context_index = 1 if pad_on_right else 0
# One example can give several spans, this is the index of the example containing this span of text.
sample_index = sample_mapping[i]
tokenized_examples["example_id"].append(examples["id"][sample_index])
# Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
# position is part of the context or not.
tokenized_examples["offset_mapping"][i] = [
(o if sequence_ids[k] == context_index else None)
for k, o in enumerate(tokenized_examples["offset_mapping"][i])
]
return tokenized_examples
if training_args.do_eval:
if "validation" not in raw_datasets:
raise ValueError("--do_eval requires a validation dataset")
eval_examples = raw_datasets["validation"]
if data_args.max_eval_samples is not None:
# We will select sample from whole data
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
# Validation Feature Creation
eval_dataset = eval_examples.map(
prepare_validation_features,
batched=True,
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
)
if data_args.max_eval_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
processed_raw_datasets["validation"] = eval_dataset
if training_args.do_predict:
if "test" not in raw_datasets:
raise ValueError("--do_predict requires a test dataset")
predict_examples = raw_datasets["test"]
if data_args.max_predict_samples is not None:
# We will select sample from whole data
predict_examples = predict_examples.select(range(data_args.max_predict_samples))
# Predict Feature Creation
predict_dataset = predict_examples.map(
prepare_validation_features,
batched=True,
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
)
if data_args.max_predict_samples is not None:
# During Feature creation dataset samples might increase, we will select required samples again
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
processed_raw_datasets["test"] = predict_dataset
# endregion
# region Metrics and Post-processing:
def post_processing_function(examples, features, predictions, stage="eval"):
# Post-processing: we match the start logits and end logits to answers in the original context.
predictions = postprocess_qa_predictions(
examples=examples,
features=features,
predictions=predictions,
version_2_with_negative=data_args.version_2_with_negative,
n_best_size=data_args.n_best_size,
max_answer_length=data_args.max_answer_length,
null_score_diff_threshold=data_args.null_score_diff_threshold,
output_dir=training_args.output_dir,
prefix=stage,
)
# Format the result to the format the metric expects.
if data_args.version_2_with_negative:
formatted_predictions = [
{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
]
else:
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)
metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
def compute_metrics(p: EvalPrediction):
return metric.compute(predictions=p.predictions, references=p.label_ids)
# Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
"""
Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
Args:
start_or_end_logits(:obj:`tensor`):
This is the output predictions of the model. We can only enter either start or end logits.
eval_dataset: Evaluation dataset
max_len(:obj:`int`):
The maximum length of the output tensor. ( See the model.eval() part for more details )
"""
step = 0
# create a numpy array and fill it with -100.
logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64)
# Now since we have create an array now we will populate it with the outputs of the model.
for i, output_logit in enumerate(start_or_end_logits): # populate columns
# We have to fill it such that we have to take the whole tensor and replace it on the newly created array
# And after every iteration we have to change the step
batch_size = output_logit.shape[0]
cols = output_logit.shape[1]
if step + batch_size < len(dataset):
logits_concat[step : step + batch_size, :cols] = output_logit
else:
logits_concat[step:, :cols] = output_logit[: len(dataset) - step]
step += batch_size
return logits_concat
# endregion
# region Training steps and logging init
train_dataset = processed_raw_datasets["train"]
eval_dataset = processed_raw_datasets["validation"]
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
# Define a summary writer
summary_writer = tensorboard.SummaryWriter(training_args.output_dir)
summary_writer.hparams({**training_args.to_dict(), **vars(model_args), **vars(data_args)})
def write_train_metric(summary_writer, train_metrics, train_time, step):
summary_writer.scalar("train_time", train_time, step)
train_metrics = get_metrics(train_metrics)
for key, vals in train_metrics.items():
tag = f"train_{key}"
for i, val in enumerate(vals):
summary_writer.scalar(tag, val, step - len(vals) + i + 1)
def write_eval_metric(summary_writer, eval_metrics, step):
for metric_name, value in eval_metrics.items():
summary_writer.scalar(f"eval_{metric_name}", value, step)
num_epochs = int(training_args.num_train_epochs)
rng = jax.random.PRNGKey(training_args.seed)
dropout_rngs = jax.random.split(rng, jax.local_device_count())
train_batch_size = training_args.per_device_train_batch_size * jax.local_device_count()
eval_batch_size = training_args.per_device_eval_batch_size * jax.local_device_count()
# endregion
# region Load model
model = FlaxAutoModelForQuestionAnswering.from_pretrained(
model_args.model_name_or_path,
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
seed=training_args.seed,
dtype=getattr(jnp, model_args.dtype),
)
learning_rate_fn = create_learning_rate_fn(
len(train_dataset),
train_batch_size,
training_args.num_train_epochs,
training_args.warmup_steps,
training_args.learning_rate,
)
state = create_train_state(model, learning_rate_fn, num_labels=max_seq_length, training_args=training_args)
# endregion
# region Define train step functions
def train_step(
state: train_state.TrainState, batch: Dict[str, Array], dropout_rng: PRNGKey
) -> Tuple[train_state.TrainState, float]:
"""Trains model with an optimizer (both in `state`) on `batch`, returning a pair `(new_state, loss)`."""
dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
start_positions = batch.pop("start_positions")
end_positions = batch.pop("end_positions")
targets = (start_positions, end_positions)
def loss_fn(params):
logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)
loss = state.loss_fn(logits, targets)
return loss
grad_fn = jax.value_and_grad(loss_fn)
loss, grad = grad_fn(state.params)
grad = jax.lax.pmean(grad, "batch")
new_state = state.apply_gradients(grads=grad)
metrics = jax.lax.pmean({"loss": loss, "learning_rate": learning_rate_fn(state.step)}, axis_name="batch")
return new_state, metrics, new_dropout_rng
p_train_step = jax.pmap(train_step, axis_name="batch", donate_argnums=(0,))
# endregion
# region Define eval step functions
def eval_step(state, batch):
logits = state.apply_fn(**batch, params=state.params, train=False)
return state.logits_fn(logits)
p_eval_step = jax.pmap(eval_step, axis_name="batch")
# endregion
# region Define train and eval loop
logger.info(f"===== Starting training ({num_epochs} epochs) =====")
train_time = 0
# make sure weights are replicated on each device
state = replicate(state)
train_time = 0
step_per_epoch = len(train_dataset) // train_batch_size
total_steps = step_per_epoch * num_epochs
epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
for epoch in epochs:
train_start = time.time()
train_metrics = []
# Create sampling rng
rng, input_rng = jax.random.split(rng)
# train
for step, batch in enumerate(
tqdm(
train_data_collator(input_rng, train_dataset, train_batch_size),
total=step_per_epoch,
desc="Training...",
position=1,
),
1,
):
state, train_metric, dropout_rngs = p_train_step(state, batch, dropout_rngs)
train_metrics.append(train_metric)
cur_step = epoch * step_per_epoch + step
if cur_step % training_args.logging_steps == 0 and cur_step > 0:
# Save metrics
train_metric = unreplicate(train_metric)
train_time += time.time() - train_start
if jax.process_index() == 0:
write_train_metric(summary_writer, train_metrics, train_time, cur_step)
epochs.write(
f"Step... ({cur_step}/{total_steps} | Training Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
)
train_metrics = []
if (
training_args.do_eval
and (cur_step % training_args.eval_steps == 0 or cur_step % step_per_epoch == 0)
and cur_step > 0
):
eval_metrics = {}
all_start_logits = []
all_end_logits = []
# evaluate
for batch in tqdm(
eval_data_collator(eval_dataset, eval_batch_size),
total=len(eval_dataset) // eval_batch_size,
desc="Evaluating ...",
position=2,
):
_ = batch.pop("example_id")
_ = batch.pop("offset_mapping")
predictions = p_eval_step(state, batch)
start_logits = np.array([pred for pred in chain(*predictions[0])])
end_logits = np.array([pred for pred in chain(*predictions[1])])
all_start_logits.append(start_logits)
all_end_logits.append(end_logits)
# evaluate also on leftover examples (not divisible by batch_size)
num_leftover_samples = len(eval_dataset) % eval_batch_size
# make sure leftover batch is evaluated on one device
if num_leftover_samples > 0 and jax.process_index() == 0:
# take leftover samples
batch = eval_dataset[-num_leftover_samples:]
batch = {k: np.array(v) for k, v in batch.items()}
_ = batch.pop("example_id")
_ = batch.pop("offset_mapping")
predictions = eval_step(unreplicate(state), batch)
start_logits = np.array([pred for pred in predictions[0]])
end_logits = np.array([pred for pred in predictions[1]])
all_start_logits.append(start_logits)
all_end_logits.append(end_logits)
max_len = max([x.shape[1] for x in all_start_logits]) # Get the max_length of the tensor
# concatenate the numpy array
start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len)
end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len)
# delete the list of numpy arrays
del all_start_logits
del all_end_logits
outputs_numpy = (start_logits_concat, end_logits_concat)
prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
eval_metrics = compute_metrics(prediction)
logger.info(f"Step... ({cur_step}/{total_steps} | Evaluation metrics: {eval_metrics})")
if jax.process_index() == 0:
write_eval_metric(summary_writer, eval_metrics, cur_step)
if (cur_step % training_args.save_steps == 0 and cur_step > 0) or (cur_step == total_steps):
# save checkpoint after each epoch and push checkpoint to the hub
if jax.process_index() == 0:
params = jax.device_get(unreplicate(state.params))
model.save_pretrained(
training_args.output_dir,
params=params,
push_to_hub=training_args.push_to_hub,
commit_message=f"Saving weights and logs of step {cur_step}",
)
epochs.desc = f"Epoch ... {epoch + 1}/{num_epochs}"
# endregion
if __name__ == "__main__":
main()

View File

@@ -1,427 +0,0 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Post-processing utilities for question answering.
"""
import collections
import json
import logging
import os
from typing import Optional, Tuple
import numpy as np
from tqdm.auto import tqdm
logger = logging.getLogger(__name__)
def postprocess_qa_predictions(
examples,
features,
predictions: Tuple[np.ndarray, np.ndarray],
version_2_with_negative: bool = False,
n_best_size: int = 20,
max_answer_length: int = 30,
null_score_diff_threshold: float = 0.0,
output_dir: Optional[str] = None,
prefix: Optional[str] = None,
log_level: Optional[int] = logging.WARNING,
):
"""
Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
original contexts. This is the base postprocessing functions for models that only return start and end logits.
Args:
examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the underlying dataset contains examples with no answers.
n_best_size (:obj:`int`, `optional`, defaults to 20):
The total number of n-best predictions to generate when looking for an answer.
max_answer_length (:obj:`int`, `optional`, defaults to 30):
The maximum length of an answer that can be generated. This is needed because the start and end predictions
are not conditioned on one another.
null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
The threshold used to select the null answer: if the best answer has a score that is less than the score of
the null answer minus this threshold, the null answer is selected for this example (note that the score of
the null answer for an example giving several features is the minimum of the scores for the null answer on
each feature: all features must be aligned on the fact they `want` to predict a null answer).
Only useful when :obj:`version_2_with_negative` is :obj:`True`.
output_dir (:obj:`str`, `optional`):
If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
:obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
answers, are saved in `output_dir`.
prefix (:obj:`str`, `optional`):
If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
``logging`` log level (e.g., ``logging.WARNING``)
"""
assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
all_start_logits, all_end_logits = predictions
assert len(predictions[0]) == len(features), f"Got {len(predictions[0])} predictions and {len(features)} features."
# Build a map example to its corresponding features.
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
features_per_example[example_id_to_index[feature["example_id"]]].append(i)
# The dictionaries we have to fill.
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
if version_2_with_negative:
scores_diff_json = collections.OrderedDict()
# Logging.
logger.setLevel(log_level)
logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
# Let's loop over all the examples!
for example_index, example in enumerate(tqdm(examples)):
# Those are the indices of the features associated to the current example.
feature_indices = features_per_example[example_index]
min_null_prediction = None
prelim_predictions = []
# Looping through all the features associated to the current example.
for feature_index in feature_indices:
# We grab the predictions of the model for this feature.
start_logits = all_start_logits[feature_index]
end_logits = all_end_logits[feature_index]
# This is what will allow us to map some the positions in our logits to span of texts in the original
# context.
offset_mapping = features[feature_index]["offset_mapping"]
# Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
# available in the current feature.
token_is_max_context = features[feature_index].get("token_is_max_context", None)
# Update minimum null prediction.
feature_null_score = start_logits[0] + end_logits[0]
if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
min_null_prediction = {
"offsets": (0, 0),
"score": feature_null_score,
"start_logit": start_logits[0],
"end_logit": end_logits[0],
}
# Go through all possibilities for the `n_best_size` greater start and end logits.
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
for start_index in start_indexes:
for end_index in end_indexes:
# Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
# to part of the input_ids that are not in the context.
if (
start_index >= len(offset_mapping)
or end_index >= len(offset_mapping)
or offset_mapping[start_index] is None
or offset_mapping[end_index] is None
):
continue
# Don't consider answers with a length that is either < 0 or > max_answer_length.
if end_index < start_index or end_index - start_index + 1 > max_answer_length:
continue
# Don't consider answer that don't have the maximum context available (if such information is
# provided).
if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
continue
prelim_predictions.append(
{
"offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
"score": start_logits[start_index] + end_logits[end_index],
"start_logit": start_logits[start_index],
"end_logit": end_logits[end_index],
}
)
if version_2_with_negative:
# Add the minimum null prediction
prelim_predictions.append(min_null_prediction)
null_score = min_null_prediction["score"]
# Only keep the best `n_best_size` predictions.
predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
# Add back the minimum null prediction if it was removed because of its low score.
if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions):
predictions.append(min_null_prediction)
# Use the offsets to gather the answer text in the original context.
context = example["context"]
for pred in predictions:
offsets = pred.pop("offsets")
pred["text"] = context[offsets[0] : offsets[1]]
# In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
# failure.
if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})
# Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
# the LogSumExp trick).
scores = np.array([pred.pop("score") for pred in predictions])
exp_scores = np.exp(scores - np.max(scores))
probs = exp_scores / exp_scores.sum()
# Include the probabilities in our predictions.
for prob, pred in zip(probs, predictions):
pred["probability"] = prob
# Pick the best prediction. If the null answer is not possible, this is easy.
if not version_2_with_negative:
all_predictions[example["id"]] = predictions[0]["text"]
else:
# Otherwise we first need to find the best non-empty prediction.
i = 0
while predictions[i]["text"] == "":
i += 1
best_non_null_pred = predictions[i]
# Then we compare to the null prediction using the threshold.
score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable.
if score_diff > null_score_diff_threshold:
all_predictions[example["id"]] = ""
else:
all_predictions[example["id"]] = best_non_null_pred["text"]
# Make `predictions` JSON-serializable by casting np.float back to float.
all_nbest_json[example["id"]] = [
{k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
for pred in predictions
]
# If we have an output_dir, let's save all those dicts.
if output_dir is not None:
assert os.path.isdir(output_dir), f"{output_dir} is not a directory."
prediction_file = os.path.join(
output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
)
nbest_file = os.path.join(
output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
)
if version_2_with_negative:
null_odds_file = os.path.join(
output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
)
logger.info(f"Saving predictions to {prediction_file}.")
with open(prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
logger.info(f"Saving nbest_preds to {nbest_file}.")
with open(nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
logger.info(f"Saving null_odds to {null_odds_file}.")
with open(null_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
return all_predictions
def postprocess_qa_predictions_with_beam_search(
examples,
features,
predictions: Tuple[np.ndarray, np.ndarray],
version_2_with_negative: bool = False,
n_best_size: int = 20,
max_answer_length: int = 30,
start_n_top: int = 5,
end_n_top: int = 5,
output_dir: Optional[str] = None,
prefix: Optional[str] = None,
log_level: Optional[int] = logging.WARNING,
):
"""
Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as
cls token predictions.
Args:
examples: The non-preprocessed dataset (see the main script for more information).
features: The processed dataset (see the main script for more information).
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
first dimension must match the number of elements of :obj:`features`.
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the underlying dataset contains examples with no answers.
n_best_size (:obj:`int`, `optional`, defaults to 20):
The total number of n-best predictions to generate when looking for an answer.
max_answer_length (:obj:`int`, `optional`, defaults to 30):
The maximum length of an answer that can be generated. This is needed because the start and end predictions
are not conditioned on one another.
start_n_top (:obj:`int`, `optional`, defaults to 5):
The number of top start logits too keep when searching for the :obj:`n_best_size` predictions.
end_n_top (:obj:`int`, `optional`, defaults to 5):
The number of top end logits too keep when searching for the :obj:`n_best_size` predictions.
output_dir (:obj:`str`, `optional`):
If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
:obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
answers, are saved in `output_dir`.
prefix (:obj:`str`, `optional`):
If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
``logging`` log level (e.g., ``logging.WARNING``)
"""
assert len(predictions) == 5, "`predictions` should be a tuple with five elements."
start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
assert len(predictions[0]) == len(
features
), f"Got {len(predictions[0])} predicitions and {len(features)} features."
# Build a map example to its corresponding features.
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
features_per_example[example_id_to_index[feature["example_id"]]].append(i)
# The dictionaries we have to fill.
all_predictions = collections.OrderedDict()
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict() if version_2_with_negative else None
# Logging.
logger.setLevel(log_level)
logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
# Let's loop over all the examples!
for example_index, example in enumerate(tqdm(examples)):
# Those are the indices of the features associated to the current example.
feature_indices = features_per_example[example_index]
min_null_score = None
prelim_predictions = []
# Looping through all the features associated to the current example.
for feature_index in feature_indices:
# We grab the predictions of the model for this feature.
start_log_prob = start_top_log_probs[feature_index]
start_indexes = start_top_index[feature_index]
end_log_prob = end_top_log_probs[feature_index]
end_indexes = end_top_index[feature_index]
feature_null_score = cls_logits[feature_index]
# This is what will allow us to map some the positions in our logits to span of texts in the original
# context.
offset_mapping = features[feature_index]["offset_mapping"]
# Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
# available in the current feature.
token_is_max_context = features[feature_index].get("token_is_max_context", None)
# Update minimum null prediction
if min_null_score is None or feature_null_score < min_null_score:
min_null_score = feature_null_score
# Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits.
for i in range(start_n_top):
for j in range(end_n_top):
start_index = int(start_indexes[i])
j_index = i * end_n_top + j
end_index = int(end_indexes[j_index])
# Don't consider out-of-scope answers (last part of the test should be unnecessary because of the
# p_mask but let's not take any risk)
if (
start_index >= len(offset_mapping)
or end_index >= len(offset_mapping)
or offset_mapping[start_index] is None
or offset_mapping[end_index] is None
):
continue
# Don't consider answers with a length negative or > max_answer_length.
if end_index < start_index or end_index - start_index + 1 > max_answer_length:
continue
# Don't consider answer that don't have the maximum context available (if such information is
# provided).
if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
continue
prelim_predictions.append(
{
"offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
"score": start_log_prob[i] + end_log_prob[j_index],
"start_log_prob": start_log_prob[i],
"end_log_prob": end_log_prob[j_index],
}
)
# Only keep the best `n_best_size` predictions.
predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
# Use the offsets to gather the answer text in the original context.
context = example["context"]
for pred in predictions:
offsets = pred.pop("offsets")
pred["text"] = context[offsets[0] : offsets[1]]
# In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
# failure.
if len(predictions) == 0:
predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6})
# Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
# the LogSumExp trick).
scores = np.array([pred.pop("score") for pred in predictions])
exp_scores = np.exp(scores - np.max(scores))
probs = exp_scores / exp_scores.sum()
# Include the probabilities in our predictions.
for prob, pred in zip(probs, predictions):
pred["probability"] = prob
# Pick the best prediction and set the probability for the null answer.
all_predictions[example["id"]] = predictions[0]["text"]
if version_2_with_negative:
scores_diff_json[example["id"]] = float(min_null_score)
# Make `predictions` JSON-serializable by casting np.float back to float.
all_nbest_json[example["id"]] = [
{k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
for pred in predictions
]
# If we have an output_dir, let's save all those dicts.
if output_dir is not None:
assert os.path.isdir(output_dir), f"{output_dir} is not a directory."
prediction_file = os.path.join(
output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
)
nbest_file = os.path.join(
output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
)
if version_2_with_negative:
null_odds_file = os.path.join(
output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
)
logger.info(f"Saving predictions to {prediction_file}.")
with open(prediction_file, "w") as writer:
writer.write(json.dumps(all_predictions, indent=4) + "\n")
logger.info(f"Saving nbest_preds to {nbest_file}.")
with open(nbest_file, "w") as writer:
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
if version_2_with_negative:
logger.info(f"Saving null_odds to {null_odds_file}.")
with open(null_odds_file, "w") as writer:
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
return all_predictions, scores_diff_json

View File

@@ -1,74 +0,0 @@
<!---
Copyright 2021 The Google Flax Team Authors and HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Token classification examples
Fine-tuning the library models for token classification task such as Named Entity Recognition (NER), Parts-of-speech tagging (POS) or phrase extraction (CHUNKS). The main script run_flax_ner.py leverages the 🤗 Datasets library. You can easily customize it to your needs if you need extra processing on your datasets.
It will either run on a datasets hosted on our hub or with your own text files for training and validation, you might just need to add some tweaks in the data preprocessing.
The following example fine-tunes BERT on CoNLL-2003:
To begin with it is recommended to create a model repository to save the trained model and logs.
Here we call the model `"bert-ner-conll2003-test"`, but you can change the model name as you like.
You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
you are logged in) or via the command line:
```
huggingface-cli repo create bert-ner-conll2003-test
```
Next we clone the model repository to add the tokenizer and model files.
```
git clone https://huggingface.co/<your-username>/bert-ner-conll2003-test
```
Great, we have set up our model repository. During training, we will automatically
push the training logs and model weights to the repo.
Next, let's add a symbolic link to the `run_flax_ner.py`.
```bash
export MODEL_DIR="./bert-ner-conll2003-test"
ln -s ~/transformers/examples/flax/token-classification/run_flax_ner.py run_flax_ner.py
```
```bash
python run_flax_ner.py \
--model_name_or_path bert-base-cased \
--dataset_name conll2003 \
--max_seq_length 128 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--per_device_train_batch_size 4 \
--output_dir ${MODEL_DIR} \
--eval_steps 300 \
--push_to_hub
```
Using the command above, the script will train for 3 epochs and run eval after each epoch.
Metrics and hyperparameters are stored in Tensorflow event files in `--output_dir`.
You can see the results by running `tensorboard` in that directory:
```bash
$ tensorboard --logdir .
```
or directly on the hub under *Training metrics*.
sample Metrics - [tfhub.dev](https://tensorboard.dev/experiment/u52qsBIpQSKEEXEJd2LVYA)

View File

@@ -1,6 +0,0 @@
datasets >= 1.8.0
jax>=0.2.8
jaxlib>=0.1.59
flax>=0.3.4
optax>=0.0.8
seqeval

View File

@@ -1,669 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Fine-tuning a 🤗 Flax Transformers model on token classification tasks (NER, POS, CHUNKS)"""
import logging
import os
import random
import sys
import time
from dataclasses import dataclass, field
from itertools import chain
from typing import Any, Callable, Dict, Optional, Tuple
import datasets
import numpy as np
from datasets import ClassLabel, load_dataset, load_metric
from tqdm import tqdm
import jax
import jax.numpy as jnp
import optax
import transformers
from flax import struct, traverse_util
from flax.jax_utils import replicate, unreplicate
from flax.metrics import tensorboard
from flax.training import train_state
from flax.training.common_utils import get_metrics, onehot, shard
from transformers import (
AutoConfig,
AutoTokenizer,
FlaxAutoModelForTokenClassification,
HfArgumentParser,
TrainingArguments,
)
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
Array = Any
Dataset = datasets.arrow_dataset.Dataset
PRNGKey = Any
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
model_name_or_path: str = field(
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
tokenizer_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
default=None,
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
model_revision: str = field(
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
use_auth_token: bool = field(
default=False,
metadata={
"help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
"with private models)."
},
)
@dataclass
class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
"""
task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."})
dataset_name: Optional[str] = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
train_file: Optional[str] = field(
default=None, metadata={"help": "The input training data file (a csv or JSON file)."}
)
validation_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
)
test_file: Optional[str] = field(
default=None,
metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
)
text_column_name: Optional[str] = field(
default=None, metadata={"help": "The column name of text to input in the file (a csv or JSON file)."}
)
label_column_name: Optional[str] = field(
default=None, metadata={"help": "The column name of label to input in the file (a csv or JSON file)."}
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: int = field(
default=None,
metadata={
"help": "The maximum total input sequence length after tokenization. If set, sequences longer "
"than this will be truncated, sequences shorter will be padded."
},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
"value if set."
},
)
max_predict_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
"value if set."
},
)
label_all_tokens: bool = field(
default=False,
metadata={
"help": "Whether to put the label for one word on all tokens of generated by that word or just on the "
"one (in which case the other tokens will have a padding index)."
},
)
return_entity_level_metrics: bool = field(
default=False,
metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
)
def __post_init__(self):
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
raise ValueError("Need either a dataset name or a training/validation file.")
else:
if self.train_file is not None:
extension = self.train_file.split(".")[-1]
assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
if self.validation_file is not None:
extension = self.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
self.task_name = self.task_name.lower()
def create_train_state(
model: FlaxAutoModelForTokenClassification,
learning_rate_fn: Callable[[int], float],
num_labels: int,
training_args: TrainingArguments,
) -> train_state.TrainState:
"""Create initial training state."""
class TrainState(train_state.TrainState):
"""Train state with an Optax optimizer.
The two functions below differ depending on whether the task is classification
or regression.
Args:
logits_fn: Applied to last layer to obtain the logits.
loss_fn: Function to compute the loss.
"""
logits_fn: Callable = struct.field(pytree_node=False)
loss_fn: Callable = struct.field(pytree_node=False)
# We use Optax's "masking" functionality to not apply weight decay
# to bias and LayerNorm scale parameters. decay_mask_fn returns a
# mask boolean with the same structure as the parameters.
# The mask is True for parameters that should be decayed.
# Note that this mask is specifically adapted for FlaxBERT-like models.
# For other models, one should correct the layer norm parameter naming
# accordingly.
def decay_mask_fn(params):
flat_params = traverse_util.flatten_dict(params)
flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
return traverse_util.unflatten_dict(flat_mask)
tx = optax.adamw(
learning_rate=learning_rate_fn,
b1=training_args.adam_beta1,
b2=training_args.adam_beta2,
eps=training_args.adam_epsilon,
weight_decay=training_args.weight_decay,
mask=decay_mask_fn,
)
def cross_entropy_loss(logits, labels):
xentropy = optax.softmax_cross_entropy(logits, onehot(labels, num_classes=num_labels))
return jnp.mean(xentropy)
return TrainState.create(
apply_fn=model.__call__,
params=model.params,
tx=tx,
logits_fn=lambda logits: logits.argmax(-1),
loss_fn=cross_entropy_loss,
)
def create_learning_rate_fn(
train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
) -> Callable[[int], jnp.array]:
"""Returns a linear warmup, linear_decay learning rate function."""
steps_per_epoch = train_ds_size // train_batch_size
num_train_steps = steps_per_epoch * num_train_epochs
warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
decay_fn = optax.linear_schedule(
init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
)
schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
return schedule_fn
def train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
"""Returns shuffled batches of size `batch_size` from truncated `train dataset`, sharded over all local devices."""
steps_per_epoch = len(dataset) // batch_size
perms = jax.random.permutation(rng, len(dataset))
perms = perms[: steps_per_epoch * batch_size] # Skip incomplete batch.
perms = perms.reshape((steps_per_epoch, batch_size))
for perm in perms:
batch = dataset[perm]
batch = {k: np.array(v) for k, v in batch.items()}
batch = shard(batch)
yield batch
def eval_data_collator(dataset: Dataset, batch_size: int):
"""Returns batches of size `batch_size` from `eval dataset`, sharded over all local devices."""
for i in range(len(dataset) // batch_size):
batch = dataset[i * batch_size : (i + 1) * batch_size]
batch = {k: np.array(v) for k, v in batch.items()}
batch = shard(batch)
yield batch
def main():
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
# Setup logging, we only want one process per machine to log things on the screen.
logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
if jax.process_index() == 0:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
#
# For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
# 'tokens' is found. You can easily tweak this behavior (see below).
#
# In distributed training, the load_dataset function guarantee that only one local process can concurrently
# download the dataset.
if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
)
else:
# Loading the dataset from local csv or json file.
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = (data_args.train_file if data_args.train_file is not None else data_args.valid_file).split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# See more about loading any type of standard or custom dataset at
# https://huggingface.co/docs/datasets/loading_datasets.html.
if raw_datasets["train"] is not None:
column_names = raw_datasets["train"].column_names
features = raw_datasets["train"].features
else:
column_names = raw_datasets["validation"].column_names
features = raw_datasets["validation"].features
if data_args.text_column_name is not None:
text_column_name = data_args.text_column_name
elif "tokens" in column_names:
text_column_name = "tokens"
else:
text_column_name = column_names[0]
if data_args.label_column_name is not None:
label_column_name = data_args.label_column_name
elif f"{data_args.task_name}_tags" in column_names:
label_column_name = f"{data_args.task_name}_tags"
else:
label_column_name = column_names[1]
# In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
# unique labels.
def get_label_list(labels):
unique_labels = set()
for label in labels:
unique_labels = unique_labels | set(label)
label_list = list(unique_labels)
label_list.sort()
return label_list
if isinstance(features[label_column_name].feature, ClassLabel):
label_list = features[label_column_name].feature.names
# No need to convert the labels since they are already ints.
label_to_id = {i: i for i in range(len(label_list))}
else:
label_list = get_label_list(raw_datasets["train"][label_column_name])
label_to_id = {l: i for i, l in enumerate(label_list)}
num_labels = len(label_list)
# Load pretrained model and tokenizer
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
num_labels=num_labels,
label2id=label_to_id,
id2label={i: l for l, i in label_to_id.items()},
finetuning_task=data_args.task_name,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
if config.model_type in {"gpt2", "roberta"}:
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
add_prefix_space=True,
)
else:
tokenizer = AutoTokenizer.from_pretrained(
tokenizer_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = FlaxAutoModelForTokenClassification.from_pretrained(
model_args.model_name_or_path,
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# Preprocessing the datasets
# Tokenize all texts and align the labels with them.
def tokenize_and_align_labels(examples):
tokenized_inputs = tokenizer(
examples[text_column_name],
max_length=data_args.max_seq_length,
padding="max_length",
truncation=True,
# We use this argument because the texts in our dataset are lists of words (with a label for each word).
is_split_into_words=True,
)
labels = []
for i, label in enumerate(examples[label_column_name]):
word_ids = tokenized_inputs.word_ids(batch_index=i)
previous_word_idx = None
label_ids = []
for word_idx in word_ids:
# Special tokens have a word id that is None. We set the label to -100 so they are automatically
# ignored in the loss function.
if word_idx is None:
label_ids.append(-100)
# We set the label for the first token of each word.
elif word_idx != previous_word_idx:
label_ids.append(label_to_id[label[word_idx]])
# For the other tokens in a word, we set the label to either the current label or -100, depending on
# the label_all_tokens flag.
else:
label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
previous_word_idx = word_idx
labels.append(label_ids)
tokenized_inputs["labels"] = labels
return tokenized_inputs
processed_raw_datasets = raw_datasets.map(
tokenize_and_align_labels,
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
remove_columns=raw_datasets["train"].column_names,
desc="Running tokenizer on dataset",
)
train_dataset = processed_raw_datasets["train"]
eval_dataset = processed_raw_datasets["validation"]
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
# Define a summary writer
summary_writer = tensorboard.SummaryWriter(training_args.output_dir)
summary_writer.hparams({**training_args.to_dict(), **vars(model_args), **vars(data_args)})
def write_train_metric(summary_writer, train_metrics, train_time, step):
summary_writer.scalar("train_time", train_time, step)
train_metrics = get_metrics(train_metrics)
for key, vals in train_metrics.items():
tag = f"train_{key}"
for i, val in enumerate(vals):
summary_writer.scalar(tag, val, step - len(vals) + i + 1)
def write_eval_metric(summary_writer, eval_metrics, step):
for metric_name, value in eval_metrics.items():
summary_writer.scalar(f"eval_{metric_name}", value, step)
num_epochs = int(training_args.num_train_epochs)
rng = jax.random.PRNGKey(training_args.seed)
dropout_rngs = jax.random.split(rng, jax.local_device_count())
train_batch_size = training_args.per_device_train_batch_size * jax.local_device_count()
eval_batch_size = training_args.per_device_eval_batch_size * jax.local_device_count()
learning_rate_fn = create_learning_rate_fn(
len(train_dataset),
train_batch_size,
training_args.num_train_epochs,
training_args.warmup_steps,
training_args.learning_rate,
)
state = create_train_state(model, learning_rate_fn, num_labels=num_labels, training_args=training_args)
# define step functions
def train_step(
state: train_state.TrainState, batch: Dict[str, Array], dropout_rng: PRNGKey
) -> Tuple[train_state.TrainState, float]:
"""Trains model with an optimizer (both in `state`) on `batch`, returning a pair `(new_state, loss)`."""
dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
targets = batch.pop("labels")
def loss_fn(params):
logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
loss = state.loss_fn(logits, targets)
return loss
grad_fn = jax.value_and_grad(loss_fn)
loss, grad = grad_fn(state.params)
grad = jax.lax.pmean(grad, "batch")
new_state = state.apply_gradients(grads=grad)
metrics = jax.lax.pmean({"loss": loss, "learning_rate": learning_rate_fn(state.step)}, axis_name="batch")
return new_state, metrics, new_dropout_rng
p_train_step = jax.pmap(train_step, axis_name="batch", donate_argnums=(0,))
def eval_step(state, batch):
logits = state.apply_fn(**batch, params=state.params, train=False)[0]
return state.logits_fn(logits)
p_eval_step = jax.pmap(eval_step, axis_name="batch")
metric = load_metric("seqeval")
def get_labels(y_pred, y_true):
# Transform predictions and references tensos to numpy arrays
# Remove ignored index (special tokens)
true_predictions = [
[label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
for pred, gold_label in zip(y_pred, y_true)
]
true_labels = [
[label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
for pred, gold_label in zip(y_pred, y_true)
]
return true_predictions, true_labels
def compute_metrics():
results = metric.compute()
if data_args.return_entity_level_metrics:
# Unpack nested dictionaries
final_results = {}
for key, value in results.items():
if isinstance(value, dict):
for n, v in value.items():
final_results[f"{key}_{n}"] = v
else:
final_results[key] = value
return final_results
else:
return {
"precision": results["overall_precision"],
"recall": results["overall_recall"],
"f1": results["overall_f1"],
"accuracy": results["overall_accuracy"],
}
logger.info(f"===== Starting training ({num_epochs} epochs) =====")
train_time = 0
# make sure weights are replicated on each device
state = replicate(state)
train_time = 0
step_per_epoch = len(train_dataset) // train_batch_size
total_steps = step_per_epoch * num_epochs
epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
for epoch in epochs:
train_start = time.time()
train_metrics = []
# Create sampling rng
rng, input_rng = jax.random.split(rng)
# train
for step, batch in enumerate(
tqdm(
train_data_collator(input_rng, train_dataset, train_batch_size),
total=step_per_epoch,
desc="Training...",
position=1,
)
):
state, train_metric, dropout_rngs = p_train_step(state, batch, dropout_rngs)
train_metrics.append(train_metric)
cur_step = epoch * step_per_epoch + step
if cur_step % training_args.logging_steps == 0 and cur_step > 0:
# Save metrics
train_metric = unreplicate(train_metric)
train_time += time.time() - train_start
if jax.process_index() == 0:
write_train_metric(summary_writer, train_metrics, train_time, cur_step)
epochs.write(
f"Step... ({cur_step}/{total_steps} | Training Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
)
train_metrics = []
if cur_step % training_args.eval_steps == 0 and cur_step > 0:
eval_metrics = {}
# evaluate
for batch in tqdm(
eval_data_collator(eval_dataset, eval_batch_size),
total=len(eval_dataset) // eval_batch_size,
desc="Evaluating ...",
position=2,
):
labels = batch.pop("labels")
predictions = p_eval_step(state, batch)
predictions = np.array([pred for pred in chain(*predictions)])
labels = np.array([label for label in chain(*labels)])
labels[np.array(chain(*batch["attention_mask"])) == 0] = -100
preds, refs = get_labels(predictions, labels)
metric.add_batch(
predictions=preds,
references=refs,
)
# evaluate also on leftover examples (not divisible by batch_size)
num_leftover_samples = len(eval_dataset) % eval_batch_size
# make sure leftover batch is evaluated on one device
if num_leftover_samples > 0 and jax.process_index() == 0:
# take leftover samples
batch = eval_dataset[-num_leftover_samples:]
batch = {k: np.array(v) for k, v in batch.items()}
labels = batch.pop("labels")
predictions = eval_step(unreplicate(state), batch)
labels = np.array(labels)
labels[np.array(batch["attention_mask"]) == 0] = -100
preds, refs = get_labels(predictions, labels)
metric.add_batch(
predictions=preds,
references=refs,
)
eval_metrics = compute_metrics()
if data_args.return_entity_level_metrics:
logger.info(f"Step... ({cur_step}/{total_steps} | Validation metrics: {eval_metrics}")
else:
logger.info(
f"Step... ({cur_step}/{total_steps} | Validation f1: {eval_metrics['f1']}, Validation Acc: {eval_metrics['accuracy']})"
)
if jax.process_index() == 0:
write_eval_metric(summary_writer, eval_metrics, cur_step)
if (cur_step % training_args.save_steps == 0 and cur_step > 0) or (cur_step == total_steps):
# save checkpoint after each epoch and push checkpoint to the hub
if jax.process_index() == 0:
params = jax.device_get(unreplicate(state.params))
model.save_pretrained(
training_args.output_dir,
params=params,
push_to_hub=training_args.push_to_hub,
commit_message=f"Saving weights and logs of step {cur_step}",
)
epochs.desc = f"Epoch ... {epoch + 1}/{num_epochs}"
if __name__ == "__main__":
main()

View File

@@ -74,17 +74,6 @@ line, 🤗 Trainer supports resuming from a checkpoint via `trainer.train(resume
2. If `resume_from_checkpoint` is a path to a specific checkpoint it will use that saved checkpoint folder to resume the training from.
### Upload the trained/fine-tuned model to the Hub
All the example scripts support automatic upload of your final model to the [Model Hub](https://huggingface.co/models) by adding a `--push_to_hub` argument. It will then create a repository with your username slash the name of the folder you are using as `output_dir`. For instance, `"sgugger/test-mrpc"` if your username is `sgugger` and you are working in the folder `~/tmp/test-mrpc`.
To specify a given repository name, use the `--hub_model_id` argument. You will need to specify the whole repository name (including your username), for instance `--hub_model_id sgugger/finetuned-bert-mrpc`. To upload to an organization you are a member of, just use the name of that organization instead of your username: `--hub_model_id huggingface/finetuned-bert-mrpc`.
A few notes on this integration:
- you will need to be logged in to the Hugging Face website locally for it to work, the easiest way to achieve this is to run `huggingface-cli login` and then type your username and password when prompted. You can also pass along your authentication token with the `--hub_token` argument.
- the `output_dir` you pick will either need to be a new folder or a local clone of the distant repository you are using.
## Distributed training and mixed precision
All the PyTorch scripts mentioned above work out of the box with distributed training and mixed precision, thanks to

View File

@@ -18,5 +18,3 @@ pytest
conllu
sentencepiece != 0.1.92
protobuf
torchvision
jiwer

View File

@@ -1,133 +0,0 @@
<!---
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Image classification examples
The following examples showcase how to fine-tune a `ViT` for image-classification using PyTorch.
## Using datasets from 🤗 `datasets`
Here we show how to fine-tune a `ViT` on the [beans](https://huggingface.co/datasets/beans) dataset.
👀 See the results here: [nateraw/vit-base-beans](https://huggingface.co/nateraw/vit-base-beans).
```bash
python run_image_classification.py \
--dataset_name beans \
--output_dir ./beans_outputs/ \
--remove_unused_columns False \
--do_train \
--do_eval \
--push_to_hub \
--push_to_hub_model_id vit-base-beans \
--learning_rate 2e-5 \
--num_train_epochs 5 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--logging_strategy steps \
--logging_steps 10 \
--evaluation_strategy epoch \
--save_strategy epoch \
--load_best_model_at_end True \
--save_total_limit 3 \
--seed 1337
```
Here we show how to fine-tune a `ViT` on the [cats_vs_dogs](https://huggingface.co/datasets/cats_vs_dogs) dataset.
👀 See the results here: [nateraw/vit-base-cats-vs-dogs](https://huggingface.co/nateraw/vit-base-cats-vs-dogs).
```bash
python run_image_classification.py \
--dataset_name cats_vs_dogs \
--output_dir ./cats_vs_dogs_outputs/ \
--remove_unused_columns False \
--do_train \
--do_eval \
--push_to_hub \
--push_to_hub_model_id vit-base-cats-vs-dogs \
--fp16 True \
--learning_rate 2e-4 \
--num_train_epochs 5 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--logging_strategy steps \
--logging_steps 10 \
--evaluation_strategy epoch \
--save_strategy epoch \
--load_best_model_at_end True \
--save_total_limit 3 \
--seed 1337
```
## Using your own data
To use your own dataset, the training script expects the following directory structure:
```bash
root/dog/xxx.png
root/dog/xxy.png
root/dog/[...]/xxz.png
root/cat/123.png
root/cat/nsdf3.png
root/cat/[...]/asd932_.png
```
Once you've prepared your dataset, you can can run the script like this:
```bash
python run_image_classification.py \
--dataset_name nateraw/image-folder \
--train_dir <path-to-train-root> \
--output_dir ./outputs/ \
--remove_unused_columns False \
--do_train \
--do_eval
```
### 💡 The above will split the train dir into training and evaluation sets
- To control the split amount, use the `--train_val_split` flag.
- To provide your own validation split in its own directory, you can pass the `--validation_dir <path-to-val-root>` flag.
## Sharing your model on 🤗 Hub
0. If you haven't already, [sign up](https://huggingface.co/join) for a 🤗 account
1. Make sure you have `git-lfs` installed and git set up.
```bash
$ apt install git-lfs
$ git config --global user.email "you@example.com"
$ git config --global user.name "Your Name"
```
2. Log in with your HuggingFace account credentials using `huggingface-cli`
```bash
$ huggingface-cli login
# ...follow the prompts
```
3. When running the script, pass the following arguments:
```bash
python run_image_classification.py \
--push_to_hub \
--push_to_hub_model_id <name-your-model> \
...
```

View File

@@ -1,2 +0,0 @@
torch>=1.9.0
torchvision>=0.10.0

View File

@@ -1,360 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
import numpy as np
import torch
from datasets import load_dataset
from PIL import Image
from torchvision.transforms import (
CenterCrop,
Compose,
Normalize,
RandomHorizontalFlip,
RandomResizedCrop,
Resize,
ToTensor,
)
import transformers
from transformers import (
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
AutoConfig,
AutoFeatureExtractor,
AutoModelForImageClassification,
HfArgumentParser,
Trainer,
TrainingArguments,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
""" Fine-tuning a 🤗 Transformers model for image classification"""
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
MODEL_CONFIG_CLASSES = list(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
def pil_loader(path: str):
with open(path, "rb") as f:
im = Image.open(f)
return im.convert("RGB")
@dataclass
class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
Using `HfArgumentParser` we can turn this class
into argparse arguments to be able to specify them on
the command line.
"""
dataset_name: Optional[str] = field(
default="nateraw/image-folder", metadata={"help": "Name of a dataset from the datasets package"}
)
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
train_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the training data."})
validation_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the validation data."})
train_val_split: Optional[float] = field(
default=0.15, metadata={"help": "Percent to split off of train for validation."}
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
"value if set."
},
)
image_size: Optional[int] = field(default=224, metadata={"help": " The size (resolution) of each image."})
def __post_init__(self):
data_files = dict()
if self.train_dir is not None:
data_files["train"] = self.train_dir
if self.validation_dir is not None:
data_files["val"] = self.validation_dir
self.data_files = data_files if data_files else None
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
model_name_or_path: str = field(
default="google/vit-base-patch16-224-in21k",
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
)
model_type: Optional[str] = field(
default=None,
metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
)
model_revision: str = field(
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
use_auth_token: bool = field(
default=False,
metadata={
"help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
"with private models)."
},
)
def collate_fn(examples):
pixel_values = torch.stack([example["pixel_values"] for example in examples])
labels = torch.tensor([example["labels"] for example in examples])
return {"pixel_values": pixel_values, "labels": labels}
def main():
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Initialize our dataset and prepare it for the 'image-classification' task.
ds = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
data_files=data_args.data_files,
cache_dir=model_args.cache_dir,
task="image-classification",
)
# Define torchvision transforms to be applied to each image.
normalize = Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
_train_transforms = Compose(
[
RandomResizedCrop(data_args.image_size),
RandomHorizontalFlip(),
ToTensor(),
normalize,
]
)
_val_transforms = Compose(
[
Resize(data_args.image_size),
CenterCrop(data_args.image_size),
ToTensor(),
normalize,
]
)
def train_transforms(example_batch):
"""Apply _train_transforms across a batch."""
example_batch["pixel_values"] = [_train_transforms(pil_loader(f)) for f in example_batch["image_file_path"]]
return example_batch
def val_transforms(example_batch):
"""Apply _val_transforms across a batch."""
example_batch["pixel_values"] = [_val_transforms(pil_loader(f)) for f in example_batch["image_file_path"]]
return example_batch
# If we don't have a validation split, split off a percentage of train as validation.
data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
split = ds["train"].train_test_split(data_args.train_val_split)
ds["train"] = split["train"]
ds["validation"] = split["test"]
# Prepare label mappings.
# We'll include these in the model's config to get human readable labels in the Inference API.
labels = ds["train"].features["labels"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
label2id[label] = str(i)
id2label[str(i)] = label
# Load the accuracy metric from the datasets package
metric = datasets.load_metric("accuracy")
# Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p):
"""Computes accuracy on a batch of predictions"""
return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)
config = AutoConfig.from_pretrained(
model_args.config_name or model_args.model_name_or_path,
num_labels=len(labels),
label2id=label2id,
id2label=id2label,
finetuning_task="image-classification",
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForImageClassification.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# NOTE - We aren't directly using this feature extractor since we defined custom transforms above.
# We initialize this instance below and pass it to Trainer to ensure that the feature extraction
# config, preprocessor_config.json, is included in output directories.
# This way if we push a model to the hub, the inference widget will work.
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.feature_extractor_name or model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
size=data_args.image_size,
image_mean=normalize.mean,
image_std=normalize.std,
)
if training_args.do_train:
if "train" not in ds:
raise ValueError("--do_train requires a train dataset")
if data_args.max_train_samples is not None:
ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
# Set the training transforms
ds["train"].set_transform(train_transforms)
if training_args.do_eval:
if "validation" not in ds:
raise ValueError("--do_eval requires a validation dataset")
if data_args.max_eval_samples is not None:
ds["validation"] = (
ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
)
# Set the validation transforms
ds["validation"].set_transform(val_transforms)
# Initalize our trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=ds["train"] if training_args.do_train else None,
eval_dataset=ds["validation"] if training_args.do_eval else None,
compute_metrics=compute_metrics,
tokenizer=feature_extractor,
data_collator=collate_fn,
)
# Training
if training_args.do_train:
checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
# Evaluation
if training_args.do_eval:
metrics = trainer.evaluate()
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
# Write model card and (optionally) push to hub
kwargs = {
"finetuned_from": model_args.model_name_or_path,
"tasks": "image-classification",
"dataset": data_args.dataset_name,
"tags": ["image-classification"],
}
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
if __name__ == "__main__":
main()

View File

@@ -174,3 +174,8 @@ python run_clm.py --model_type gpt2 --tokenizer_name gpt2 \ --config_overrides="
```
This feature is only available in `run_clm.py`, `run_plm.py` and `run_mlm.py`.
This feature can also be used to activate gradient checkpointing by passing:
```
--config_overrides "gradient_checkpointing=true,use_cache=False"
```

View File

@@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
@@ -500,19 +500,17 @@ def main():
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
def _mp_fn(index):

View File

@@ -27,7 +27,6 @@ import logging
import math
import os
import random
from pathlib import Path
import datasets
import torch
@@ -37,7 +36,6 @@ from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator, DistributedType
from huggingface_hub import Repository
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
@@ -50,7 +48,6 @@ from transformers import (
get_scheduler,
set_seed,
)
from transformers.file_utils import get_full_repo_name
from transformers.utils.versions import require_version
@@ -179,11 +176,7 @@ def parse_args():
parser.add_argument(
"--no_keep_linebreaks", action="store_true", help="Do not keep line breaks when using TXT files."
)
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
args = parser.parse_args()
# Sanity checks
@@ -197,8 +190,8 @@ def parse_args():
extension = args.validation_file.split(".")[-1]
assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
@@ -230,18 +223,6 @@ def main():
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
@@ -499,22 +480,10 @@ def main():
logger.info(f"epoch {epoch}: perplexity: {perplexity}")
if args.push_to_hub and epoch < args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
if __name__ == "__main__":

View File

@@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
@@ -528,19 +528,17 @@ def main():
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
def _mp_fn(index):

View File

@@ -27,7 +27,6 @@ import logging
import math
import os
import random
from pathlib import Path
import datasets
import torch
@@ -37,7 +36,6 @@ from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator, DistributedType
from huggingface_hub import Repository
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
@@ -50,7 +48,6 @@ from transformers import (
get_scheduler,
set_seed,
)
from transformers.file_utils import get_full_repo_name
from transformers.utils.versions import require_version
@@ -188,11 +185,7 @@ def parse_args():
parser.add_argument(
"--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
)
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
args = parser.parse_args()
# Sanity checks
@@ -206,8 +199,8 @@ def parse_args():
extension = args.validation_file.split(".")[-1]
assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
@@ -239,18 +232,6 @@ def main():
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
@@ -537,22 +518,10 @@ def main():
logger.info(f"epoch {epoch}: perplexity: {perplexity}")
if args.push_to_hub and epoch < args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
if __name__ == "__main__":

View File

@@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
@@ -499,19 +499,17 @@ def main():
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "language-modeling"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "language-modeling"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
def _mp_fn(index):

View File

@@ -47,7 +47,7 @@ from transformers.utils import check_min_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
logger = logging.getLogger(__name__)
@@ -430,19 +430,15 @@ def main():
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
kwargs = dict(
finetuned_from=model_args.model_name_or_path,
tasks="multiple-choice",
dataset_tags="swag",
dataset_args="regular",
dataset="SWAG",
language="en",
)
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
trainer.push_to_hub(
finetuned_from=model_args.model_name_or_path,
tasks="multiple-choice",
dataset_tags="swag",
dataset_args="regular",
dataset="SWAG",
language="en",
)
def _mp_fn(index):

View File

@@ -24,7 +24,6 @@ import math
import os
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union
import datasets
@@ -35,7 +34,6 @@ from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator
from huggingface_hub import Repository
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
@@ -49,7 +47,7 @@ from transformers import (
get_scheduler,
set_seed,
)
from transformers.file_utils import PaddingStrategy, get_full_repo_name
from transformers.file_utils import PaddingStrategy
logger = logging.getLogger(__name__)
@@ -171,15 +169,9 @@ def parse_args():
action="store_true",
help="Activate debug mode and run training only with a subset of data.",
)
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
args = parser.parse_args()
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
@@ -268,18 +260,6 @@ def main():
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
@@ -498,22 +478,10 @@ def main():
eval_metric = metric.compute()
accelerator.print(f"epoch {epoch}: {eval_metric}")
if args.push_to_hub and epoch < args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
if __name__ == "__main__":

View File

@@ -48,7 +48,7 @@ from utils_qa import postprocess_qa_predictions
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
@@ -623,19 +623,17 @@ def main():
trainer.log_metrics("predict", metrics)
trainer.save_metrics("predict", metrics)
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
def _mp_fn(index):

View File

@@ -47,7 +47,7 @@ from utils_qa import postprocess_qa_predictions_with_beam_search
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
@@ -656,19 +656,17 @@ def main():
trainer.log_metrics("predict", metrics)
trainer.save_metrics("predict", metrics)
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
def _mp_fn(index):

View File

@@ -23,7 +23,6 @@ import logging
import math
import os
import random
from pathlib import Path
import datasets
import numpy as np
@@ -34,7 +33,6 @@ from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator
from huggingface_hub import Repository
from transformers import (
AdamW,
DataCollatorWithPadding,
@@ -47,14 +45,13 @@ from transformers import (
get_scheduler,
set_seed,
)
from transformers.file_utils import get_full_repo_name
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from utils_qa import postprocess_qa_predictions_with_beam_search
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
@@ -206,11 +203,7 @@ def parse_args():
default=None,
help="For debugging purposes or quicker training, truncate the number of prediction examples to this",
)
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
args = parser.parse_args()
# Sanity checks
@@ -232,8 +225,8 @@ def parse_args():
extension = args.test_file.split(".")[-1]
assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
@@ -265,18 +258,6 @@ def main():
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
@@ -722,15 +703,8 @@ def main():
if completed_steps >= args.max_train_steps:
break
if args.push_to_hub and epoch < args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
# intialize all lists to collect the batches
# intialize all lists to collect the batches
all_start_top_log_probs = []
all_start_top_index = []
all_end_top_log_probs = []
@@ -847,10 +821,6 @@ def main():
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
if __name__ == "__main__":

View File

@@ -23,7 +23,6 @@ import logging
import math
import os
import random
from pathlib import Path
import datasets
import numpy as np
@@ -34,7 +33,6 @@ from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator
from huggingface_hub import Repository
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
@@ -49,14 +47,13 @@ from transformers import (
get_scheduler,
set_seed,
)
from transformers.file_utils import get_full_repo_name
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
from utils_qa import postprocess_qa_predictions
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
@@ -235,11 +232,7 @@ def parse_args():
help="Model type to use if training from scratch.",
choices=MODEL_TYPES,
)
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
args = parser.parse_args()
# Sanity checks
@@ -261,8 +254,8 @@ def parse_args():
extension = args.test_file.split(".")[-1]
assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
@@ -294,18 +287,6 @@ def main():
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
@@ -727,14 +708,6 @@ def main():
if completed_steps >= args.max_train_steps:
break
if args.push_to_hub and epoch < args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
# Evaluation
logger.info("***** Running Evaluation *****")
logger.info(f" Num examples = {len(eval_dataset)}")
@@ -809,10 +782,6 @@ def main():
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
if __name__ == "__main__":

View File

@@ -1,129 +0,0 @@
<!---
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Automatic Speech Recognition examples
## Connectionist Temporal Classification without Language Model (CTC w/o LM)
The script [`run_speech_recognition_ctc.py`](https://github.com/huggingface/transformers/blob/master/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py) can be used to fine-tune any pretrained [Connectionist Temporal Classification Model](https://huggingface.co/transformers/master/model_doc/auto.html?highlight=automodelforctc#automodelforctc) for automatic speech
recognition on one of the [official speech recognition datasets](https://huggingface.co/datasets?task_ids=task_ids:automatic-speech-recognition) or a custom dataset.
Speech recognition models that have been pretrained in unsupervised fashion on audio data alone, *e.g.* [Wav2Vec2](https://huggingface.co/transformers/master/model_doc/wav2vec2.html), [HuBERT](https://huggingface.co/transformers/master/model_doc/hubert.html), [XLSR-Wav2Vec2](https://huggingface.co/transformers/master/model_doc/xlsr_wav2vec2.html), have shown to require only
very little annotated data to yield good performance on automatic speech recognition datasets.
In the script [`run_speech_recognition_ctc`], we first create a vocabulary from all unique characters of both the training data and evaluation data. Then, we preprocesses the speech recognition dataset, which includes correct resampling, normalization and padding. Finally, the pretrained speech recognition model is fine-tuned on the annotated speech recognition datasets using CTC loss.
---
**NOTE**
If you encounter problems with data preprocessing by setting `--preprocessing_num_workers` > 1,
you might want to set the environment variable `OMP_NUM_THREADS` to 1 as follows:
```bash
OMP_NUM_THREADS=1 python run_speech_recognition_ctc ...
```
If the environment variable is not set, the training script might freeze, *i.e.* see: https://github.com/pytorch/audio/issues/1021#issuecomment-726915239
---
### Single-GPU
The following command shows how to fine-tune [XLSR-Wav2Vec2](https://huggingface.co/transformers/master/model_doc/xlsr_wav2vec2.html) on [Common Voice](https://huggingface.co/datasets/common_voice) using a single GPU in half-precision.
```bash
python run_speech_recognition_ctc.py \
--dataset_name="common_voice" \
--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
--dataset_config_name="tr" \
--output_dir="./wav2vec2-common_voice-tr-demo" \
--overwrite_output_dir \
--num_train_epochs="15" \
--per_device_train_batch_size="16" \
--gradient_accumulation_steps="2" \
--learning_rate="3e-4" \
--warmup_steps="500" \
--evaluation_strategy="steps" \
--audio_column_name="path" \
--text_column_name="sentence" \
--save_steps="400" \
--eval_steps="100" \
--layerdrop="0.0" \
--save_total_limit="3" \
--freeze_feature_extractor \
--gradient_checkpointing \
--chars_to_ignore , ? . ! - \; \: \" “ % <20> \
--fp16 \
--group_by_length \
--push_to_hub \
--do_train --do_eval
```
On a single V100 GPU, this script should run in *ca.* 1 hour 20 minutes and yield a CTC loss of **0.39** and word error rate
of **0.35**.
### Multi-GPU
The following command shows how to fine-tune [XLSR-Wav2Vec2](https://huggingface.co/transformers/master/model_doc/xlsr_wav2vec2.html) on [Common Voice](https://huggingface.co/datasets/common_voice) using 8 GPUs in half-precision.
```bash
python -m torch.distributed.launch \
--nproc_per_node 8 run_speech_recognition_ctc.py \
--dataset_name="common_voice" \
--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
--dataset_config_name="tr" \
--output_dir="./wav2vec2-common_voice-tr-demo-dist" \
--preprocessing_num_workers="16" \
--overwrite_output_dir \
--num_train_epochs="15" \
--per_device_train_batch_size="4" \
--learning_rate="3e-4" \
--warmup_steps="500" \
--evaluation_strategy="steps" \
--audio_column_name="path" \
--text_column_name="sentence" \
--save_steps="400" \
--eval_steps="100" \
--logging_steps="1" \
--layerdrop="0.0" \
--save_total_limit="3" \
--freeze_feature_extractor \
--gradient_checkpointing \
--chars_to_ignore , ? . ! - \; \: \" “ % <20> \
--fp16 \
--group_by_length \
--push_to_hub \
--do_train --do_eval
```
On 8 V100 GPUs, this script should run in *ca.* 18 minutes and yield a CTC loss of **0.39** and word error rate
of **0.36**.
### Examples
In the following a couple of demonstration fine-tuning runs are listed.
It has been verified that the script works for the following datasets:
- [Common Voice](https://huggingface.co/datasets/common_voice)
- [Librispeech](https://huggingface.co/datasets/librispeech_asr)
| Dataset | Dataset Config | Pretrained Model | Word error rate on eval | GPU setup | Training time | Fine-tuned Model & Logs |
|-------|------------------------------|-------------|---------------|---------------|----------------------|-------------|
| [Librispeech](https://huggingface.co/datasets/librispeech_asr)| `"clean"` - `"train.100"` | [facebook/wav2vec2-large-lv60](https://huggingface.co/facebook/wav2vec2-large-lv60) | 0.042 | 8 GPU V100 | 1h30min | [here](https://huggingface.co/patrickvonplaten/wav2vec2-librispeech-clean-100h-demo-dist) |
| [Librispeech](https://huggingface.co/datasets/librispeech_asr)| `"clean"` - `"train.100"` | [facebook/hubert-large-ll60k](https://huggingface.co/facebook/hubert-large-ll60k) | 0.088 | 8 GPU V100 | 1h30min | [here](https://huggingface.co/patrickvonplaten/hubert-librispeech-clean-100h-demo-dist) |
| [Common Voice](https://huggingface.co/datasets/common_voice)| `"tr"` | [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) | 0.36 | 8 GPU V100 | 18min | [here](https://huggingface.co/patrickvonplaten/wav2vec2-common_voice-tr-demo-dist) |
| [Common Voice](https://huggingface.co/datasets/common_voice)| `"tr"` | [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) | 0.35 | 1 GPU V100 | 1h20min | [here](https://huggingface.co/patrickvonplaten/wav2vec2-common_voice-tr-demo) |

View File

@@ -1,3 +0,0 @@
datasets >= 1.12.0
torch >= 1.5
torchaudio

View File

@@ -1,613 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
""" Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition"""
import functools
import json
import logging
import os
import re
import sys
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Union
import numpy as np
import torch
import torchaudio
from datasets import DatasetDict, load_dataset, load_metric
import transformers
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoModelForCTC,
AutoTokenizer,
HfArgumentParser,
Trainer,
TrainingArguments,
Wav2Vec2Processor,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
# TODO(Patrick) Bump up as soon as audio features are merged
require_version("datasets>=1.12.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
logger = logging.getLogger(__name__)
def list_field(default=None, metadata=None):
return field(default_factory=lambda: default, metadata=metadata)
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
model_name_or_path: str = field(
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
cache_dir: Optional[str] = field(
default=None,
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
)
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
)
attention_dropout: Optional[float] = field(
default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
)
activation_dropout: Optional[float] = field(
default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
)
feat_proj_dropout: Optional[float] = field(
default=0.0, metadata={"help": "The dropout ratio for the projected features."}
)
hidden_dropout: Optional[float] = field(
default=0.0,
metadata={
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
},
)
final_dropout: Optional[float] = field(
default=0.0,
metadata={"help": "The dropout probability for the final projection layer."},
)
mask_time_prob: Optional[float] = field(
default=0.05,
metadata={
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
"vectors will be masked along the time axis. This is only relevant if ``apply_spec_augment is True``."
},
)
layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."})
ctc_loss_reduction: Optional[str] = field(
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
)
@dataclass
class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
Using `HfArgumentParser` we can turn this class
into argparse arguments to be able to specify them on
the command line.
"""
dataset_name: str = field(
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
train_split_name: Optional[str] = field(
default="train+validation",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
eval_split_name: Optional[str] = field(
default="test",
metadata={
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
},
)
audio_column_name: Optional[str] = field(
default="audio",
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
)
text_column_name: Optional[str] = field(
default="text",
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
)
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
)
preprocessing_num_workers: Optional[int] = field(
default=None,
metadata={"help": "The number of processes to use for the preprocessing."},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
},
)
chars_to_ignore: Optional[List[str]] = list_field(
default=None,
metadata={"help": "A list of characters to remove from the transcripts."},
)
max_duration_in_seconds: Optional[float] = field(
default=20.0,
metadata={
"help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
},
)
min_duration_in_seconds: Optional[float] = field(
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
)
only_data_preprocessing: Optional[bool] = field(
default=False,
metadata={
"help": "Whether to only do data preprocessing and skip training. "
"This is especially useful when data preprocessing errors out in distributed training due to timeout. "
"In this case, one should run the preprocessing in a non-distributed setup with `only_data_preprocessing=True` "
"so that the cached datasets can consequently be loaded in distributed training"
},
)
@dataclass
class DataCollatorCTCWithPadding:
"""
Data collator that will dynamically pad the inputs received.
Args:
processor (:class:`~transformers.Wav2Vec2Processor`)
The processor used for proccessing the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
max_length_labels (:obj:`int`, `optional`):
Maximum length of the ``labels`` returned list and optionally padding length (see above).
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
"""
processor: Wav2Vec2Processor
padding: Union[bool, str] = "longest"
pad_to_multiple_of: Optional[int] = None
pad_to_multiple_of_labels: Optional[int] = None
def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
# split inputs and labels since they have to be of different lenghts and need
# different padding methods
input_features = [{"input_values": feature["input_values"]} for feature in features]
label_features = [{"input_ids": feature["labels"]} for feature in features]
batch = self.processor.pad(
input_features,
padding=self.padding,
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors="pt",
)
with self.processor.as_target_processor():
labels_batch = self.processor.pad(
label_features,
padding=self.padding,
pad_to_multiple_of=self.pad_to_multiple_of_labels,
return_tensors="pt",
)
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
batch["labels"] = labels
return batch
def create_vocabulary_from_data(datasets: DatasetDict):
# Given training and test labels create vocabulary
def extract_all_chars(batch):
all_text = " ".join(batch["target_text"])
vocab = list(set(all_text))
return {"vocab": [vocab], "all_text": [all_text]}
vocabs = datasets.map(
extract_all_chars,
batched=True,
batch_size=-1,
keep_in_memory=True,
remove_columns=datasets["train"].column_names,
)
# take union of all unique characters in each dataset
vocab_set = functools.reduce(
lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]), vocabs.values()
)
vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}
# replace white space with delimiter token
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
# add unk and pad token
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
return vocab_dict
def main():
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
# Set the verbosity to info of the Transformers logger (on main process only):
if is_main_process(training_args.local_rank):
transformers.utils.logging.set_verbosity_info()
logger.info("Training/evaluation parameters %s", training_args)
# Set seed before initializing model.
set_seed(training_args.seed)
# 1. First, let's load the dataset
raw_datasets = DatasetDict()
raw_datasets["train"] = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
)
raw_datasets["eval"] = load_dataset(
data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name
)
if data_args.audio_column_name not in raw_datasets["train"].column_names:
raise ValueError(
f"--audio_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. "
"Make sure to set `--audio_column_name` to the correct audio column - one of "
f"{', '.join(raw_datasets['train'].column_names)}."
)
if data_args.text_column_name not in raw_datasets["train"].column_names:
raise ValueError(
f"--text_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. "
"Make sure to set `--text_column_name` to the correct text column - one of "
f"{', '.join(raw_datasets['train'].column_names)}."
)
# prepare dataset
if data_args.max_train_samples is not None:
raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
if data_args.max_eval_samples is not None:
raw_datasets["eval"] = raw_datasets["eval"].select(range(data_args.max_eval_samples))
# 2. We remove some special characters from the datasets
# that make training complicated and do not help in transcribing the speech
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
# that could be easily picked up by the model
chars_to_ignore_regex = (
f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None
)
def remove_special_characters(batch):
if chars_to_ignore_regex is not None:
batch["target_text"] = re.sub(chars_to_ignore_regex, "", batch[data_args.text_column_name]).lower() + " "
else:
batch["target_text"] = batch[data_args.text_column_name].lower() + " "
return batch
with training_args.main_process_first(desc="dataset map special characters removal"):
raw_datasets = raw_datasets.map(
remove_special_characters,
remove_columns=[data_args.text_column_name],
desc="remove special characters from datasets",
)
# 3. Next, we create the vocabulary of the model by extracting all unique characters from
# the training and evaluation datasets
# We need to make sure that only first rank saves vocabulary
# make sure all processes wait until vocab is created
with training_args.main_process_first(desc="dataset map vocabulary creation"):
vocab_dict = create_vocabulary_from_data(raw_datasets)
vocab_file = os.path.join(training_args.output_dir, "vocab.json")
# save vocab dict to be loaded into tokenizer
os.makedirs(training_args.output_dir, exist_ok=True)
if training_args.overwrite_output_dir and os.path.isfile(vocab_file):
os.remove(vocab_file)
if not os.path.isfile(vocab_file):
with open(vocab_file, "w") as vocab_file:
json.dump(vocab_dict, vocab_file)
# 4. Now we can instantiate the configuration, feature extractor, tokenizer and model
# Note for distributed training, the .from_pretrained methods guarantee that only
# one local process can concurrently download model & vocab.
# load config
config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
# load feature_extractor, tokenizer and create processor
tokenizer = AutoTokenizer.from_pretrained(
training_args.output_dir,
tokenizer_type=config.model_type,
unk_token="[UNK]",
pad_token="[PAD]",
word_delimiter_token="|",
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir
)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# adapt config
config.update(
{
"feat_proj_dropout": model_args.feat_proj_dropout,
"attention_dropout": model_args.attention_dropout,
"hidden_dropout": model_args.hidden_dropout,
"final_dropout": model_args.final_dropout,
"mask_time_prob": model_args.mask_time_prob,
"gradient_checkpointing": training_args.gradient_checkpointing,
"layerdrop": model_args.layerdrop,
"ctc_loss_reduction": model_args.ctc_loss_reduction,
"pad_token_id": processor.tokenizer.pad_token_id,
"vocab_size": len(processor.tokenizer),
"activation_dropout": model_args.activation_dropout,
}
)
# create model
model = AutoModelForCTC.from_pretrained(
model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config
)
# freeze encoder
if model_args.freeze_feature_extractor:
model.freeze_feature_extractor()
# 5. Now we preprocess the datasets which includes loading the audio, resampling and padding
# The following code should be cleaned up as soon as
# https://github.com/huggingface/datasets/pull/2324 is merged
# Preprocessing the datasets.
# We need to read the audio files as arrays and tokenize the targets.
# derive max & min input length for sample rate & max duration
max_input_length = data_args.max_duration_in_seconds * processor.feature_extractor.sampling_rate
min_input_length = data_args.min_duration_in_seconds * processor.feature_extractor.sampling_rate
resampler = None
if raw_datasets["train"][data_args.audio_column_name][0].split(".")[-1] == "mp3":
# TODO(PVP) - remove hard-coded 48_000 after audio feature is merged
resampler = torchaudio.transforms.Resample(48_000, processor.feature_extractor.sampling_rate)
# Preprocessing the datasets.
# We need to read the audio files as arrays and tokenize the targets.
def prepare_dataset(batch):
# load audio
speech_array, sampling_rate = torchaudio.load(batch[data_args.audio_column_name])
speech_array = speech_array.squeeze()
# if necessary resample audio
if resampler is not None:
# TODO(PVP) - remove hard-coded 48_000 after audio feature is merged
speech_array = resampler(speech_array)
sampling_rate = resampler.new_freq
speech_array = speech_array.numpy()
batch["input_values"] = processor(
speech_array, sampling_rate=sampling_rate, truncate=True, max_length=max_input_length
).input_values[0]
# Setup the processor for targets
with processor.as_target_processor():
batch["labels"] = processor(batch["target_text"]).input_ids
return batch
with training_args.main_process_first(desc="dataset map preprocessing"):
vectorized_datasets = raw_datasets.map(
prepare_dataset,
remove_columns=raw_datasets["train"].column_names,
num_proc=data_args.preprocessing_num_workers,
desc="preprocess datasets",
)
if min_input_length > 0.0:
# filter data that is shorter than min_input_length
vectorized_datasets = vectorized_datasets.filter(
lambda data: len(data["input_values"]) > min_input_length,
num_proc=data_args.preprocessing_num_workers,
)
# 6. Next, we can prepare the training.
# Let's use word error rate (WER) as our evaluation metric,
# instantiate a data collator and the trainer
# Define Metric during training
wer_metric = load_metric("wer")
if data_args.only_data_preprocessing:
logger.info("Data preprocessing finished.")
return
def compute_metrics(pred):
pred_logits = pred.predictions
pred_ids = np.argmax(pred_logits, axis=-1)
pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
pred_str = processor.batch_decode(pred_ids)
# we do not want to group tokens when computing the metrics
label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
wer = wer_metric.compute(predictions=pred_str, references=label_str)
return {"wer": wer}
# Instantiate custom data collator
data_collator = DataCollatorCTCWithPadding(processor=processor)
# Initialize Trainer
trainer = Trainer(
model=model,
data_collator=data_collator,
args=training_args,
compute_metrics=compute_metrics,
train_dataset=vectorized_datasets["train"] if training_args.do_train else None,
eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None,
tokenizer=processor.feature_extractor,
)
# 7. Finally, we can start training
# Training
if training_args.do_train:
# use last checkpoint if exist
if last_checkpoint is not None:
checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
# Save the feature_extractor and the tokenizer
if is_main_process(training_args.local_rank):
processor.save_pretrained(training_args.output_dir)
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()
metrics = train_result.metrics
max_train_samples = (
data_args.max_train_samples
if data_args.max_train_samples is not None
else len(vectorized_datasets["train"])
)
metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"]))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
# Evaluation
results = {}
if training_args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate()
max_eval_samples = (
data_args.max_eval_samples if data_args.max_eval_samples is not None else len(vectorized_datasets["eval"])
)
metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"]))
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
# Write model card and (optionally) push to hub
kwargs = {
"finetuned_from": model_args.model_name_or_path,
"tasks": "speech-recognition",
"tags": ["automatic-speech-recognition", data_args.dataset_name],
"dataset_args": f"Config: {data_args.dataset_config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}",
"dataset": f"{data_args.dataset_name.upper()} - {data_args.dataset_config_name.upper()}",
}
if "common_voice" in data_args.dataset_name:
kwargs["language"] = data_args.dataset_config_name
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
return results
if __name__ == "__main__":
main()

View File

@@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
@@ -99,13 +99,6 @@ class ModelArguments:
"with private models)."
},
)
resize_position_embeddings: Optional[bool] = field(
default=None,
metadata={
"help": "Whether to automatically resize the position embeddings if `max_source_length` exceeds "
"the model's position embeddings."
},
)
@dataclass
@@ -373,25 +366,6 @@ def main():
if model.config.decoder_start_token_id is None:
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
if (
hasattr(model.config, "max_position_embeddings")
and model.config.max_position_embeddings < data_args.max_source_length
):
if model_args.resize_position_embeddings is None:
logger.warning(
f"Increasing the model's number of position embedding vectors from {model.config.max_position_embeddings} "
f"to {data_args.max_source_length}."
)
model.resize_position_embeddings(data_args.max_source_length)
elif model_args.resize_position_embeddings:
model.resize_position_embeddings(data_args.max_source_length)
else:
raise ValueError(
f"`--max_source_length` is set to {data_args.max_source_length}, but the model only has {model.config.max_position_embeddings}"
f" position encodings. Consider either reducing `--max_source_length` to {model.config.max_position_embeddings} or to automatically "
"resize the model's position encodings by passing `--resize_position_embeddings`."
)
prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
# Preprocessing the datasets.
@@ -622,19 +596,17 @@ def main():
with open(output_prediction_file, "w") as writer:
writer.write("\n".join(predictions))
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "summarization"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
return results

View File

@@ -23,7 +23,6 @@ import logging
import math
import os
import random
from pathlib import Path
import datasets
import nltk
@@ -36,7 +35,6 @@ from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator
from filelock import FileLock
from huggingface_hub import Repository
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
@@ -49,7 +47,7 @@ from transformers import (
get_scheduler,
set_seed,
)
from transformers.file_utils import get_full_repo_name, is_offline_mode
from transformers.file_utils import is_offline_mode
from transformers.utils.versions import require_version
@@ -257,11 +255,7 @@ def parse_args():
help="Model type to use if training from scratch.",
choices=MODEL_TYPES,
)
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
args = parser.parse_args()
# Sanity checks
@@ -275,8 +269,8 @@ def parse_args():
extension = args.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
@@ -319,18 +313,6 @@ def main():
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
@@ -594,22 +576,10 @@ def main():
logger.info(result)
if args.push_to_hub and epoch < args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
if __name__ == "__main__":

View File

@@ -38,8 +38,6 @@ SRC_DIRS = [
"question-answering",
"summarization",
"translation",
"image-classification",
"speech-recognition",
]
]
sys.path.extend(SRC_DIRS)
@@ -49,11 +47,9 @@ if SRC_DIRS is not None:
import run_clm
import run_generation
import run_glue
import run_image_classification
import run_mlm
import run_ner
import run_qa as run_squad
import run_speech_recognition_ctc
import run_summarization
import run_swag
import run_translation
@@ -344,69 +340,3 @@ class ExamplesTests(TestCasePlus):
run_translation.main()
result = get_results(tmp_dir)
self.assertGreaterEqual(result["eval_bleu"], 30)
def test_run_image_classification(self):
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_image_classification.py
--output_dir {tmp_dir}
--model_name_or_path google/vit-base-patch16-224-in21k
--dataset_name hf-internal-testing/cats_vs_dogs_sample
--do_train
--do_eval
--learning_rate 1e-4
--per_device_train_batch_size 2
--per_device_eval_batch_size 1
--remove_unused_columns False
--overwrite_output_dir True
--dataloader_num_workers 16
--metric_for_best_model accuracy
--max_steps 10
--train_val_split 0.1
--seed 42
""".split()
if is_cuda_and_apex_available():
testargs.append("--fp16")
with patch.object(sys, "argv", testargs):
run_image_classification.main()
result = get_results(tmp_dir)
self.assertGreaterEqual(result["eval_accuracy"], 0.8)
def test_run_speech_recognition_ctc(self):
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_speech_recognition_ctc.py
--output_dir {tmp_dir}
--model_name_or_path hf-internal-testing/tiny-random-wav2vec2
--dataset_name patrickvonplaten/librispeech_asr_dummy
--dataset_config_name clean
--train_split_name validation
--eval_split_name validation
--audio_column_name file
--do_train
--do_eval
--learning_rate 1e-4
--per_device_train_batch_size 2
--per_device_eval_batch_size 1
--remove_unused_columns False
--overwrite_output_dir True
--preprocessing_num_workers 16
--max_steps 10
--seed 42
""".split()
if is_cuda_and_apex_available():
testargs.append("--fp16")
with patch.object(sys, "argv", testargs):
run_speech_recognition_ctc.main()
result = get_results(tmp_dir)
self.assertLess(result["eval_loss"], result["train_loss"])

View File

@@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
@@ -546,17 +546,15 @@ def main():
item = label_list[item]
writer.write(f"{index}\t{item}\n")
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
if data_args.task_name is not None:
kwargs["language"] = "en"
kwargs["dataset_tags"] = "glue"
kwargs["dataset_args"] = data_args.task_name
kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}"
if training_args.push_to_hub:
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
if data_args.task_name is not None:
kwargs["language"] = "en"
kwargs["dataset_tags"] = "glue"
kwargs["dataset_args"] = data_args.task_name
kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}"
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
def _mp_fn(index):

View File

@@ -18,7 +18,6 @@ import logging
import math
import os
import random
from pathlib import Path
import datasets
from datasets import load_dataset, load_metric
@@ -27,7 +26,6 @@ from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator
from huggingface_hub import Repository
from transformers import (
AdamW,
AutoConfig,
@@ -40,7 +38,6 @@ from transformers import (
get_scheduler,
set_seed,
)
from transformers.file_utils import get_full_repo_name
from transformers.utils.versions import require_version
@@ -145,11 +142,6 @@ def parse_args():
)
parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
args = parser.parse_args()
# Sanity checks
@@ -163,8 +155,8 @@ def parse_args():
extension = args.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
@@ -196,18 +188,6 @@ def main():
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
# or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
@@ -446,22 +426,10 @@ def main():
eval_metric = metric.compute()
logger.info(f"epoch {epoch}: {eval_metric}")
if args.push_to_hub and epoch < args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
if args.task_name == "mnli":
# Final evaluation on mismatched validation set

View File

@@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
@@ -542,19 +542,17 @@ def main():
for prediction in true_predictions:
writer.write(" ".join(prediction) + "\n")
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
if training_args.push_to_hub:
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
def _mp_fn(index):

View File

@@ -23,7 +23,6 @@ import logging
import math
import os
import random
from pathlib import Path
import datasets
import torch
@@ -33,7 +32,6 @@ from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator
from huggingface_hub import Repository
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
@@ -47,7 +45,6 @@ from transformers import (
get_scheduler,
set_seed,
)
from transformers.file_utils import get_full_repo_name
from transformers.utils.versions import require_version
@@ -198,11 +195,6 @@ def parse_args():
action="store_true",
help="Activate debug mode and run training only with a subset of data.",
)
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
args = parser.parse_args()
# Sanity checks
@@ -216,8 +208,8 @@ def parse_args():
extension = args.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
@@ -249,18 +241,6 @@ def main():
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
@@ -572,22 +552,10 @@ def main():
eval_metric = compute_metrics()
accelerator.print(f"epoch {epoch}:", eval_metric)
if args.push_to_hub and epoch < args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
if __name__ == "__main__":

View File

@@ -42,7 +42,7 @@ and you also will find examples of these below.
Here is an example of a translation fine-tuning with a MarianMT model:
```bash
python examples/pytorch/translation/run_translation.py \
python examples/pytorch/seq2seq/run_translation.py \
--model_name_or_path Helsinki-NLP/opus-mt-en-ro \
--do_train \
--do_eval \
@@ -62,7 +62,7 @@ MBart and some T5 models require special handling.
T5 models `t5-small`, `t5-base`, `t5-large`, `t5-3b` and `t5-11b` must use an additional argument: `--source_prefix "translate {source_lang} to {target_lang}"`. For example:
```bash
python examples/pytorch/translation/run_translation.py \
python examples/pytorch/seq2seq/run_translation.py \
--model_name_or_path t5-small \
--do_train \
--do_eval \
@@ -85,7 +85,7 @@ For the aforementioned group of T5 models it's important to remember that if you
MBart models require a different format for `--source_lang` and `--target_lang` values, e.g. instead of `en` it expects `en_XX`, for `ro` it expects `ro_RO`. The full MBart specification for language codes can be found [here](https://huggingface.co/facebook/mbart-large-cc25). For example:
```bash
python examples/pytorch/translation/run_translation.py \
python examples/pytorch/seq2seq/run_translation.py \
--model_name_or_path facebook/mbart-large-en-ro \
--do_train \
--do_eval \
@@ -104,7 +104,7 @@ And here is how you would use the translation finetuning on your own files, afte
values for the arguments `--train_file`, `--validation_file` to match your setup:
```bash
python examples/pytorch/translation/run_translation.py \
python examples/pytorch/seq2seq/run_translation.py \
--model_name_or_path t5-small \
--do_train \
--do_eval \
@@ -133,7 +133,7 @@ Here the languages are Romanian (`ro`) and English (`en`).
If you want to use a pre-processed dataset that leads to high BLEU scores, but for the `en-de` language pair, you can use `--dataset_name stas/wmt14-en-de-pre-processed`, as following:
```bash
python examples/pytorch/translation/run_translation.py \
python examples/pytorch/seq2seq/run_translation.py \
--model_name_or_path t5-small \
--do_train \
--do_eval \

View File

@@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
@@ -590,23 +590,21 @@ def main():
with open(output_prediction_file, "w", encoding="utf-8") as writer:
writer.write("\n".join(predictions))
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "translation"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
languages = [l for l in [data_args.source_lang, data_args.target_lang] if l is not None]
if len(languages) > 0:
kwargs["language"] = languages
if training_args.push_to_hub:
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "translation"}
if data_args.dataset_name is not None:
kwargs["dataset_tags"] = data_args.dataset_name
if data_args.dataset_config_name is not None:
kwargs["dataset_args"] = data_args.dataset_config_name
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
else:
kwargs["dataset"] = data_args.dataset_name
languages = [l for l in [data_args.source_lang, data_args.target_lang] if l is not None]
if len(languages) > 0:
kwargs["language"] = languages
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
return results

View File

@@ -23,7 +23,6 @@ import logging
import math
import os
import random
from pathlib import Path
import datasets
import numpy as np
@@ -34,7 +33,6 @@ from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator
from huggingface_hub import Repository
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
@@ -50,7 +48,6 @@ from transformers import (
get_scheduler,
set_seed,
)
from transformers.file_utils import get_full_repo_name
from transformers.utils.versions import require_version
@@ -238,11 +235,7 @@ def parse_args():
help="Model type to use if training from scratch.",
choices=MODEL_TYPES,
)
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
parser.add_argument(
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
)
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
args = parser.parse_args()
# Sanity checks
@@ -257,9 +250,8 @@ def parse_args():
extension = args.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
if args.push_to_hub:
assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
@@ -292,18 +284,6 @@ def main():
if args.seed is not None:
set_seed(args.seed)
# Handle the repository creation
if accelerator.is_main_process:
if args.push_to_hub:
if args.hub_model_id is None:
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
else:
repo_name = args.hub_model_id
repo = Repository(args.output_dir, clone_from=repo_name)
elif args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
@@ -573,22 +553,10 @@ def main():
eval_metric = metric.compute()
logger.info({"bleu": eval_metric["score"]})
if args.push_to_hub and epoch < args.num_train_epochs - 1:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
if __name__ == "__main__":

View File

@@ -1,83 +0,0 @@
# FSNER
Implemented by [sayef](https://huggingface.co/sayef).
## Overview
The FSNER model was proposed in [Example-Based Named Entity Recognition](https://arxiv.org/abs/2008.10570) by Morteza Ziyadi, Yuting Sun, Abhishek Goswami, Jade Huang, Weizhu Chen. To identify entity spans in a new domain, it uses a train-free few-shot learning approach inspired by question-answering.
## Abstract
----
> We present a novel approach to named entity recognition (NER) in the presence of scarce data that we call example-based NER. Our train-free few-shot learning approach takes inspiration from question-answering to identify entity spans in a new and unseen domain. In comparison with the current state-of-the-art, the proposed method performs significantly better, especially when using a low number of support examples.
## Model Training Details
-----
| identifier | epochs | datasets |
| ---------- |:----------:| :-----:|
| [sayef/fsner-bert-base-uncased](https://huggingface.co/sayef/fsner-bert-base-uncased) | 10 | ontonotes5, conll2003, wnut2017, and fin (Alvarado et al.). |
## Installation and Example Usage
------
You can use the FSNER model in two ways:
1. Install as a package: `python setup.py install` and import the model as shown in the code example below
or
2. Change directory to `src` and import the model as shown in the code example below
```python
from fsner import FSNERModel, FSNERTokenizerUtils
model = FSNERModel("sayef/fsner-bert-base-uncased")
tokenizer = FSNERTokenizerUtils("sayef/fsner-bert-base-uncased")
# size of query and supports must be the same. If you want to find all the entitites in one particular query, just repeat the same query n times where n is equal to the number of supports (or entities).
query = [
'KWE 4000 can reach with a maximum speed from up to 450 P/min an accuracy from 50 mg',
'I would like to order a computer from eBay.',
]
# each list in supports are the examples of one entity type
# wrap entities around with [E] and [/E] in the examples
supports = [
[
'Horizontal flow wrapper [E] Pack 403 [/E] features the new retrofit-kit „paper-ON-form“',
'[E] Paloma Pick-and-Place-Roboter [/E] arranges the bakery products for the downstream tray-forming equipment',
'Finally, the new [E] Kliklok ACE [/E] carton former forms cartons and trays without the use of glue',
'We set up our pilot plant with the right [E] FibreForm® [/E] configuration to make prototypes for your marketing tests and package validation',
'The [E] CAR-T5 [/E] is a reliable, purely mechanically driven cartoning machine for versatile application fields'
],
[
"[E] Walmart [/E] is a leading e-commerce company",
"I recently ordered a book from [E] Amazon [/E]",
"I ordered this from [E] ShopClues [/E]",
"Fridge can be ordered in [E] Amazon [/E]",
"[E] Flipkart [/E] started it's journey from zero"
]
]
device = 'cpu'
W_query = tokenizer.tokenize(query).to(device)
W_supports = tokenizer.tokenize([s for support in supports for s in support]).to(device)
start_prob, end_prob = model(W_query, W_supports)
output = tokenizer.extract_entity_from_scores(query, W_query, start_prob, end_prob, thresh=0.50)
print(output)
```

View File

@@ -1,7 +0,0 @@
[build-system]
requires = [
"setuptools>=57.4.0",
"wheel>=0.37.0",
"transformers>=4.9.2"
]
build-backend = "setuptools.build_meta"

View File

@@ -1 +0,0 @@
transformers>=4.9.2

View File

@@ -1,27 +0,0 @@
import setuptools
with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
setuptools.setup(
name="fsner",
version="0.0.1",
author="msi sayef",
author_email="msi.sayef@gmail.com",
description="Few-shot Named Entity Recognition",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/huggingface/transformers/tree/master/examples/research_projects/fsner",
project_urls={
"Bug Tracker": "https://github.com/huggingface/transformers/issues",
},
classifiers=[
"Programming Language :: Python :: 3",
"Operating System :: OS Independent",
],
package_dir={"": "src"},
packages=setuptools.find_packages(where="src"),
python_requires=">=3.6",
install_requires=["torch>=1.9.0", "transformers>=4.9.2"],
)

View File

@@ -1,5 +0,0 @@
from .model import FSNERModel
from .tokenizer_utils import FSNERTokenizerUtils
__all__ = ["FSNERModel", "FSNERTokenizerUtils"]

View File

@@ -1,61 +0,0 @@
import torch
from transformers import AutoModel
class FSNERModel(torch.nn.Module):
"""
The FSNER model implements a few-shot named entity recognition method from the paper `Example-Based Named Entity Recognition <https://arxiv.org/abs/2008.10570>`__ by
Morteza Ziyadi, Yuting Sun, Abhishek Goswami, Jade Huang, Weizhu Chen. To identify entity spans in a new domain, it
uses a train-free few-shot learning approach inspired by question-answering.
"""
def __init__(self, pretrained_model_name_or_path="sayef/fsner-bert-base-uncased"):
super(FSNERModel, self).__init__()
self.bert = AutoModel.from_pretrained(pretrained_model_name_or_path, return_dict=True)
self.cos = torch.nn.CosineSimilarity(3, 1e-08)
self.softmax = torch.nn.Softmax(dim=1)
def BERT(self, **inputs):
return self.bert(**inputs).last_hidden_state
def VectorSum(self, token_embeddings):
return token_embeddings.sum(2, keepdim=True)
def Atten(self, q_rep, S_rep, T=1):
return self.softmax(T * self.cos(q_rep, S_rep))
def forward(self, W_query, W_supports):
"""
Find scores of each token being start and end token for an entity.
Args:
W_query (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of query sequence tokens in the vocabulary.
W_supports (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of support sequence tokens in the vocabulary.
Returns:
p_start (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Scores of each token as
being start token of an entity
p_end (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Scores of each token as
being end token of an entity
"""
q = self.BERT(**W_query)
S = self.BERT(**W_supports)
# reshape from (batch_size, 384, 784) to (batch_size, 1, 384, 784)
q = q.view(q.shape[0], -1, q.shape[1], q.shape[2])
# reshape from (batch_size*n_exaples_per_entity, 384, 784) to (batch_size, n_exaples_per_entity, 384, 784)
S = S.view(q.shape[0], -1, S.shape[1], S.shape[2])
s_start = S[(W_supports["input_ids"] == 30522).view(S.shape[:3])].view(S.shape[0], -1, 1, S.shape[-1])
s_end = S[(W_supports["input_ids"] == 30523).view(S.shape[:3])].view(S.shape[0], -1, 1, S.shape[-1])
p_start = torch.sum(torch.einsum("bitf,bejf->bet", q, s_start), dim=1)
p_end = torch.sum(torch.einsum("bitf,bejf->bet", q, s_end), dim=1)
p_start = p_start.softmax(dim=1)
p_end = p_end.softmax(dim=1)
return p_start, p_end

View File

@@ -1,49 +0,0 @@
from transformers import AutoTokenizer
class FSNERTokenizerUtils(object):
def __init__(self, pretrained_model_name_or_path):
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
def tokenize(self, x):
return self.tokenizer(x, padding="max_length", max_length=384, truncation=True, return_tensors="pt")
def extract_entity_from_scores(self, query, W_query, p_start, p_end, thresh=0.70):
"""
Extracts entities from query and scores given a threshold.
Args:
query (`List[str]`):
List of query strings.
W_query (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of query sequence tokens in the vocabulary.
p_start (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Scores of each token as being start token of an entity
p_end (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
Scores of each token as being end token of an entity
thresh (`float`):
Score threshold value
Returns:
A list of lists of tuples(decoded entity, score)
"""
final_outputs = []
for idx in range(len(W_query["input_ids"])):
start_indexes = end_indexes = range(p_start.shape[1])
output = []
for start_id in start_indexes:
for end_id in end_indexes:
if start_id < end_id:
output.append((start_id, end_id, p_start[idx][start_id].item(), p_end[idx][end_id].item()))
output.sort(key=lambda tup: (tup[2] * tup[3]), reverse=True)
temp = []
for k in range(len(output)):
if output[k][2] * output[k][3] >= thresh:
c_start_pos, c_end_pos = output[k][0], output[k][1]
decoded = self.tokenizer.decode(W_query["input_ids"][idx][c_start_pos:c_end_pos])
temp.append((decoded, output[k][2] * output[k][3]))
final_outputs.append(temp)
return final_outputs

View File

@@ -224,9 +224,8 @@ class ImageTextDataset(VisionDataset):
self.image_paths = []
for example in examples:
captions_subset = example["captions"][:captions_per_image]
self.captions.extend(captions_subset)
self.image_paths.extend([example["image_path"]] * len(captions_subset))
self.captions.extend(example["captions"][:captions_per_image])
self.image_paths.extend([example["image_path"]] * captions_per_image)
def _load_image(self, idx: int):
path = self.image_paths[idx]
@@ -374,9 +373,7 @@ def main():
def collate_fn(examples):
pixel_values = torch.stack([example[0] for example in examples]).permute(0, 2, 3, 1).numpy()
captions = [example[1] for example in examples]
inputs = tokenizer(
captions, max_length=data_args.max_seq_length, padding="max_length", truncation=True, return_tensors="np"
)
inputs = tokenizer(captions, max_length=data_args.max_seq_length, padding="max_length", return_tensors="np")
batch = {
"pixel_values": pixel_values,

View File

@@ -46,10 +46,10 @@
"ATTR_URL = \"https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/attributes_vocab.txt\"\n",
"GQA_URL = \"https://raw.githubusercontent.com/airsplay/lxmert/master/data/gqa/trainval_label2ans.json\"\n",
"VQA_URL = \"https://raw.githubusercontent.com/airsplay/lxmert/master/data/vqa/trainval_label2ans.json\"\n",
"\n",
" \n",
"\n",
"# for visualizing output\n",
"def showarray(a, fmt=\"jpeg\"):\n",
"def showarray(a, fmt='jpeg'):\n",
" a = np.uint8(np.clip(a, 0, 255))\n",
" f = io.BytesIO()\n",
" PIL.Image.fromarray(a).save(f, fmt)\n",
@@ -118,17 +118,17 @@
}
],
"source": [
"# image viz\n",
"#image viz\n",
"frcnn_visualizer = SingleImageViz(URL, id2obj=objids, id2attr=attrids)\n",
"# run frcnn\n",
"images, sizes, scales_yx = image_preprocess(URL)\n",
"output_dict = frcnn(\n",
" images,\n",
" sizes,\n",
" scales_yx=scales_yx,\n",
" images, \n",
" sizes, \n",
" scales_yx=scales_yx, \n",
" padding=\"max_detections\",\n",
" max_detections=frcnn_cfg.max_detections,\n",
" return_tensors=\"pt\",\n",
" return_tensors=\"pt\"\n",
")\n",
"# add boxes and labels to the image\n",
"\n",
@@ -174,7 +174,7 @@
" \"Where is this scene?\",\n",
" \"what is the man riding?\",\n",
" \"What is the man wearing?\",\n",
" \"What is the color of the horse?\",\n",
" \"What is the color of the horse?\"\n",
"]\n",
"test_questions_for_url2 = [\n",
" \"Where is the cat?\",\n",
@@ -184,7 +184,7 @@
" \"What is the shape of the monitor?\",\n",
"]\n",
"\n",
"# Very important that the boxes are normalized\n",
"#Very important that the boxes are normalized\n",
"normalized_boxes = output_dict.get(\"normalized_boxes\")\n",
"features = output_dict.get(\"roi_features\")\n",
"\n",
@@ -200,7 +200,7 @@
" return_token_type_ids=True,\n",
" return_attention_mask=True,\n",
" add_special_tokens=True,\n",
" return_tensors=\"pt\",\n",
" return_tensors=\"pt\"\n",
" )\n",
"\n",
" # run lxmert(s)\n",

View File

@@ -60,39 +60,33 @@ You could run the following:
```bash
export TRAIN_FILE=/path/to/train/file
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
export LTP_RESOURCE=/path/to/ltp/tokenizer
export BERT_RESOURCE=/path/to/bert/tokenizer
export SAVE_PATH=/path/to/data/ref.txt
python run_chinese_ref.py \
--file_name=$TRAIN_FILE \
--ltp=$LTP_RESOURCE \
--bert=$BERT_RESOURCE \
--save_path=$SAVE_PATH
--file_name=path_to_train_or_eval_file \
--ltp=path_to_ltp_tokenizer \
--bert=path_to_bert_tokenizer \
--save_path=path_to_reference_file
```
Then you can run the script like this:
```bash
export TRAIN_FILE=/path/to/train/file
export VALIDATION_FILE=/path/to/validation/file
export TRAIN_REF_FILE=/path/to/train/chinese_ref/file
export VALIDATION_REF_FILE=/path/to/validation/chinese_ref/file
export OUTPUT_DIR=/tmp/test-mlm-wwm
python run_mlm_wwm.py \
--model_name_or_path roberta-base \
--train_file $TRAIN_FILE \
--validation_file $VALIDATION_FILE \
--train_ref_file $TRAIN_REF_FILE \
--validation_ref_file $VALIDATION_REF_FILE \
--train_file path_to_train_file \
--validation_file path_to_validation_file \
--train_ref_file path_to_train_chinese_ref_file \
--validation_ref_file path_to_validation_chinese_ref_file \
--do_train \
--do_eval \
--output_dir $OUTPUT_DIR
--output_dir /tmp/test-mlm-wwm
```
**Note1:** On TPU, you should the flag `--pad_to_max_length` to make sure all your batches have the same length.
**Note2:** And if you have any questions or something goes wrong when runing this code, don't hesitate to pin @wlhgtc.
**Note2:** And if you have any questions or something goes wrong when runing this code, don't hesitate to pin @wlhgtc.

View File

@@ -44,7 +44,7 @@
"\n",
"from transformers import *\n",
"\n",
"os.chdir(\"../../\")"
"os.chdir('../../')"
]
},
{
@@ -70,15 +70,15 @@
"# Load fine-pruned model and quantize the model\n",
"\n",
"model = BertForQuestionAnswering.from_pretrained(\"huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad\")\n",
"model.to(\"cpu\")\n",
"model.to('cpu')\n",
"\n",
"quantized_model = torch.quantization.quantize_dynamic(\n",
" model=model,\n",
" qconfig_spec={\n",
" nn.Linear: torch.quantization.default_dynamic_qconfig,\n",
" },\n",
" dtype=torch.qint8,\n",
")\n",
" model=model,\n",
" qconfig_spec = {\n",
" nn.Linear : torch.quantization.default_dynamic_qconfig,\n",
" },\n",
" dtype=torch.qint8,\n",
" )\n",
"# print(quantized_model)\n",
"\n",
"qtz_st = quantized_model.state_dict()"
@@ -92,14 +92,10 @@
"source": [
"# Saving the original (encoder + classifier) in the standard torch.save format\n",
"\n",
"dense_st = {\n",
" name: param for name, param in model.state_dict().items() if \"embedding\" not in name and \"pooler\" not in name\n",
"}\n",
"torch.save(\n",
" dense_st,\n",
" \"dbg/dense_squad.pt\",\n",
")\n",
"dense_mb_size = os.path.getsize(\"dbg/dense_squad.pt\")"
"dense_st = {name: param for name, param in model.state_dict().items() \n",
" if \"embedding\" not in name and \"pooler\" not in name}\n",
"torch.save(dense_st, 'dbg/dense_squad.pt',)\n",
"dense_mb_size = os.path.getsize(\"dbg/dense_squad.pt\")\n"
]
},
{
@@ -202,23 +198,23 @@
" if \"dtype\" not in name and param.is_quantized:\n",
" print(\"Decompose quantization for\", name)\n",
" # We need to extract the scale, the zero_point and the int_repr for the quantized tensor and modules\n",
" scale = param.q_scale() # torch.tensor(1,) - float32\n",
" zero_point = param.q_zero_point() # torch.tensor(1,) - int32\n",
" scale = param.q_scale() # torch.tensor(1,) - float32\n",
" zero_point = param.q_zero_point() # torch.tensor(1,) - int32\n",
" elementary_qtz_st[f\"{name}.scale\"] = scale\n",
" elementary_qtz_st[f\"{name}.zero_point\"] = zero_point\n",
"\n",
" # We assume the int_repr is sparse and compute its CSR representation\n",
" # Only the FCs in the encoder are actually sparse\n",
" int_repr = param.int_repr() # torch.tensor(nb_rows, nb_columns) - int8\n",
" int_repr_cs = sparse.csr_matrix(int_repr) # scipy.sparse.csr.csr_matrix\n",
" int_repr = param.int_repr() # torch.tensor(nb_rows, nb_columns) - int8\n",
" int_repr_cs = sparse.csr_matrix(int_repr) # scipy.sparse.csr.csr_matrix\n",
"\n",
" elementary_qtz_st[f\"{name}.int_repr.data\"] = int_repr_cs.data # np.array int8\n",
" elementary_qtz_st[f\"{name}.int_repr.indptr\"] = int_repr_cs.indptr # np.array int32\n",
" assert max(int_repr_cs.indices) < 65535 # If not, we shall fall back to int32\n",
" elementary_qtz_st[f\"{name}.int_repr.indices\"] = np.uint16(int_repr_cs.indices) # np.array uint16\n",
" elementary_qtz_st[f\"{name}.int_repr.shape\"] = int_repr_cs.shape # tuple(int, int)\n",
" elementary_qtz_st[f\"{name}.int_repr.data\"] = int_repr_cs.data # np.array int8\n",
" elementary_qtz_st[f\"{name}.int_repr.indptr\"] = int_repr_cs.indptr # np.array int32\n",
" assert max(int_repr_cs.indices) < 65535 # If not, we shall fall back to int32\n",
" elementary_qtz_st[f\"{name}.int_repr.indices\"] = np.uint16(int_repr_cs.indices) # np.array uint16\n",
" elementary_qtz_st[f\"{name}.int_repr.shape\"] = int_repr_cs.shape # tuple(int, int)\n",
" else:\n",
" elementary_qtz_st[name] = param"
" elementary_qtz_st[name] = param\n"
]
},
{
@@ -229,7 +225,7 @@
"source": [
"# Create mapping from torch.dtype to string description (we could also used an int8 instead of string)\n",
"str_2_dtype = {\"qint8\": torch.qint8}\n",
"dtype_2_str = {torch.qint8: \"qint8\"}"
"dtype_2_str = {torch.qint8: \"qint8\"}\n"
]
},
{
@@ -250,17 +246,11 @@
"source": [
"# Saving the pruned (encoder + classifier) in the standard torch.save format\n",
"\n",
"dense_optimized_st = {\n",
" name: param for name, param in elementary_qtz_st.items() if \"embedding\" not in name and \"pooler\" not in name\n",
"}\n",
"torch.save(\n",
" dense_optimized_st,\n",
" \"dbg/dense_squad_optimized.pt\",\n",
")\n",
"print(\n",
" \"Encoder Size (MB) - Sparse & Quantized - `torch.save`:\",\n",
" round(os.path.getsize(\"dbg/dense_squad_optimized.pt\") / 1e6, 2),\n",
")"
"dense_optimized_st = {name: param for name, param in elementary_qtz_st.items() \n",
" if \"embedding\" not in name and \"pooler\" not in name}\n",
"torch.save(dense_optimized_st, 'dbg/dense_squad_optimized.pt',)\n",
"print(\"Encoder Size (MB) - Sparse & Quantized - `torch.save`:\",\n",
" round(os.path.getsize(\"dbg/dense_squad_optimized.pt\")/1e6, 2))\n"
]
},
{
@@ -297,7 +287,7 @@
"# Save the decomposed state_dict with an HDF5 file\n",
"# Saving only the encoder + QA Head\n",
"\n",
"with h5py.File(\"dbg/squad_sparse.h5\", \"w\") as hf:\n",
"with h5py.File('dbg/squad_sparse.h5','w') as hf:\n",
" for name, param in elementary_qtz_st.items():\n",
" if \"embedding\" in name:\n",
" print(f\"Skip {name}\")\n",
@@ -328,18 +318,18 @@
" elif type(param) == torch.dtype:\n",
" # dtype - tensor _packed_params.dtype\n",
" hf.attrs[name] = dtype_2_str[param]\n",
"\n",
" \n",
" else:\n",
" hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
"\n",
"\n",
"with open(\"dbg/metadata.json\", \"w\") as f:\n",
" f.write(json.dumps(qtz_st._metadata))\n",
"with open('dbg/metadata.json', 'w') as f:\n",
" f.write(json.dumps(qtz_st._metadata)) \n",
"\n",
"size = os.path.getsize(\"dbg/squad_sparse.h5\") + os.path.getsize(\"dbg/metadata.json\")\n",
"print(\"\")\n",
"print(\"Encoder Size (MB) - Dense: \", round(dense_mb_size / 1e6, 2))\n",
"print(\"Encoder Size (MB) - Sparse & Quantized:\", round(size / 1e6, 2))"
"print(\"Encoder Size (MB) - Dense: \", round(dense_mb_size/1e6, 2))\n",
"print(\"Encoder Size (MB) - Sparse & Quantized:\", round(size/1e6, 2))\n"
]
},
{
@@ -360,15 +350,15 @@
"# Save the decomposed state_dict to HDF5 storage\n",
"# Save everything in the architecutre (embedding + encoder + QA Head)\n",
"\n",
"with h5py.File(\"dbg/squad_sparse_with_embs.h5\", \"w\") as hf:\n",
"with h5py.File('dbg/squad_sparse_with_embs.h5','w') as hf:\n",
" for name, param in elementary_qtz_st.items():\n",
" # if \"embedding\" in name:\n",
" # print(f\"Skip {name}\")\n",
" # continue\n",
"# if \"embedding\" in name:\n",
"# print(f\"Skip {name}\")\n",
"# continue\n",
"\n",
" # if \"pooler\" in name:\n",
" # print(f\"Skip {name}\")\n",
" # continue\n",
"# if \"pooler\" in name:\n",
"# print(f\"Skip {name}\")\n",
"# continue\n",
"\n",
" if type(param) == torch.Tensor:\n",
" if param.numel() == 1:\n",
@@ -391,16 +381,17 @@
" elif type(param) == torch.dtype:\n",
" # dtype - tensor _packed_params.dtype\n",
" hf.attrs[name] = dtype_2_str[param]\n",
"\n",
" \n",
" else:\n",
" hf.create_dataset(name, data=param, compression=\"gzip\", compression_opts=9)\n",
"\n",
"\n",
"with open(\"dbg/metadata.json\", \"w\") as f:\n",
" f.write(json.dumps(qtz_st._metadata))\n",
"\n",
"with open('dbg/metadata.json', 'w') as f:\n",
" f.write(json.dumps(qtz_st._metadata)) \n",
"\n",
"size = os.path.getsize(\"dbg/squad_sparse_with_embs.h5\") + os.path.getsize(\"dbg/metadata.json\")\n",
"print(\"\\nSize (MB):\", round(size / 1e6, 2))"
"print('\\nSize (MB):', round(size/1e6, 2))\n"
]
},
{
@@ -420,10 +411,10 @@
"\n",
"reconstructed_elementary_qtz_st = {}\n",
"\n",
"hf = h5py.File(\"dbg/squad_sparse_with_embs.h5\", \"r\")\n",
"hf = h5py.File('dbg/squad_sparse_with_embs.h5','r')\n",
"\n",
"for attr_name, attr_param in hf.attrs.items():\n",
" if \"shape\" in attr_name:\n",
" if 'shape' in attr_name:\n",
" attr_param = tuple(attr_param)\n",
" elif \".scale\" in attr_name:\n",
" if \"_packed_params\" in attr_name:\n",
@@ -439,20 +430,20 @@
" attr_param = str_2_dtype[attr_param]\n",
" reconstructed_elementary_qtz_st[attr_name] = attr_param\n",
" # print(f\"Unpack {attr_name}\")\n",
"\n",
" \n",
"# Get the tensors/arrays\n",
"for data_name, data_param in hf.items():\n",
" if \"LayerNorm\" in data_name or \"_packed_params.bias\" in data_name:\n",
" reconstructed_elementary_qtz_st[data_name] = torch.from_numpy(np.array(data_param))\n",
" elif \"embedding\" in data_name:\n",
" reconstructed_elementary_qtz_st[data_name] = torch.from_numpy(np.array(data_param))\n",
" else: # _packed_params.weight.int_repr.data, _packed_params.weight.int_repr.indices and _packed_params.weight.int_repr.indptr\n",
" else: # _packed_params.weight.int_repr.data, _packed_params.weight.int_repr.indices and _packed_params.weight.int_repr.indptr\n",
" data_param = np.array(data_param)\n",
" if \"indices\" in data_name:\n",
" data_param = np.array(data_param, dtype=np.int32)\n",
" reconstructed_elementary_qtz_st[data_name] = data_param\n",
" # print(f\"Unpack {data_name}\")\n",
"\n",
" \n",
"\n",
"hf.close()"
]
@@ -493,29 +484,27 @@
"for name, param in reconstructed_elementary_qtz_st.items():\n",
" if \"weight.int_repr.indptr\" in name:\n",
" prefix_ = name[:-16]\n",
" data = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.data\"]\n",
" indptr = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.indptr\"]\n",
" data = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.data\"]\n",
" indptr = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.indptr\"]\n",
" indices = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.indices\"]\n",
" shape = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.shape\"]\n",
" shape = reconstructed_elementary_qtz_st[f\"{prefix_}.int_repr.shape\"]\n",
"\n",
" int_repr = sparse.csr_matrix(arg1=(data, indices, indptr), shape=shape)\n",
" int_repr = sparse.csr_matrix(arg1=(data, indices, indptr),\n",
" shape=shape)\n",
" int_repr = torch.tensor(int_repr.todense())\n",
"\n",
" scale = reconstructed_elementary_qtz_st[f\"{prefix_}.scale\"]\n",
" zero_point = reconstructed_elementary_qtz_st[f\"{prefix_}.zero_point\"]\n",
" weight = torch._make_per_tensor_quantized_tensor(int_repr, scale, zero_point)\n",
" weight = torch._make_per_tensor_quantized_tensor(int_repr,\n",
" scale,\n",
" zero_point)\n",
"\n",
" reconstructed_qtz_st[f\"{prefix_}\"] = weight\n",
" elif (\n",
" \"int_repr.data\" in name\n",
" or \"int_repr.shape\" in name\n",
" or \"int_repr.indices\" in name\n",
" or \"weight.scale\" in name\n",
" or \"weight.zero_point\" in name\n",
" ):\n",
" elif \"int_repr.data\" in name or \"int_repr.shape\" in name or \"int_repr.indices\" in name or \\\n",
" \"weight.scale\" in name or \"weight.zero_point\" in name:\n",
" continue\n",
" else:\n",
" reconstructed_qtz_st[name] = param"
" reconstructed_qtz_st[name] = param\n"
]
},
{
@@ -567,17 +556,17 @@
"source": [
"# Load the re-constructed state dict into a model\n",
"\n",
"dummy_model = BertForQuestionAnswering.from_pretrained(\"bert-base-uncased\")\n",
"dummy_model.to(\"cpu\")\n",
"dummy_model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')\n",
"dummy_model.to('cpu')\n",
"\n",
"reconstructed_qtz_model = torch.quantization.quantize_dynamic(\n",
" model=dummy_model,\n",
" qconfig_spec=None,\n",
" dtype=torch.qint8,\n",
")\n",
" model=dummy_model,\n",
" qconfig_spec = None,\n",
" dtype=torch.qint8,\n",
" )\n",
"\n",
"reconstructed_qtz_st = OrderedDict(reconstructed_qtz_st)\n",
"with open(\"dbg/metadata.json\", \"r\") as read_file:\n",
"with open('dbg/metadata.json', 'r') as read_file:\n",
" metadata = json.loads(read_file.read())\n",
"reconstructed_qtz_st._metadata = metadata\n",
"\n",
@@ -607,8 +596,8 @@
" mask = torch.ones(size=(N, 128))\n",
"\n",
" y_reconstructed = reconstructed_qtz_model(input_ids=inputs, attention_mask=mask)[0]\n",
" y = quantized_model(input_ids=inputs, attention_mask=mask)[0]\n",
"\n",
" y = quantized_model(input_ids=inputs, attention_mask=mask)[0]\n",
" \n",
" assert torch.all(torch.eq(y, y_reconstructed))\n",
"print(\"Sanity check passed\")"
]

View File

@@ -37,10 +37,10 @@
"OBJ_URL = \"https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/objects_vocab.txt\"\n",
"ATTR_URL = \"https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/attributes_vocab.txt\"\n",
"VQA_URL = \"https://dl.fbaipublicfiles.com/pythia/data/answers_vqa.txt\"\n",
"\n",
" \n",
"\n",
"# for visualizing output\n",
"def showarray(a, fmt=\"jpeg\"):\n",
"def showarray(a, fmt='jpeg'):\n",
" a = np.uint8(np.clip(a, 0, 255))\n",
" f = io.BytesIO()\n",
" PIL.Image.fromarray(a).save(f, fmt)\n",
@@ -82,7 +82,7 @@
"image_preprocess = Preprocess(frcnn_cfg)\n",
"\n",
"bert_tokenizer = BertTokenizerFast.from_pretrained(\"bert-base-uncased\")\n",
"visualbert_vqa = VisualBertForQuestionAnswering.from_pretrained(\"uclanlp/visualbert-vqa\")"
"visualbert_vqa = VisualBertForQuestionAnswering.from_pretrained(\"uclanlp/visualbert-vqa\")\n"
],
"outputs": [
{
@@ -104,17 +104,17 @@
"cell_type": "code",
"execution_count": 5,
"source": [
"# image viz\n",
"#image viz\n",
"frcnn_visualizer = SingleImageViz(URL, id2obj=objids, id2attr=attrids)\n",
"# run frcnn\n",
"images, sizes, scales_yx = image_preprocess(URL)\n",
"output_dict = frcnn(\n",
" images,\n",
" sizes,\n",
" scales_yx=scales_yx,\n",
" images, \n",
" sizes, \n",
" scales_yx=scales_yx, \n",
" padding=\"max_detections\",\n",
" max_detections=frcnn_cfg.max_detections,\n",
" return_tensors=\"pt\",\n",
" return_tensors=\"pt\"\n",
")\n",
"# add boxes and labels to the image\n",
"\n",
@@ -167,7 +167,7 @@
" \"What is the shape of the monitor?\",\n",
"]\n",
"\n",
"# Very important that the boxes are normalized\n",
"#Very important that the boxes are normalized\n",
"# normalized_boxes = output_dict.get(\"normalized_boxes\")\n",
"features = output_dict.get(\"roi_features\")"
],
@@ -189,7 +189,7 @@
" return_token_type_ids=True,\n",
" return_attention_mask=True,\n",
" add_special_tokens=True,\n",
" return_tensors=\"pt\",\n",
" return_tensors=\"pt\"\n",
" )\n",
"\n",
" output_vqa = visualbert_vqa(\n",

View File

@@ -46,7 +46,7 @@ from transformers.utils import check_min_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
logger = logging.getLogger(__name__)

View File

@@ -45,7 +45,7 @@ from utils_qa import postprocess_qa_predictions
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
logger = logging.getLogger(__name__)

View File

@@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
# region Checking dependencies
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@@ -100,7 +100,7 @@ class SavePretrainedCallback(tf.keras.callbacks.Callback):
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
task_to_keys = {
"cola": ("sentence", None),

View File

@@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
# region Dependencies and constants
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
check_min_version("4.10.0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@@ -90,7 +90,7 @@ _deps = [
"cookiecutter==1.7.2",
"dataclasses",
"datasets",
"deepspeed>=0.5.3",
"deepspeed>=0.5.1",
"docutils==0.16.0",
"fairscale>0.3",
"faiss-cpu",
@@ -100,7 +100,7 @@ _deps = [
"flax>=0.3.4",
"fugashi>=1.0",
"GitPython<3.1.19",
"huggingface-hub>=0.0.17",
"huggingface-hub>=0.0.12",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"isort>=5.5.4",
@@ -115,7 +115,7 @@ _deps = [
"onnxruntime>=1.4.0",
"optuna",
"optax>=0.0.8",
"packaging>=20.0",
"packaging",
"parameterized",
"protobuf",
"psutil",
@@ -134,13 +134,12 @@ _deps = [
"sacremoses",
"sagemaker>=2.31.0",
"scikit-learn",
"sentencepiece>=0.1.91,!=0.1.92",
"sigopt",
"sentencepiece==0.1.91",
"soundfile",
"sphinx-copybutton",
"sphinx-markdown-tables",
"sphinx-rtd-theme==0.4.3", # sphinx-rtd-theme==0.5.0 introduced big changes in the style.
"sphinx==3.2.1",
"sphinx==3.5.4",
"sphinxext-opengraph==0.4.1",
"sphinx-intl",
"starlette",
@@ -249,9 +248,8 @@ extras["deepspeed"] = deps_list("deepspeed")
extras["fairscale"] = deps_list("fairscale")
extras["optuna"] = deps_list("optuna")
extras["ray"] = deps_list("ray[tune]")
extras["sigopt"] = deps_list("sigopt")
extras["integrations"] = extras["optuna"] + extras["ray"]+ extras["sigopt"]
extras["integrations"] = extras["optuna"] + extras["ray"]
extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
extras["audio"] = deps_list("soundfile")
@@ -344,7 +342,7 @@ install_requires = [
setup(
name="transformers",
version="4.11.3", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.10.2", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Suraj Patil, Stas Bekman, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
author_email="thomas@huggingface.co",
description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
@@ -371,8 +369,6 @@ setup(
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
cmdclass={"deps_table_update": DepsTableUpdateCommand},

View File

@@ -22,7 +22,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).
__version__ = "4.11.3"
__version__ = "4.10.2"
# Work around to update TensorFlow's absl.logging threshold which alters the
# default Python logging output behavior when present.
@@ -130,7 +130,6 @@ _import_structure = {
"is_optuna_available",
"is_ray_available",
"is_ray_tune_available",
"is_sigopt_available",
"is_tensorboard_available",
"is_wandb_available",
],
@@ -210,12 +209,10 @@ _import_structure = {
"models.electra": ["ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "ElectraConfig", "ElectraTokenizer"],
"models.encoder_decoder": ["EncoderDecoderConfig"],
"models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"],
"models.fnet": ["FNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FNetConfig", "FNetTokenizer"],
"models.fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig", "FSMTTokenizer"],
"models.funnel": ["FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP", "FunnelConfig", "FunnelTokenizer"],
"models.gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2Tokenizer"],
"models.gpt_neo": ["GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoConfig"],
"models.gptj": ["GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTJConfig"],
"models.herbert": ["HerbertTokenizer"],
"models.hubert": ["HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "HubertConfig"],
"models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"],
@@ -241,7 +238,7 @@ _import_structure = {
"models.mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig", "MPNetTokenizer"],
"models.mt5": ["MT5Config"],
"models.openai": ["OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "OpenAIGPTConfig", "OpenAIGPTTokenizer"],
"models.pegasus": ["PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusConfig", "PegasusTokenizer"],
"models.pegasus": ["PegasusConfig"],
"models.phobert": ["PhobertTokenizer"],
"models.prophetnet": ["PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ProphetNetConfig", "ProphetNetTokenizer"],
"models.rag": ["RagConfig", "RagRetriever", "RagTokenizer"],
@@ -250,17 +247,10 @@ _import_structure = {
"models.retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig", "RetriBertTokenizer"],
"models.roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaTokenizer"],
"models.roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerTokenizer"],
"models.speech_encoder_decoder": ["SpeechEncoderDecoderConfig"],
"models.speech_to_text": [
"SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Speech2TextConfig",
],
"models.speech_to_text_2": [
"SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP",
"Speech2Text2Config",
"Speech2Text2Processor",
"Speech2Text2Tokenizer",
],
"models.splinter": ["SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SplinterConfig", "SplinterTokenizer"],
"models.squeezebert": ["SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SqueezeBertConfig", "SqueezeBertTokenizer"],
"models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"],
@@ -286,7 +276,6 @@ _import_structure = {
"models.xlm_roberta": ["XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMRobertaConfig"],
"models.xlnet": ["XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLNetConfig"],
"pipelines": [
"AudioClassificationPipeline",
"AutomaticSpeechRecognitionPipeline",
"Conversation",
"ConversationalPipeline",
@@ -296,7 +285,6 @@ _import_structure = {
"ImageClassificationPipeline",
"JsonPipelineDataFormat",
"NerPipeline",
"ObjectDetectionPipeline",
"PipedPipelineDataFormat",
"Pipeline",
"PipelineDataFormat",
@@ -367,11 +355,9 @@ else:
# tokenizers-backed objects
if is_tokenizers_available():
# Fast tokenizers
_import_structure["models.fnet"].append("FNetTokenizerFast")
_import_structure["models.roformer"].append("RoFormerTokenizerFast")
_import_structure["models.clip"].append("CLIPTokenizerFast")
_import_structure["models.convbert"].append("ConvBertTokenizerFast")
_import_structure["models.blenderbot_small"].append("BlenderbotSmallTokenizerFast")
_import_structure["models.albert"].append("AlbertTokenizerFast")
_import_structure["models.bart"].append("BartTokenizerFast")
_import_structure["models.barthez"].append("BarthezTokenizerFast")
@@ -521,7 +507,6 @@ if is_torch_available():
"StoppingCriteriaList",
]
_import_structure["generation_utils"] = ["top_k_top_p_filtering"]
_import_structure["modeling_outputs"] = []
_import_structure["modeling_utils"] = ["Conv1D", "PreTrainedModel", "apply_chunking_to_forward", "prune_layer"]
# PyTorch models structure
@@ -541,7 +526,6 @@ if is_torch_available():
)
_import_structure["models.auto"].extend(
[
"MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING",
"MODEL_FOR_CAUSAL_LM_MAPPING",
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"MODEL_FOR_MASKED_LM_MAPPING",
@@ -557,19 +541,15 @@ if is_torch_available():
"MODEL_MAPPING",
"MODEL_WITH_LM_HEAD_MAPPING",
"AutoModel",
"AutoModelForAudioClassification",
"AutoModelForCausalLM",
"AutoModelForCTC",
"AutoModelForImageClassification",
"AutoModelForMaskedLM",
"AutoModelForMultipleChoice",
"AutoModelForNextSentencePrediction",
"AutoModelForObjectDetection",
"AutoModelForPreTraining",
"AutoModelForQuestionAnswering",
"AutoModelForSeq2SeqLM",
"AutoModelForSequenceClassification",
"AutoModelForSpeechSeq2Seq",
"AutoModelForTableQuestionAnswering",
"AutoModelForTokenClassification",
"AutoModelWithLMHead",
@@ -773,7 +753,6 @@ if is_torch_available():
"DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST",
"DPRContextEncoder",
"DPRPretrainedContextEncoder",
"DPRPreTrainedModel",
"DPRPretrainedQuestionEncoder",
"DPRPretrainedReader",
"DPRQuestionEncoder",
@@ -807,21 +786,6 @@ if is_torch_available():
"FlaubertWithLMHeadModel",
]
)
_import_structure["models.fnet"].extend(
[
"FNET_PRETRAINED_MODEL_ARCHIVE_LIST",
"FNetForMaskedLM",
"FNetForMultipleChoice",
"FNetForNextSentencePrediction",
"FNetForPreTraining",
"FNetForQuestionAnswering",
"FNetForSequenceClassification",
"FNetForTokenClassification",
"FNetLayer",
"FNetModel",
"FNetPreTrainedModel",
]
)
_import_structure["models.fsmt"].extend(["FSMTForConditionalGeneration", "FSMTModel", "PretrainedFSMTModel"])
_import_structure["models.funnel"].extend(
[
@@ -860,15 +824,6 @@ if is_torch_available():
"load_tf_weights_in_gpt_neo",
]
)
_import_structure["models.gptj"].extend(
[
"GPTJ_PRETRAINED_MODEL_ARCHIVE_LIST",
"GPTJForCausalLM",
"GPTJForSequenceClassification",
"GPTJModel",
"GPTJPreTrainedModel",
]
)
_import_structure["models.hubert"].extend(
[
"HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1106,7 +1061,6 @@ if is_torch_available():
"load_tf_weights_in_roformer",
]
)
_import_structure["models.speech_encoder_decoder"].extend(["SpeechEncoderDecoderModel"])
_import_structure["models.speech_to_text"].extend(
[
"SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1115,7 +1069,6 @@ if is_torch_available():
"Speech2TextPreTrainedModel",
]
)
_import_structure["models.speech_to_text_2"].extend(["Speech2Text2ForCausalLM", "Speech2Text2PreTrainedModel"])
_import_structure["models.splinter"].extend(
[
"SPLINTER_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1156,7 +1109,6 @@ if is_torch_available():
"TapasForSequenceClassification",
"TapasModel",
"TapasPreTrainedModel",
"load_tf_weights_in_tapas",
]
)
_import_structure["models.transfo_xl"].extend(
@@ -1275,7 +1227,6 @@ if is_tf_available():
_import_structure["benchmark.benchmark_args_tf"] = ["TensorFlowBenchmarkArguments"]
_import_structure["benchmark.benchmark_tf"] = ["TensorFlowBenchmark"]
_import_structure["generation_tf_utils"] = ["tf_top_k_top_p_filtering"]
_import_structure["modeling_tf_outputs"] = []
_import_structure["modeling_tf_utils"] = [
"TFPreTrainedModel",
"TFSequenceSummary",
@@ -1702,8 +1653,6 @@ if is_flax_available():
"FlaxTopKLogitsWarper",
"FlaxTopPLogitsWarper",
]
_import_structure["generation_flax_utils"] = []
_import_structure["modeling_flax_outputs"] = []
_import_structure["modeling_flax_utils"] = ["FlaxPreTrainedModel"]
_import_structure["models.albert"].extend(
[
@@ -1743,8 +1692,6 @@ if is_flax_available():
"FlaxAutoModelForTokenClassification",
]
)
# Flax models structure
_import_structure["models.bart"].extend(
[
"FlaxBartForConditionalGeneration",
@@ -1754,14 +1701,6 @@ if is_flax_available():
"FlaxBartPreTrainedModel",
]
)
_import_structure["models.beit"].extend(
[
"FlaxBeitForImageClassification",
"FlaxBeitForMaskedImageModeling",
"FlaxBeitModel",
"FlaxBeitPreTrainedModel",
]
)
_import_structure["models.bert"].extend(
[
"FlaxBertForMaskedLM",
@@ -1842,13 +1781,6 @@ if is_flax_available():
]
)
_import_structure["models.mt5"].extend(["FlaxMT5ForConditionalGeneration", "FlaxMT5Model"])
_import_structure["models.pegasus"].extend(
[
"FlaxPegasusForConditionalGeneration",
"FlaxPegasusModel",
"FlaxPegasusPreTrainedModel",
]
)
_import_structure["models.roberta"].extend(
[
"FlaxRobertaForMaskedLM",
@@ -1953,7 +1885,6 @@ if TYPE_CHECKING:
is_optuna_available,
is_ray_available,
is_ray_tune_available,
is_sigopt_available,
is_tensorboard_available,
is_wandb_available,
)
@@ -2031,12 +1962,10 @@ if TYPE_CHECKING:
from .models.electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig, ElectraTokenizer
from .models.encoder_decoder import EncoderDecoderConfig
from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer
from .models.fnet import FNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FNetConfig, FNetTokenizer
from .models.fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig, FSMTTokenizer
from .models.funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig, FunnelTokenizer
from .models.gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2Tokenizer
from .models.gpt_neo import GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoConfig
from .models.gptj import GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTJConfig
from .models.herbert import HerbertTokenizer
from .models.hubert import HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, HubertConfig
from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig
@@ -2061,7 +1990,7 @@ if TYPE_CHECKING:
from .models.mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig, MPNetTokenizer
from .models.mt5 import MT5Config
from .models.openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig, OpenAIGPTTokenizer
from .models.pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig, PegasusTokenizer
from .models.pegasus import PegasusConfig
from .models.phobert import PhobertTokenizer
from .models.prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig, ProphetNetTokenizer
from .models.rag import RagConfig, RagRetriever, RagTokenizer
@@ -2070,14 +1999,7 @@ if TYPE_CHECKING:
from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer
from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer
from .models.roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerTokenizer
from .models.speech_encoder_decoder import SpeechEncoderDecoderConfig
from .models.speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig
from .models.speech_to_text_2 import (
SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
Speech2Text2Config,
Speech2Text2Processor,
Speech2Text2Tokenizer,
)
from .models.splinter import SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP, SplinterConfig, SplinterTokenizer
from .models.squeezebert import SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SqueezeBertConfig, SqueezeBertTokenizer
from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
@@ -2105,7 +2027,6 @@ if TYPE_CHECKING:
# Pipelines
from .pipelines import (
AudioClassificationPipeline,
AutomaticSpeechRecognitionPipeline,
Conversation,
ConversationalPipeline,
@@ -2115,7 +2036,6 @@ if TYPE_CHECKING:
ImageClassificationPipeline,
JsonPipelineDataFormat,
NerPipeline,
ObjectDetectionPipeline,
PipedPipelineDataFormat,
Pipeline,
PipelineDataFormat,
@@ -2186,7 +2106,6 @@ if TYPE_CHECKING:
from .models.barthez import BarthezTokenizerFast
from .models.bert import BertTokenizerFast
from .models.big_bird import BigBirdTokenizerFast
from .models.blenderbot_small import BlenderbotSmallTokenizerFast
from .models.camembert import CamembertTokenizerFast
from .models.clip import CLIPTokenizerFast
from .models.convbert import ConvBertTokenizerFast
@@ -2194,7 +2113,6 @@ if TYPE_CHECKING:
from .models.distilbert import DistilBertTokenizerFast
from .models.dpr import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast, DPRReaderTokenizerFast
from .models.electra import ElectraTokenizerFast
from .models.fnet import FNetTokenizerFast
from .models.funnel import FunnelTokenizerFast
from .models.gpt2 import GPT2TokenizerFast
from .models.herbert import HerbertTokenizerFast
@@ -2317,7 +2235,6 @@ if TYPE_CHECKING:
load_tf_weights_in_albert,
)
from .models.auto import (
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
MODEL_FOR_CAUSAL_LM_MAPPING,
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_MASKED_LM_MAPPING,
@@ -2333,19 +2250,15 @@ if TYPE_CHECKING:
MODEL_MAPPING,
MODEL_WITH_LM_HEAD_MAPPING,
AutoModel,
AutoModelForAudioClassification,
AutoModelForCausalLM,
AutoModelForCTC,
AutoModelForImageClassification,
AutoModelForMaskedLM,
AutoModelForMultipleChoice,
AutoModelForNextSentencePrediction,
AutoModelForObjectDetection,
AutoModelForPreTraining,
AutoModelForQuestionAnswering,
AutoModelForSeq2SeqLM,
AutoModelForSequenceClassification,
AutoModelForSpeechSeq2Seq,
AutoModelForTableQuestionAnswering,
AutoModelForTokenClassification,
AutoModelWithLMHead,
@@ -2513,7 +2426,6 @@ if TYPE_CHECKING:
DPR_READER_PRETRAINED_MODEL_ARCHIVE_LIST,
DPRContextEncoder,
DPRPretrainedContextEncoder,
DPRPreTrainedModel,
DPRPretrainedQuestionEncoder,
DPRPretrainedReader,
DPRQuestionEncoder,
@@ -2542,19 +2454,6 @@ if TYPE_CHECKING:
FlaubertModel,
FlaubertWithLMHeadModel,
)
from .models.fnet import (
FNET_PRETRAINED_MODEL_ARCHIVE_LIST,
FNetForMaskedLM,
FNetForMultipleChoice,
FNetForNextSentencePrediction,
FNetForPreTraining,
FNetForQuestionAnswering,
FNetForSequenceClassification,
FNetForTokenClassification,
FNetLayer,
FNetModel,
FNetPreTrainedModel,
)
from .models.fsmt import FSMTForConditionalGeneration, FSMTModel, PretrainedFSMTModel
from .models.funnel import (
FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -2587,13 +2486,6 @@ if TYPE_CHECKING:
GPTNeoPreTrainedModel,
load_tf_weights_in_gpt_neo,
)
from .models.gptj import (
GPTJ_PRETRAINED_MODEL_ARCHIVE_LIST,
GPTJForCausalLM,
GPTJForSequenceClassification,
GPTJModel,
GPTJPreTrainedModel,
)
from .models.hubert import (
HUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
HubertForCTC,
@@ -2792,14 +2684,12 @@ if TYPE_CHECKING:
RoFormerPreTrainedModel,
load_tf_weights_in_roformer,
)
from .models.speech_encoder_decoder import SpeechEncoderDecoderModel
from .models.speech_to_text import (
SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
Speech2TextForConditionalGeneration,
Speech2TextModel,
Speech2TextPreTrainedModel,
)
from .models.speech_to_text_2 import Speech2Text2ForCausalLM, Speech2Text2PreTrainedModel
from .models.splinter import (
SPLINTER_PRETRAINED_MODEL_ARCHIVE_LIST,
SplinterForQuestionAnswering,
@@ -2833,7 +2723,6 @@ if TYPE_CHECKING:
TapasForSequenceClassification,
TapasModel,
TapasPreTrainedModel,
load_tf_weights_in_tapas,
)
from .models.transfo_xl import (
TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -3338,12 +3227,6 @@ if TYPE_CHECKING:
FlaxBartModel,
FlaxBartPreTrainedModel,
)
from .models.beit import (
FlaxBeitForImageClassification,
FlaxBeitForMaskedImageModeling,
FlaxBeitModel,
FlaxBeitPreTrainedModel,
)
from .models.bert import (
FlaxBertForMaskedLM,
FlaxBertForMultipleChoice,
@@ -3404,7 +3287,6 @@ if TYPE_CHECKING:
FlaxMBartPreTrainedModel,
)
from .models.mt5 import FlaxMT5ForConditionalGeneration, FlaxMT5Model
from .models.pegasus import FlaxPegasusForConditionalGeneration, FlaxPegasusModel, FlaxPegasusPreTrainedModel
from .models.roberta import (
FlaxRobertaForMaskedLM,
FlaxRobertaForMultipleChoice,

Some files were not shown because too many files have changed in this diff Show More