Compare commits

..

10 Commits

Author SHA1 Message Date
Lysandre
98af96a156 Release: v4.2.2
Some checks failed
Model templates runner / run_tests_templates (push) Has been cancelled
Release - Conda / build_and_package (push) Has been cancelled
2021-01-21 09:06:41 +01:00
Sylvain Gugger
0ed8bccc9c Fix GPT conversion script (#9676) 2021-01-21 09:00:32 +01:00
Sylvain Gugger
c5f6719040 Fix imports in conversion scripts (#9674) 2021-01-21 09:00:26 +01:00
Patrick von Platen
21d45958af [TF Led] Fix wrong decoder attention mask behavior (#9601)
* fix tf led

* remove loop file
2021-01-21 08:57:35 +01:00
Lysandre
4d4d2ce135 Remove branch overload in conda yml 2021-01-14 14:27:36 +01:00
Lysandre
1528b1007f Upload v4.2.1 to anaconda 2021-01-14 14:24:32 +01:00
Lysandre
236cc365af Release: v4.2.1
Some checks failed
Model templates runner / run_tests_templates (push) Has been cancelled
Release - Conda / build_and_package (push) Has been cancelled
2021-01-14 14:17:56 +01:00
Lysandre Debut
5b05321b56 BatchEncoding.to with device with tests (#9584) 2021-01-14 14:08:40 +01:00
Julien Plu
412d878c5e Compliancy with tf-nightly (#9570)
* Compliancy with tf-nightly

* Add more version + restore min version check
2021-01-14 14:08:30 +01:00
Sylvain Gugger
59fbd64b1c Fix Trainer with a parallel model (#9578)
* Fix Trainer with a parallel model

* More clean up
2021-01-14 14:08:19 +01:00
706 changed files with 16383 additions and 80633 deletions

View File

@@ -3,6 +3,7 @@ orbs:
gcp-gke: circleci/gcp-gke@1.0.4
go: circleci/go@1.3.0
# TPU REFERENCES
references:
checkout_ml_testing: &checkout_ml_testing
@@ -68,8 +69,6 @@ jobs:
- image: circleci/python:3.6
environment:
OMP_NUM_THREADS: 1
RUN_PT_TF_CROSS_TESTS: yes
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
@@ -78,45 +77,14 @@ jobs:
keys:
- v0.4-torch_and_tf-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
path: ~/transformers/reports
run_tests_torch_and_flax:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
environment:
OMP_NUM_THREADS: 1
RUN_PT_FLAX_CROSS_TESTS: yes
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
- checkout
- restore_cache:
keys:
- v0.4-torch_and_flax-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,flax,torch,testing,sentencepiece,speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax ./tests/ -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
- run: RUN_PT_TF_CROSS_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
@@ -128,7 +96,6 @@ jobs:
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
@@ -137,10 +104,9 @@ jobs:
keys:
- v0.4-torch-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
- run: pip install .[sklearn,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
@@ -157,7 +123,6 @@ jobs:
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
@@ -184,7 +149,6 @@ jobs:
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
@@ -194,7 +158,7 @@ jobs:
- v0.4-flax-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: sudo pip install .[flax,testing,sentencepiece]
- run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece]
- save_cache:
key: v0.4-flax-{{ checksum "setup.py" }}
paths:
@@ -211,8 +175,6 @@ jobs:
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
RUN_PIPELINE_TESTS: yes
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
@@ -221,15 +183,14 @@ jobs:
keys:
- v0.4-torch-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
- run: pip install .[sklearn,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
- run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
@@ -241,8 +202,6 @@ jobs:
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
RUN_PIPELINE_TESTS: yes
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
@@ -257,7 +216,7 @@ jobs:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
- run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
@@ -269,7 +228,6 @@ jobs:
- image: circleci/python:3.7
environment:
RUN_CUSTOM_TOKENIZERS: yes
TRANSFORMERS_IS_CI: yes
steps:
- checkout
- restore_cache:
@@ -295,7 +253,6 @@ jobs:
- image: circleci/python:3.6
environment:
OMP_NUM_THREADS: 1
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
@@ -311,7 +268,7 @@ jobs:
key: v0.4-torch_examples-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/ | tee examples_output.txt
- run: python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/ | tee examples_output.txt
- store_artifacts:
path: ~/transformers/examples_output.txt
- store_artifacts:
@@ -321,9 +278,6 @@ jobs:
working_directory: ~/transformers
docker:
- image: circleci/python:3.7
environment:
RUN_GIT_LFS_TESTS: yes
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
@@ -334,7 +288,7 @@ jobs:
git config --global user.name "ci"
- run: pip install --upgrade pip
- run: pip install .[testing]
- run: python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"
- run: RUN_GIT_LFS_TESTS=1 python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"
build_doc:
working_directory: ~/transformers
@@ -346,14 +300,13 @@ jobs:
keys:
- v0.4-build_doc-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install ."[all, docs]"
- save_cache:
key: v0.4-build_doc-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: cd docs && make html SPHINXOPTS="-W -j 4"
- run: cd docs && make html SPHINXOPTS="-W"
- store_artifacts:
path: ./docs/_build
@@ -398,14 +351,12 @@ jobs:
- '~/.cache/pip'
- run: black --check examples tests src utils
- run: isort --check-only examples tests src utils
- run: python utils/custom_init_isort.py --check_only
- run: flake8 examples tests src utils
- run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
- run: python utils/check_copies.py
- run: python utils/check_table.py
- run: python utils/check_dummies.py
- run: python utils/check_repo.py
- run: python utils/check_inits.py
check_repository_consistency:
working_directory: ~/transformers
@@ -424,7 +375,6 @@ jobs:
- image: circleci/python:3.6
environment:
OMP_NUM_THREADS: 1
TRANSFORMERS_IS_CI: yes
resource_class: xlarge
parallelism: 1
steps:
@@ -463,7 +413,6 @@ workflows:
- run_examples_torch
- run_tests_custom_tokenizers
- run_tests_torch_and_tf
- run_tests_torch_and_flax
- run_tests_torch
- run_tests_tf
- run_tests_flax
@@ -472,15 +421,15 @@ workflows:
- run_tests_git_lfs
- build_doc
- deploy_doc: *workflow_filters
# tpu_testing_jobs:
# triggers:
# - schedule:
# # Set to run at the first minute of every hour.
# cron: "0 8 * * *"
# filters:
# branches:
# only:
# - master
# jobs:
# - cleanup-gke-jobs
# - run_examples_tpu
tpu_testing_jobs:
triggers:
- schedule:
# Set to run at the first minute of every hour.
cron: "0 8 * * *"
filters:
branches:
only:
- master
jobs:
- cleanup-gke-jobs
- run_examples_tpu

View File

@@ -3,7 +3,6 @@ cd docs
function deploy_doc(){
echo "Creating doc at commit $1 and pushing to folder $2"
git checkout $1
pip install -U ..
if [ ! -z "$2" ]
then
if [ "$2" == "master" ]; then
@@ -46,7 +45,7 @@ deploy_doc "6f5a12a" v2.7.0
deploy_doc "11c3257" v2.8.0
deploy_doc "e7cfc1a" v2.9.0
deploy_doc "7cb203f" v2.9.1
deploy_doc "10d7239" v2.10.0
deploy_doc "10d7239" v2.10.0
deploy_doc "b42586e" v2.11.0
deploy_doc "7fb8bdf" v3.0.2
deploy_doc "4b3ee9c" v3.1.0
@@ -54,10 +53,5 @@ deploy_doc "3ebb1b3" v3.2.0
deploy_doc "0613f05" v3.3.1
deploy_doc "eb0e0ce" v3.4.0
deploy_doc "818878d" v3.5.1
deploy_doc "c781171" v4.0.1
deploy_doc "bfa4ccf" v4.1.1
deploy_doc "7d9a9d0" v4.2.2
deploy_doc "bae0c79" v4.3.3
deploy_doc "c988db5" v4.4.0
deploy_doc "c5d6a28" v4.4.1
deploy_doc "6bc89ed" # v4.4.2 Latest stable release
deploy_doc "c781171" v4.0.0
deploy_doc "bfa4ccf" # v4.1.1 Latest stable release

3
.gitattributes vendored
View File

@@ -1,3 +0,0 @@
*.py eol=lf
*.rst eol=lf
*.md eol=lf

View File

@@ -25,44 +25,32 @@ assignees: ''
If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
Please tag fewer than 3 people.
Models:
- albert, bert, xlm: @LysandreJik
- blenderbot, bart, marian, pegasus, encoderdecoder, t5: @patrickvonplaten, @patil-suraj
- longformer, reformer, transfoxl, xlnet: @patrickvonplaten
- fsmt: @stas00
- funnel: @sgugger
- gpt2: @patrickvonplaten, @LysandreJik
- rag: @patrickvonplaten, @lhoestq
- tensorflow: @LysandreJik
Library:
- benchmarks: @patrickvonplaten
- deepspeed: @stas00
- ray/raytune: @richardliaw, @amogkam
- text generation: @patrickvonplaten
- tokenizers: @LysandreJik
- trainer: @sgugger
- pipelines: @LysandreJik
Documentation: @sgugger
Model hub:
- for issues with a model report at https://discuss.huggingface.co/ and tag the model's creator.
HF projects:
- datasets: [different repo](https://github.com/huggingface/datasets)
- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
Examples:
- maintained examples (not research project or legacy): @sgugger, @patil-suraj
- research_projects/bert-loses-patience: @JetRunner
- research_projects/distillation: @VictorSanh
albert, bert, GPT2, XLM: @LysandreJik
tokenizers: @mfuntowicz
Trainer: @sgugger
Speed and Memory Benchmarks: @patrickvonplaten
Model Cards: @julien-c
TextGeneration: @TevenLeScao
examples/distillation: @VictorSanh
nlp datasets: [different repo](https://github.com/huggingface/nlp)
rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
Text Generation: @patrickvonplaten @TevenLeScao
Blenderbot: @patrickvonplaten
Bart: @patrickvonplaten
Marian: @patrickvonplaten
Pegasus: @patrickvonplaten
mBART: @patrickvonplaten
T5: @patrickvonplaten
Longformer/Reformer: @patrickvonplaten
TransfoXL/XLNet: @TevenLeScao
RAG: @patrickvonplaten, @lhoestq
FSMT: @stas00
examples/seq2seq: @patil-suraj
examples/bert-loses-patience: @JetRunner
ray/raytune: @richardliaw @amogkam
tensorflow: @jplu
examples/token-classification: @stefan-it
documentation: @sgugger
-->
## Information

View File

@@ -37,38 +37,26 @@ members/contributors which may be interested in your PR.
If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
Please tag fewer than 3 people.
Models:
- albert, bert, xlm: @LysandreJik
- blenderbot, bart, marian, pegasus, encoderdecoder, t5: @patrickvonplaten, @patil-suraj
- longformer, reformer, transfoxl, xlnet: @patrickvonplaten
- fsmt: @stas00
- funnel: @sgugger
- gpt2: @patrickvonplaten, @LysandreJik
- rag: @patrickvonplaten, @lhoestq
- tensorflow: @LysandreJik
Library:
- benchmarks: @patrickvonplaten
- deepspeed: @stas00
- ray/raytune: @richardliaw, @amogkam
- text generation: @patrickvonplaten
- tokenizers: @n1t0, @LysandreJik
- trainer: @sgugger
- pipelines: @LysandreJik
Documentation: @sgugger
HF projects:
- datasets: [different repo](https://github.com/huggingface/datasets)
- rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
Examples:
- maintained examples (not research project or legacy): @sgugger, @patil-suraj
- research_projects/bert-loses-patience: @JetRunner
- research_projects/distillation: @VictorSanh
albert, bert, XLM: @LysandreJik
GPT2: @LysandreJik, @patrickvonplaten
tokenizers: @mfuntowicz
Trainer: @sgugger
Benchmarks: @patrickvonplaten
Model Cards: @julien-c
examples/distillation: @VictorSanh
nlp datasets: [different repo](https://github.com/huggingface/nlp)
rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
Text Generation: @patrickvonplaten, @TevenLeScao
Blenderbot, Bart, Marian, Pegasus: @patrickvonplaten
T5: @patrickvonplaten
Rag: @patrickvonplaten, @lhoestq
EncoderDecoder: @patrickvonplaten
Longformer, Reformer: @patrickvonplaten
TransfoXL, XLNet: @TevenLeScao, @patrickvonplaten
examples/seq2seq: @patil-suraj
examples/bert-loses-patience: @JetRunner
tensorflow: @jplu
examples/token-classification: @stefan-it
documentation: @sgugger
FSMT: @stas00
-->

View File

@@ -14,7 +14,7 @@ requirements:
host:
- python
- pip
- numpy >=1.17
- numpy
- dataclasses
- packaging
- filelock
@@ -23,10 +23,10 @@ requirements:
- sacremoses
- regex !=2019.12.17
- protobuf
- tokenizers >=0.10.1,<0.11.0
- tokenizers ==0.9.4
run:
- python
- numpy >=1.17
- numpy
- dataclasses
- packaging
- filelock
@@ -35,7 +35,7 @@ requirements:
- sacremoses
- regex !=2019.12.17
- protobuf
- tokenizers >=0.10.1,<0.11.0
- tokenizers ==0.9.4
test:
imports:

18
.github/stale.yml vendored Normal file
View File

@@ -0,0 +1,18 @@
# Number of days of inactivity before an issue becomes stale
daysUntilStale: 60
# Number of days of inactivity before a stale issue is closed
daysUntilClose: 7
# Issues with these labels will never be considered stale
exemptLabels:
- pinned
- security
- Feature request
# Label to use when marking an issue as stale
staleLabel: wontfix
# Comment to post when marking an issue as stale. Set to `false` to disable
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs. Thank you
for your contributions.
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: false

View File

@@ -1,13 +1,12 @@
name: Model templates runner
on:
pull_request:
push:
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
types: [assigned, opened, synchronize, reopened]
jobs:
run_tests_templates:

View File

@@ -10,90 +10,143 @@ on:
- "tests/**"
- ".github/**"
- "templates/**"
# pull_request:
repository_dispatch:
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
jobs:
run_tests_torch_gpu:
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
runs-on: [self-hosted, gpu, single-gpu]
steps:
- name: Launcher docker
uses: actions/checkout@v2
- name: NVIDIA-SMI
- uses: actions/checkout@v2
- name: Python version
run: |
nvidia-smi
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-tests_torch_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
apt -y update && apt install -y libsndfile1-dev
source .env/bin/activate
pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
# - name: Create model files
# run: |
# source .env/bin/activate
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
CUDA_VISIBLE_DEVICES: 0
run: |
python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_gpu tests
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_torch_gpu_test_reports
path: reports
run_tests_tf_gpu:
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
runs-on: [self-hosted, gpu, single-gpu]
steps:
- name: Launcher docker
uses: actions/checkout@v2
- name: NVIDIA-SMI
- uses: actions/checkout@v2
- name: Python version
run: |
nvidia-smi
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-tests_tf_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece]
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Create model files
run: |
source .env/bin/activate
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
- name: Run all non-slow tests on GPU
env:
TF_NUM_INTRAOP_THREADS: 8
TF_NUM_INTEROP_THREADS: 1
OMP_NUM_THREADS: 1
CUDA_VISIBLE_DEVICES: 0
run: |
python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_gpu tests
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
@@ -101,42 +154,58 @@ jobs:
name: run_all_tests_tf_gpu_test_reports
path: reports
run_tests_torch_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- name: Launcher docker
uses: actions/checkout@v2
- name: NVIDIA-SMI
- uses: actions/checkout@v2
- name: Python version
run: |
nvidia-smi
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
apt -y update && apt install -y libsndfile1-dev
source .env/bin/activate
pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all non-slow tests on GPU
env:
MKL_SERVICE_FORCE_INTEL: 1
OMP_NUM_THREADS: 1
run: |
python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_multi_gpu_failures_short.txt
run: cat reports/tests_torch_multi_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
@@ -146,34 +215,52 @@ jobs:
path: reports
run_tests_tf_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- name: Launcher docker
uses: actions/checkout@v2
- name: NVIDIA-SMI
- uses: actions/checkout@v2
- name: Python version
run: |
nvidia-smi
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece]
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all non-slow tests on GPU
env:
TF_NUM_INTRAOP_THREADS: 8
TF_NUM_INTEROP_THREADS: 1
OMP_NUM_THREADS: 1
run: |
python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -185,22 +272,4 @@ jobs:
with:
name: run_all_tests_tf_multi_gpu_test_reports
path: reports
send_results:
name: Send results to webhook
runs-on: ubuntu-latest
if: always()
needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu]
steps:
- uses: actions/checkout@v2
- uses: actions/download-artifact@v2
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
run: |
pip install slack_sdk
python utils/notification_service.py push

View File

@@ -1,66 +1,82 @@
# configuration notes:
#
# - `source .env/bin/activate` is currently needed to be run first thing first in each step. Otherwise
# the step uses the system-wide python interpreter.
name: Self-hosted runner (scheduled)
on:
push:
branches:
- multi_ci_*
repository_dispatch:
schedule:
- cron: "0 0 * * *"
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
RUN_SLOW: yes
OMP_NUM_THREADS: 16
MKL_NUM_THREADS: 16
jobs:
run_all_tests_torch_gpu:
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
runs-on: [self-hosted, gpu, single-gpu]
steps:
- name: Launcher docker
uses: actions/checkout@v2
- uses: actions/checkout@v2
- name: NVIDIA-SMI
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v 1.1-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
nvidia-smi
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
apt -y update && apt install -y libsndfile1-dev
source .env/bin/activate
pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all tests on GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_gpu_failures_short.txt
- name: Run examples tests on GPU
if: ${{ always() }}
env:
OMP_NUM_THREADS: 16
MKL_NUM_THREADS: 16
OMP_NUM_THREADS: 1
RUN_SLOW: yes
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
run: |
source .env/bin/activate
pip install -r examples/_tests_requirements.txt
python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples
python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples
- name: Failure short reports
if: ${{ always() }}
@@ -69,9 +85,13 @@ jobs:
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
run: |
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -84,36 +104,60 @@ jobs:
name: run_all_tests_torch_gpu_test_reports
path: reports
run_all_tests_tf_gpu:
runs-on: [self-hosted, docker-gpu, single-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Launcher docker
uses: actions/checkout@v2
- name: NVIDIA-SMI
run_all_tests_tf_gpu:
runs-on: [self-hosted, gpu, single-gpu]
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
nvidia-smi
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[sklearn,testing,onnx,sentencepiece]
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all tests on GPU
env:
TF_NUM_INTEROP_THREADS: 1
TF_NUM_INTRAOP_THREADS: 16
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_gpu_failures_short.txt
@@ -121,15 +165,17 @@ jobs:
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
TF_NUM_INTEROP_THREADS: 1
TF_NUM_INTRAOP_THREADS: 16
run: |
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_pipeline_gpu_failures_short.txt
run: cat reports/tests_tf_pipelines_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
@@ -137,49 +183,86 @@ jobs:
with:
name: run_all_tests_tf_gpu_test_reports
path: reports
run_all_tests_torch_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- name: Launcher docker
uses: actions/checkout@v2
- uses: actions/checkout@v2
- name: NVIDIA-SMI
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
nvidia-smi
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
apt -y update && apt install -y libsndfile1-dev
source .env/bin/activate
pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all tests on GPU
- name: Run all tests on multi-GPU
env:
MKL_SERVICE_FORCE_INTEL: 1
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_multi_gpu_failures_short.txt
- name: Run all pipeline tests on GPU
- name: Run examples tests on multi-GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt
- name: Run all pipeline tests on multi-GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
run: |
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
@@ -193,48 +276,73 @@ jobs:
path: reports
run_all_tests_tf_multi_gpu:
runs-on: [self-hosted, docker-gpu, multi-gpu]
container:
image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- name: Launcher docker
uses: actions/checkout@v2
- uses: actions/checkout@v2
- name: NVIDIA-SMI
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
nvidia-smi
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[sklearn,testing,onnx,sentencepiece]
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all tests on GPU
- name: Run all tests on multi-GPU
env:
TF_NUM_INTEROP_THREADS: 1
TF_NUM_INTRAOP_THREADS: 16
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_multi_gpu_failures_short.txt
- name: Run all pipeline tests on GPU
- name: Run all pipeline tests on multi-GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
TF_NUM_INTEROP_THREADS: 1
TF_NUM_INTRAOP_THREADS: 16
run: |
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt
@@ -245,23 +353,4 @@ jobs:
with:
name: run_all_tests_tf_multi_gpu_test_reports
path: reports
send_results:
name: Send results to webhook
runs-on: ubuntu-latest
if: always()
needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
steps:
- uses: actions/checkout@v2
- uses: actions/download-artifact@v2
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
run: |
pip install slack_sdk
python utils/notification_service.py scheduled

View File

@@ -1,27 +0,0 @@
name: Stale Bot
on:
schedule:
- cron: "0 0 * * *"
jobs:
close_stale_issues:
name: Close Stale Issues
if: github.repository == 'huggingface/transformers'
runs-on: ubuntu-latest
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v2
- name: Setup Python
uses: actions/setup-python@v1
with:
python-version: 3.7
- name: Install requirements
run: |
pip install PyGithub
- name: Close stale issues
run: |
python scripts/stale.py

View File

@@ -207,8 +207,6 @@ You are not required to read the following guidelines before opening an issue. H
Do not dispair if you can't figure it out from the begining, just share what you can and perhaps someone else will be able to help you at the forums.
If your setup involves any custom datasets, the best way to help us reproduce the problem is to create a [Google Colab notebook](https://colab.research.google.com/) that demonstrates the issue and once you verify that the issue still exists, include a link to that notebook in the Issue. Just make sure that you don't copy and paste the location bar url of the open notebook - as this is private and we won't be able to open it. Instead, you need to click on `Share` in the right upper corner of the notebook, select `Get Link` and then copy and paste the public link it will give to you.
7. If you forked off some of this project's code or example applications, please, do not ask us to go into your code repository and figure out what you may have done. The code is already very complex and unless there is an easy way to do a diff and it's a small diff, it won't be possible to find someone with time on their hands to make a lengthy investigation. Albeit, you might find someone at the forums who will be generous to do this for you.
8. Before reporting an issue, first, always try to update your environment to the latest official version of this library. We have no resources to go and debug older revisions, which could easily have bugs that have been fixed in the latest released version.

View File

@@ -19,44 +19,33 @@ modified_only_fixup:
deps_table_update:
@python setup.py deps_table_update
# autogenerating code
autogenerate_code: deps_table_update
python utils/class_mapping_update.py
# Check that source code meets quality standards
extra_quality_checks:
extra_quality_checks: deps_table_update
python utils/check_copies.py
python utils/check_table.py
python utils/check_dummies.py
python utils/check_repo.py
python utils/check_inits.py
python utils/style_doc.py src/transformers docs/source --max_len 119
# this target runs checks on all files
quality:
black --check $(check_dirs)
isort --check-only $(check_dirs)
python utils/custom_init_isort.py --check_only
flake8 $(check_dirs)
python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
${MAKE} extra_quality_checks
# Format source code automatically and check is there are any problems left that need manual fixing
extra_style_checks:
python utils/custom_init_isort.py
python utils/style_doc.py src/transformers docs/source --max_len 119
# this target runs checks on all files and potentially modifies some of them
style:
style: deps_table_update
black $(check_dirs)
isort $(check_dirs)
${MAKE} autogenerate_code
${MAKE} extra_style_checks
python utils/style_doc.py src/transformers docs/source --max_len 119
# Super fast fix and check target that only works on relevant modified files since the branch was made
fixup: modified_only_fixup extra_style_checks autogenerate_code extra_quality_checks
fixup: modified_only_fixup extra_quality_checks
# Make marked copies of snippets of codes conform to the original
@@ -75,27 +64,7 @@ test:
test-examples:
python -m pytest -n auto --dist=loadfile -s -v ./examples/
# Run tests for SageMaker DLC release
test-sagemaker: # install sagemaker dependencies in advance with pip install .[sagemaker]
TEST_SAGEMAKER=True python -m pytest -n auto -s -v ./tests/sagemaker
# Check that docs can build
docs:
cd docs && make html SPHINXOPTS="-W -j 4"
# Release stuff
pre-release:
python utils/release.py
pre-patch:
python utils/release.py --patch
post-release:
python utils/release.py --post_release
post-patch:
python utils/release.py --post_release --patch

View File

@@ -55,7 +55,7 @@ Here are a few examples:
- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
- [Name Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
- [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
- [Natural Langugage Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
- [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
- [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
@@ -167,7 +167,7 @@ When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be
pip install transformers
```
If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you must [install the library from source](https://huggingface.co/transformers/installation.html#installing-from-source).
If you'd like to play with the examples, you must [install the library from source](https://huggingface.co/transformers/installation.html#installing-from-source).
### With conda
@@ -179,7 +179,7 @@ Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
conda install -c huggingface transformers
```
Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
## Models architectures
@@ -194,15 +194,11 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft Research) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
@@ -213,34 +209,27 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
ultilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable)

View File

@@ -26,7 +26,7 @@ pip install -e ".[docs]"
---
**NOTE**
You only need to generate the documentation to inspect it locally (if you're planning changes and want to
You only need to generate the documentation to inspect it locally (if you're planning changes and want to
check how they look like before committing for instance). You don't have to commit the built documentation.
---
@@ -65,7 +65,7 @@ make html
```
A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your
browser.
browser.
---
**NOTE**
@@ -95,15 +95,15 @@ following these steps:
expand them).
- Click on "details" next to the `ci/circleci: build_doc` check.
- In the new window, click on the "Artifacts" tab.
- Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a
- Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a
preview.
## Writing Documentation - Specification
The `huggingface/transformers` documentation follows the
[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is
mostly written in ReStructuredText
([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html),
mostly written in ReStructuredText
([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html),
[Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)).
@@ -121,8 +121,8 @@ four.
### Adding a new model
When adding a new model:
- Create a file `xxx.rst` under `./source/model_doc` (don't hesitate to copy an existing file as template).
- Create a file `xxx.rst` under `./source/model_doc` (don't hesitate to copy an existing file as template).
- Link that file in `./source/index.rst` on the `model_doc` toc-tree.
- Write a short overview of the model:
- Overview with paper & authors
@@ -130,8 +130,8 @@ When adding a new model:
- Tips and tricks and how to use it best
- Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and
every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow.
The order is generally:
- Configuration,
The order is generally:
- Configuration,
- Tokenizer
- PyTorch base model
- PyTorch head models
@@ -179,7 +179,7 @@ Links should be done as so (note the double underscore at the end): \`text for t
#### Defining arguments in a method
Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation.
Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation.
The argument should be followed by its type, with its shape if it is a tensor, and a line return.
Another indentation is necessary before writing the description of the argument.
@@ -216,9 +216,9 @@ then its documentation should look like this:
Note that we always omit the "defaults to :obj:\`None\`" when None is the default for any argument. Also note that even
if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
however write as many lines as you want in the indented description (see the example above with `input_ids`).
however write as many lines as you want in the indented description (see the example above with `input_ids`).
#### Writing a multi-line code block
#### Writing a multi-line code block
Multi-line code blocks can be useful for displaying examples. They are done like so:
@@ -237,7 +237,7 @@ the results stay consistent with the library.
#### Writing a return block
Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation.
Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation.
The first line should be the type of the return, followed by a line return. No need to indent further for the elements
building the return.
@@ -258,43 +258,3 @@ Here's an example for a single value return:
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
```
#### Adding a new section
In ReST section headers are designated as such with the help of a line of underlying characters, e.g.,:
```
Section 1
^^^^^^^^^^^^^^^^^^
Sub-section 1
~~~~~~~~~~~~~~~~~~
```
ReST allows the use of any characters to designate different section levels, as long as they are used consistently within the same document. For details see [sections doc](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#sections). Because there is no standard different documents often end up using different characters for the same levels which makes it very difficult to know which character to use when creating a new section.
Specifically, if when running `make docs` you get an error like:
```
docs/source/main_classes/trainer.rst:127:Title level inconsistent:
```
you picked an inconsistent character for some of the levels.
But how do you know which characters you must use for an already existing level or when adding a new level?
You can use this helper script:
```
perl -ne '/^(.)\1{100,}/ && do { $h{$1}=++$c if !$h{$1} }; END { %h = reverse %h ; print "$_ $h{$_}\n" for sort keys %h}' docs/source/main_classes/trainer.rst
1 -
2 ~
3 ^
4 =
5 "
```
This tells you which characters have already been assigned for each level.
So using this particular example's output -- if your current section's header uses `=` as its underline character, you now know you're at level 4, and if you want to add a sub-section header you know you want `"` as it'd level 5.
If you needed to add yet another sub-level, then pick a character that is not used already. That is you must pick a character that is not in the output of that script.
Here is the full list of characters that can be used in this context: `= - ` : ' " ~ ^ _ * + # < >`

View File

@@ -1,13 +1,10 @@
// These two things need to be updated at each release for the version selector.
// Last stable version
const stableVersion = "v4.4.2"
const stableVersion = "v4.1.1"
// Dictionary doc folder to label. The last stable version should have an empty key.
const versionMapping = {
"master": "master",
"": "v4.4.0/v4.4.1/v4.4.2 (stable)",
"v4.3.3": "v4.3.0/v4.3.1/v4.3.2/v4.3.3",
"v4.2.2": "v4.2.0/v4.2.1/v4.2.2",
"v4.1.1": "v4.1.0/v4.1.1",
"": "v4.1.1 (stable)",
"v4.0.1": "v4.0.0/v4.0.1",
"v3.5.1": "v3.5.0/v3.5.1",
"v3.4.0": "v3.4.0",
@@ -62,7 +59,7 @@ function addIcon() {
function addCustomFooter() {
const customFooter = document.createElement("div");
const questionOrIssue = document.createElement("div");
questionOrIssue.innerHTML = "Stuck? Read our <a href='https://huggingface.co/blog'>Blog posts</a> or <a href='https://github.com/huggingface/transformers'>Create an issue</a>";
questionOrIssue.innerHTML = "Stuck? Read our <a href='https://medium.com/huggingface'>Blog posts</a> or <a href='https://github.com/huggingface/transformers'>Create an issue</a>";
customFooter.appendChild(questionOrIssue);
customFooter.classList.add("footer");
@@ -129,11 +126,11 @@ function addVersionControl() {
const parts = location.toString().split('/');
let versionIndex = parts.length - 2;
// Index page may not have a last part with filename.html so we need to go up
if (parts[parts.length - 1] != "" && ! parts[parts.length - 1].match(/\.html/)) {
if (parts[parts.length - 1] != "" && ! parts[parts.length - 1].match(/\.html$|^search.html?/)) {
versionIndex = parts.length - 1;
}
// Main classes and models are nested so we need to go deeper
else if (parts[versionIndex] == "main_classes" || parts[versionIndex] == "model_doc" || parts[versionIndex] == "internal") {
else if (parts[versionIndex] == "main_classes" || parts[versionIndex] == "model_doc") {
versionIndex = versionIndex - 1;
}
const version = parts[versionIndex];

View File

@@ -1,844 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
How to add a model to 🤗 Transformers?
=======================================================================================================================
Adding a new model is often difficult and requires an in-depth knowledge of the 🤗 Transformers library and ideally also
of the model's original repository. At Hugging Face, we are trying to empower the community more and more to add models
independently. Thus, for some new models that the community wants to be added to 🤗 Transformers, we create a customized
*call-for-model-addition* that explains step-by-step how to add the requested model. With this
*call-for-model-addition*, we want to teach a motivated and experienced contributor of the community how to port a
model to 🤗 Transformers.
If this sounds like something you would be interested in, feel free to check out the currently open
“calls-for-model-addition” `here
<https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model/open_model_proposals/README.md>`__
and to contact us.
If selected, you will then work closely with one member of the Hugging Face team to integrate the model into 🤗
Transformers. By doing so, you will both gain a theoretical and deep practical understanding of the proposed model. But
more importantly, you will have made a major open-source contribution to 🤗 Transformers. Along the way, you will:
- get insights into open-source best practices
- understand the design principles of one of the most popular NLP libraries
- learn how to do efficiently test large NLP models
- learn how to integrate Python utilities like ``black``, ``isort``, ``make fix-copies`` into a library to always
ensure clean and readable code
We are also more than happy if you want to add a model that cannot be found in the “calls-for-model-addition” folder.
The following sections explain in detail how to add a new model. It might also be very helpful to check out already
added models to see if those resemble the model you would like to add `here
<https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed>`__.
To start, let's try to get a general overview of the Transformers library.
General overview of 🤗 Transformers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
First, you should get a general overview of 🤗 Transformers. 🤗 Transformers is a very opinionated library, so there is a
chance that you don't agree with some of the library's philosophies or design choices. From our experience, however, we
found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗
Transformers while keeping maintenance costs at a reasonable level.
A good first starting point to better understand the library is to read the :doc:`documentation of our philosophy
<philosophy>`. As a result of our way of working, there are some choices that we try to apply to all models:
- Composition is generally favored over-abstraction
- Duplicating code is not always bad if it strongly improves the readability or accessibility of a model
- Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only
have to look into the respective ``modeling_....py`` file.
In our opinion, the library's code is not just a means to provide a product, *e.g.* the ability to use BERT for
inference, but also as the very product that we want to improve. Hence, when adding a model, the user is not only the
person that will use your model, but also everybody that will read, try to understand, and possibly tweak your code.
With this in mind, let's go a bit deeper into the general library design.
Overview of models
-----------------------------------------------------------------------------------------------------------------------
To successfully add a model, it is important to understand the interaction between your model and its config,
:class:`~transformers.PreTrainedModel`, and :class:`~transformers.PretrainedConfig`. For exemplary purposes, we will
call the model to be added to 🤗 Transformers ``BrandNewBert``.
Let's take a look:
.. image:: ./imgs/transformers_overview.png
As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute
minimum. There are never more than two levels of abstraction for any model in the library. :obj:`BrandNewBertModel`
inherits from :obj:`BrandNewBertPreTrainedModel` which in turn inherits from :class:`~transformres.PreTrainedModel` and
that's it. As a general rule, we want to make sure that a new model only depends on
:class:`~transformers.PreTrainedModel`. The important functionalities that are automatically provided to every new
model are :meth:`~transformers.PreTrainedModel.from_pretrained` and
:meth:`~transformers.PreTrainedModel.save_pretrained`, which are used for serialization and deserialization. All of the
other important functionalities, such as :meth:`BrandNewBertModel.forward` should be completely defined in the new
``modeling_brand_new_bert.py`` script. Next, we want to make sure that a model with a specific head layer, such as
:obj:`BrandNewBertForMaskedLM` does not inherit from :obj:`BrandNewBertModel`, but rather uses :obj:`BrandNewBertModel`
as a component that can be called in its forward pass to keep the level of abstraction low. Every new model requires a
configuration class, called :obj:`BrandNewBertConfig`. This configuration is always stored as an attribute in
:class:`~transformers.PreTrainedModel`, and thus can be accessed via the ``config`` attribute for all classes
inheriting from :obj:`BrandNewBertPreTrainedModel`:
.. code:: python
model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
model.config # model has access to its config
Similar to the model, the configuration inherits basic serialization and deserialization functionalities from
:class:`~transformers.PretrainedConfig`. Note that the configuration and the model are always serialized into two
different formats - the model to a `pytorch_model.bin` file and the configuration to a `config.json` file. Calling
:meth:`~transformers.PreTrainedModel.save_pretrained` will automatically call
:meth:`~transformers.PretrainedConfig.save_pretrained`, so that both model and configuration are saved.
Overview of tokenizers
-----------------------------------------------------------------------------------------------------------------------
Not quite ready yet :-( This section will be added soon!
Step-by-step recipe to add a model to 🤗 Transformers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries
of how other contributors ported models to Hugging Face. Here is a list of community blog posts on how to port a model:
1. `Porting GPT2 Model <https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28>`__ by `Thomas
<https://huggingface.co/thomwolf>`__
2. `Porting WMT19 MT Model <https://huggingface.co/blog/porting-fsmt>`__ by `Stas <https://huggingface.co/stas>`__
From experience, we can tell you that the most important things to keep in mind when adding a model are:
- Don't reinvent the wheel! Most parts of the code you will add for the new 🤗 Transformers model already exist
somewhere in 🤗 Transformers. Take some time to find similar, already existing models and tokenizers you can copy
from. `grep <https://www.gnu.org/software/grep/>`__ and `rg <https://github.com/BurntSushi/ripgrep>`__ are your
friends. Note that it might very well happen that your model's tokenizer is based on one model implementation, and
your model's modeling code on another one. *E.g.* FSMT's modeling code is based on BART, while FSMT's tokenizer code
is based on XLM.
- It's more of an engineering challenge than a scientific challenge. You should spend more time on creating an
efficient debugging environment than trying to understand all theoretical aspects of the model in the paper.
- Ask for help, when you're stuck! Models are the core component of 🤗 Transformers so that we at Hugging Face are more
than happy to help you at every step to add your model. Don't hesitate to ask if you notice you are not making
progress.
In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers.
The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do
List:
- 1. ☐ (Optional) Understood theoretical aspects
- 2. ☐ Prepared transformers dev environment
- 3. ☐ Set up debugging environment of the original repository
- 4. ☐ Created script that successfully runs forward pass using original repository and checkpoint
- 5. ☐ Successfully added the model skeleton to Transformers
- 6. ☐ Successfully converted original checkpoint to Transformers checkpoint
- 7. ☐ Successfully ran forward pass in Transformers that gives identical output to original checkpoint
- 8. ☐ Finished model tests in Transformers
- 9. ☐ Successfully added Tokenizer in Transformers
- 10. ☐ Run end-to-end integration tests
- 11. ☐ Finished docs
- 12. ☐ Uploaded model weights to the hub
- 13. ☐ Submitted the pull request
- 14. ☐ (Optional) Added a demo notebook
To begin with, we usually recommend to start by getting a good theoretical understanding of ``BrandNewBert``. However,
if you prefer to understand the theoretical aspects of the model *on-the-job*, then it is totally fine to directly dive
into the ``BrandNewBert``'s code-base. This option might suit you better, if your engineering skills are better than
your theoretical skill, if you have trouble understanding ``BrandNewBert``'s paper, or if you just enjoy programming
much more than reading scientific papers.
1. (Optional) Theoretical aspects of BrandNewBert
-----------------------------------------------------------------------------------------------------------------------
You should take some time to read *BrandNewBert's* paper, if such descriptive work exists. There might be large
sections of the paper that are difficult to understand. If this is the case, this is fine - don't worry! The goal is
not to get a deep theoretical understanding of the paper, but to extract the necessary information required to
effectively re-implement the model in 🤗 Transformers. That being said, you don't have to spend too much time on the
theoretical aspects, but rather focus on the practical ones, namely:
- What type of model is *brand_new_bert*? BERT-like encoder-only model? GPT2-like decoder-only model? BART-like
encoder-decoder model? Look at the :doc:`model_summary` if you're not familiar with the differences between those.
- What are the applications of *brand_new_bert*? Text classification? Text generation? Seq2Seq tasks, *e.g.,*
summarization?
- What is the novel feature of the model making it different from BERT/GPT-2/BART?
- Which of the already existing `🤗 Transformers models <https://huggingface.co/transformers/#contents>`__ is most
similar to *brand_new_bert*?
- What type of tokenizer is used? A sentencepiece tokenizer? Word piece tokenizer? Is it the same tokenizer as used
for BERT or BART?
After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the
Hugging Face team with any questions you might have. This might include questions regarding the model's architecture,
its attention layer, etc. We will be more than happy to help you.
2. Next prepare your environment
-----------------------------------------------------------------------------------------------------------------------
1. Fork the `repository <https://github.com/huggingface/transformers>`__ by clicking on the Fork' button on the
repository's page. This creates a copy of the code under your GitHub user account.
2. Clone your ``transformers`` fork to your local disk, and add the base repository as a remote:
.. code:: bash
git clone https://github.com/[your Github handle]/transformers.git
cd transformers
git remote add upstream https://github.com/huggingface/transformers.git
3. Set up a development environment, for instance by running the following command:
.. code:: bash
python -m venv .env
source .env/bin/activate
pip install -e ".[dev]"
and return to the parent directory
.. code:: bash
cd ..
4. We recommend adding the PyTorch version of *brand_new_bert* to Transformers. To install PyTorch, please follow the
instructions on https://pytorch.org/get-started/locally/.
**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient.
5. To port *brand_new_bert*, you will also need access to its original repository:
.. code:: bash
git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git
cd brand_new_bert
pip install -e .
Now you have set up a development environment to port *brand_new_bert* to 🤗 Transformers.
3.-4. Run a pretrained checkpoint using the original repository
-----------------------------------------------------------------------------------------------------------------------
At first, you will work on the original *brand_new_bert* repository. Often, the original implementation is very
“researchy”. Meaning that documentation might be lacking and the code can be difficult to understand. But this should
be exactly your motivation to reimplement *brand_new_bert*. At Hugging Face, one of our main goals is to *make people
stand on the shoulders of giants* which translates here very well into taking a working model and rewriting it to make
it as **accessible, user-friendly, and beautiful** as possible. This is the number-one motivation to re-implement
models into 🤗 Transformers - trying to make complex new NLP technology accessible to **everybody**.
You should start thereby by diving into the original repository.
Successfully running the official pretrained model in the original repository is often **the most difficult** step.
From our experience, it is very important to spend some time getting familiar with the original code-base. You need to
figure out the following:
- Where to find the pretrained weights?
- How to load the pretrained weights into the corresponding model?
- How to run the tokenizer independently from the model?
- Trace one forward pass so that you know which classes and functions are required for a simple forward pass. Usually,
you only have to reimplement those functions.
- Be able to locate the important components of the model: Where is the model's class? Are there model sub-classes,
*e.g.* EncoderModel, DecoderModel? Where is the self-attention layer? Are there multiple different attention layers,
*e.g.* *self-attention*, *cross-attention*...?
- How can you debug the model in the original environment of the repo? Do you have to add `print` statements, can you
work with an interactive debugger like `ipdb`, or should you use an efficient IDE to debug the model, like PyCharm?
It is very important that before you start the porting process, that you can **efficiently** debug code in the original
repository! Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or
even a pull request in the original repository. The maintainers of this repository are most likely very happy about
someone looking into their code!
At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original
model. We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to
dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. Only
at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the
model also works as expected on GPU.
In general, there are two possible debugging environments for running the original model
- `Jupyter notebooks <https://jupyter.org/>`__ / `google colab
<https://colab.research.google.com/notebooks/intro.ipynb>`__
- Local python scripts.
Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split
logical components from one another and to have faster debugging cycles as intermediate results can be stored. Also,
notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging
Face team for help. If you are familiar with Jupiter notebooks, we strongly recommend you to work with them.
The obvious disadvantage of Jupyther notebooks is that if you are not used to working with them you will have to spend
some time adjusting to the new programming environment and that you might not be able to use your known debugging tools
anymore, like ``ipdb``.
For each code-base, a good first step is always to load a **small** pretrained checkpoint and to be able to reproduce a
single forward pass using a dummy integer vector of input IDs as an input. Such a script could look like this (in
pseudocode):
.. code:: bash
model = BrandNewBertModel.load_pretrained_checkpoint(/path/to/checkpoint/)
input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids
original_output = model.predict(input_ids)
Next, regarding the debugging strategy, there are generally a few from which to choose from:
- Decompose the original model into many small testable components and run a forward pass on each of those for
verification
- Decompose the original model only into the original *tokenizer* and the original *model*, run a forward pass on
those, and use intermediate print statements or breakpoints for verification
Again, it is up to you which strategy to choose. Often, one or the other is advantageous depending on the original code
base.
If the original code-base allows you to decompose the model into smaller sub-components, *e.g.* if the original
code-base can easily be run in eager mode, it is usually worth the effort to do so. There are some important advantages
to taking the more difficult road in the beginning:
- at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically
for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead
of relying on visual comparison via print statements
- it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting
individual components and thus structure your work better
- separating the model into logical meaningful components will help you to get a better overview of the model's design
and thus to better understand the model
- at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue
changing your code
`Lysandre's <https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed>`__ integration checks for ELECTRA
gives a nice example of how this can be done.
However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode,
it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. A good
example is `T5's MeshTensorFlow <https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow>`__ library which is
very complex and does not offer a simple way to decompose the model into its sub-components. For such libraries, one
often relies on verifying print statements.
No matter which strategy you choose, the recommended procedure is often the same in that you should start to debug the
starting layers first and the ending layers last.
It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following
layers in the following order:
1. Retrieve the input IDs passed to the model
2. Retrieve the word embeddings
3. Retrieve the input of the first Transformer layer
4. Retrieve the output of the first Transformer layer
5. Retrieve the output of the following n - 1 Transformer layers
6. Retrieve the output of the whole BrandNewBert Model
Input IDs should thereby consists of an array of integers, *e.g.* ``input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]``
The outputs of the following layers often consist of multi-dimensional float arrays and can look like this:
.. code:: bash
[[
[-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024],
[-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132],
[-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648],
...,
[-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288],
[-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191],
[-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]],
We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original
model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001!
Since it is normal that the exact same model written in different libraries can give a slightly different output
depending on the library framework, we accept an error tolerance of 1e-3 (0.001). It is not enough if the model gives
nearly the same output, they have to be the almost identical. Therefore, you will certainly compare the intermediate
outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of
*brand_new_bert* in which case an **efficient** debugging environment of the original repository is absolutely
important. Here is some advice is to make your debugging environment as efficient as possible.
- Find the best way of debugging intermediate results. Is the original repository written in PyTorch? Then you should
probably take the time to write a longer script that decomposes the original model into smaller sub-components to
retrieve intermediate values. Is the original repository written in Tensorflow 1? Then you might have to rely on
TensorFlow print operations like `tf.print <https://www.tensorflow.org/api_docs/python/tf/print>`__ to output
intermediate values. Is the original repository written in Jax? Then make sure that the model is **not jitted** when
running the forward pass, *e.g.* check-out `this link <https://github.com/google/jax/issues/196>`__.
- Use the smallest pretrained checkpoint you can find. The smaller the checkpoint, the faster your debug cycle
becomes. It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds.
In case only very large checkpoints are available, it might make more sense to create a dummy model in the new
environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version
of your model
- Make sure you are using the easiest way of calling a forward pass in the original repository. Ideally, you want to
find the function in the original repository that **only** calls a single forward pass, *i.e.* that is often called
``predict``, ``evaluate``, ``forward`` or ``__call__``. You don't want to debug a function that calls ``forward``
multiple times, *e.g.* to generate text, like ``autoregressive_sample``, ``generate``.
- Try to separate the tokenization from the model's `forward` pass. If the original repository shows examples where
you have to input a string, then try to find out where in the forward call the string input is changed to input ids
and start from this point. This might mean that you have to possibly write a small script yourself or change the
original code so that you can directly input the ids instead of an input string.
- Make sure that the model in your debugging setup is **not** in training mode, which often causes the model to yield
random outputs due to multiple dropout layers in the model. Make sure that the forward pass in your debugging
environment is **deterministic** so that the dropout layers are not used. Or use `transformers.file_utils.set_seed`
if the old and new implementations are in the same framework.
The following section gives you more specific details/tips on how you can do this for *brand_new_bert*.
5.-14. Port BrandNewBert to 🤗 Transformers
-----------------------------------------------------------------------------------------------------------------------
Next, you can finally start adding new code to 🤗 Transformers. Go into the clone of your 🤗 Transformers' fork:
::
cd transformers
In the special case that you are adding a model whose architecture exactly matches the model architecture of an
existing model you only have to add a conversion script as described in `this section <#write-a-conversion-script>`__.
In this case, you can just re-use the whole model architecture of the already existing model.
Otherwise, let's start generating a new model with the amazing Cookiecutter!
**Use the Cookiecutter to automatically generate the model's code**
To begin with head over to the `🤗 Transformers templates
<https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model>`__ to make use of our
``cookiecutter`` implementation to automatically generate all the relevant files for your model. Again, we recommend
only adding the PyTorch version of the model at first. Make sure you follow the instructions of the ``README.md`` on
the `🤗 Transformers templates <https://github.com/huggingface/transformers/tree/master/templates/adding_a_new_model>`__
carefully.
**Open a Pull Request on the main huggingface/transformers repo**
Before starting to adapt the automatically generated code, now is the time to open a “Work in progress (WIP)” pull
request, *e.g.* “[WIP] Add *brand_new_bert*”, in 🤗 Transformers so that you and the Hugging Face team can work
side-by-side on integrating the model into 🤗 Transformers.
You should do the following:
1. Create a branch with a descriptive name from your master branch
::
git checkout -b add_brand_new_bert
2. Commit the automatically generated code:
::
git add .
git commit
3. Fetch and rebase to current master
::
git fetch upstream
git rebase upstream/master
4. Push the changes to your account using:
::
git push -u origin a-descriptive-name-for-my-changes
5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the
GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for
future changes.
6. Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page.
In the following, whenever you have done some progress, don't forget to commit your work and push it to your account so
that it shows in the pull request. Additionally, you should make sure to update your work with the current master from
time to time by doing:
::
git fetch upstream
git merge upstream/master
In general, all questions you might have regarding the model or your implementation should be asked in your PR and
discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
if you have a question. It is often very helpful to point the Hugging Face team to your added code so that the Hugging
Face team can efficiently understand your problem or question.
To do so, you can go to the “Files changed” tab where you see all of your changes, go to a line regarding which you
want to ask a question, and click on the “+” symbol to add a comment. Whenever a question or problem has been solved,
you can click on the “Resolve” button of the created comment.
In the same way, the Hugging Face team will open comments when reviewing your code. We recommend asking most questions
on GitHub on your PR. For some very general questions that are not very useful for the public, feel free to ping the
Hugging Face team by Slack or email.
**5. Adapt the generated models code for brand_new_bert**
At first, we will focus only on the model itself and not care about the tokenizer. All the relevant code should be
found in the generated files ``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` and
``src/transformers/models/brand_new_bert/configuration_brand_new_bert.py``.
Now you can finally start coding :). The generated code in
``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` will either have the same architecture as BERT if
it's an encoder-only model or BART if it's an encoder-decoder model. At this point, you should remind yourself what
you've learned in the beginning about the theoretical aspects of the model: *How is the model different from BERT or
BART?*". Implement those changes which often means to change the *self-attention* layer, the order of the normalization
layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to
get a better feeling of how your model should be implemented.
**Note** that at this point, you don't have to be very sure that your code is fully correct or clean. Rather, it is
advised to add a first *unclean*, copy-pasted version of the original code to
``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` until you feel like all the necessary code is
added. From our experience, it is much more efficient to quickly add a first version of the required code and
improve/correct the code iteratively with the conversion script as described in the next section. The only thing that
has to work at this point is that you can instantiate the 🤗 Transformers implementation of *brand_new_bert*, *i.e.* the
following command should work:
.. code:: python
from transformers import BrandNewBertModel, BrandNewBertConfig
model = BrandNewBertModel(BrandNewBertConfig())
The above command will create a model according to the default parameters as defined in ``BrandNewBertConfig()`` with
random weights, thus making sure that the ``init()`` methods of all components works.
**6. Write a conversion script**
Next, you should write a conversion script that lets you convert the checkpoint you used to debug *brand_new_bert* in
the original repository to a checkpoint compatible with your just created 🤗 Transformers implementation of
*brand_new_bert*. It is not advised to write the conversion script from scratch, but rather to look through already
existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in
the same framework as *brand_new_bert*. Usually, it is enough to copy an already existing conversion script and
slightly adapt it for your use case. Don't hesitate to ask the Hugging Face team to point you to a similar already
existing conversion script for your model.
- If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script `here
<https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91>`__
- If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script `here
<https://github.com/huggingface/transformers/blob/master/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py>`__
In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. In PyTorch, the
name of a layer is defined by the name of the class attribute you give the layer. Let's define a dummy model in
PyTorch, called ``SimpleModel`` as follows:
.. code:: python
import torch.nn as nn
class SimpleModel(nn.Module):
def __init__(self):
super().__init__()
self.dense = nn.Linear(10, 10)
self.intermediate = nn.Linear(10, 10)
self.layer_norm = nn.LayerNorm(10)
Now we can create an instance of this model definition which will fill all weights: ``dense``, ``intermediate``,
``layer_norm`` with random weights. We can print the model to see its architecture
.. code:: python
model = SimpleModel()
print(model)
This will print out the following:
.. code:: bash
SimpleModel(
(dense): Linear(in_features=10, out_features=10, bias=True)
(intermediate): Linear(in_features=10, out_features=10, bias=True)
(layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
)
We can see that the layer names are defined by the name of the class attribute in PyTorch. You can print out the weight
values of a specific layer:
.. code:: python
print(model.dense.weight.data)
to see that the weights were randomly initialized
.. code:: bash
tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212,
-0.2077, 0.2157],
[ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190,
0.2166, -0.0212],
[-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950,
-0.1023, -0.0447],
[-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415,
-0.1876, -0.2467],
[ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465,
0.2577, 0.0402],
[ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604,
0.2132, 0.1680],
[ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090,
0.2707, -0.2509],
[-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407,
0.1829, -0.1568],
[-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923,
0.0333, -0.0536],
[-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739,
0.2220, 0.2358]]).
In the conversion script, you should fill those randomly initialized weights with the exact weights of the
corresponding layer in the checkpoint. *E.g.*
.. code:: python
# retrieve matching layer weights, e.g. by
# recursive algorithm
layer_name = "dense"
pretrained_weight = array_of_dense_layer
model_pointer = getattr(model, "dense")
model_pointer.weight.data = torch.from_numpy(pretrained_weight)
While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding
pretrained checkpoint weight exactly match in both **shape and name**. To do so, it is **necessary** to add assert
statements for the shape and print out the names of the checkpoints weights. E.g. you should add statements like:
.. code:: python
assert (
model_pointer.weight.shape == pretrained_weight.shape
), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
Besides, you should also print out the names of both weights to make sure they match, *e.g.*
.. code:: python
logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly
initialized layer of the 🤗 Transformers implementation.
An incorrect shape is most likely due to an incorrect setting of the config parameters in ``BrandNewBertConfig()`` that
do not exactly match those that were used for the checkpoint you want to convert. However, it could also be that
PyTorch's implementation of a layer requires the weight to be transposed beforehand.
Finally, you should also check that **all** required weights are initialized and print out all checkpoint weights that
were not used for initialization to make sure the model is correctly converted. It is completely normal, that the
conversion trials fail with either a wrong shape statement or wrong name assignment. This is most likely because either
you used incorrect parameters in ``BrandNewBertConfig()``, have a wrong architecture in the 🤗 Transformers
implementation, you have a bug in the ``init()`` functions of one of the components of the 🤗 Transformers
implementation or you need to transpose one of the checkpoint weights.
This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the
Transformers model. Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save
the model under a folder of your choice ``/path/to/converted/checkpoint/folder`` that should then contain both a
``pytorch_model.bin`` file and a ``config.json`` file:
.. code:: python
model.save_pretrained("/path/to/converted/checkpoint/folder")
**7. Implement the forward pass**
Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make
sure that the forward pass is correctly implemented. In `Get familiar with the original repository
<#run-a-pretrained-checkpoint-using-the-original-repository>`__, you have already created a script that runs a forward
pass of the model using the original repository. Now you should write an analogous script using the 🤗 Transformers
implementation instead of the original one. It should look as follows:
.. code:: python
model = BrandNewBertModel.from_pretrained(/path/to/converted/checkpoint/folder)
input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
output = model(input_ids).last_hidden_states
It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact
same output the very first time or that the forward pass throws an error. Don't be disappointed - it's expected! First,
you should make sure that the forward pass doesn't throw any errors. It often happens that the wrong dimensions are
used leading to a `Dimensionality mismatch` error or that the wrong data type object is used, *e.g.* ``torch.long``
instead of ``torch.float32``. Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve
certain errors.
The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are
equivalent to a precision of ``1e-3``. First, you should ensure that the output shapes are identical, *i.e.*
``outputs.shape`` should yield the same value for the script of the 🤗 Transformers implementation and the original
implementation. Next, you should make sure that the output values are identical as well. This one of the most difficult
parts of adding a new model. Common mistakes why the outputs are not identical are:
- Some layers were not added, *i.e.* an `activation` layer was not added, or the residual connection was forgotten
- The word embedding matrix was not tied
- The wrong positional embeddings are used because the original implementation uses on offset
- Dropout is applied during the forward pass. To fix this make sure `model.training is False` and that no dropout
layer is falsely activated during the forward pass, *i.e.* pass `self.training` to `PyTorch's functional dropout
<https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout>`_
The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗
Transformers implementation side-by-side and check if there are any differences. Ideally, you should debug/print out
intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗
Transformers implementation shows a different output than the original implementation. First, make sure that the
hard-coded ``input_ids`` in both scripts are identical. Next, verify that the outputs of the first transformation of
the ``input_ids`` (usually the word embeddings) are identical. And then work your way up to the very last layer of the
network. At some point, you will notice a difference between the two implementations, which should point you to the bug
in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements
in both the original implementation and 🤗 Transformers implementation, at the same positions in the network
respectively, and to successively remove print statements showing the same values for intermediate presentions.
When you're confident that both implementations yield the same output, verifying the outputs with
``torch.allclose(original_output, output, atol=1e-3)``, you're done with the most difficult part! Congratulations - the
work left to be done should be a cakewalk 😊.
**8. Adding all necessary model tests**
At this point, you have successfully added a new model. However, it is very much possible that the model does not yet
fully comply with the required design. To make sure, the implementation is fully compatible with 🤗 Transformers, all
common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under
the same ``tests/test_modeling_brand_new_bert.py``. Run this test file to verify that all common tests pass:
.. code:: python
pytest tests/test_modeling_brand_new_bert.py
Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that
-
a) The community can easily understand your work by looking at specific tests of *brand_new_bert*
-
b) Future changes to your model will not break any important feature of the model.
At first, integration tests should be added. Those integration tests essentially do the same as the debugging scripts
you used earlier to implement the model to 🤗 Transformers. A template of those model tests is already added by the
Cookiecutter, called ``BrandNewBertModelIntegrationTests`` and only has to be filled out by you. To ensure that those
tests are passing, run
.. code:: python
RUN_SLOW=1 pytest -sv tests/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests
.. note::
In case you are using Windows, you should replace ``RUN_SLOW=1`` with ``SET RUN_SLOW=1``
Second, all features that are special to *brand_new_bert* should be tested additionally in a separate test under
``BrandNewBertModelTester``/``BrandNewBertModelTest``. This part is often forgotten but is extremely useful in two
ways:
- It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the
special features of *brand_new_bert* should work.
- Future contributors can quickly test changes to the model by running those special tests.
**9. Implement the tokenizer**
Next, we should add the tokenizer of *brand_new_bert*. Usually, the tokenizer is equivalent or very similar to an
already existing tokenizer of 🤗 Transformers.
It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗
Transformers' implementation of the tokenizer.
To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository
that inputs a string and returns the ``input_ids``. It could look similar to this (in pseudo-code):
.. code:: bash
input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
model = BrandNewBertModel.load_pretrained_checkpoint(/path/to/checkpoint/)
input_ids = model.tokenize(input_str)
You might have to take a deeper look again into the original repository to find the correct tokenizer function or you
might even have to do changes to your clone of the original repository to only output the ``input_ids``. Having written
a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be
created. It should look similar to this:
.. code:: python
from transformers import BrandNewBertTokenizer
input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
tokenizer = BrandNewBertTokenizer.from_pretrained(/path/to/tokenizer/folder/)
input_ids = tokenizer(input_str).input_ids
When both ``input_ids`` yield the same values, as a final step a tokenizer test file should also be added.
Analogous to the modeling test files of *brand_new_bert*, the tokenization test files of *brand_new_bert* should
contain a couple of hard-coded integration tests.
**10. Run End-to-end integration tests**
Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the
tokenizer to ``tests/test_modeling_brand_new_bert.py`` in 🤗 Transformers. Such a test should show on a meaningful
text-to-text sample that the 🤗 Transformers implementation works as expected. A meaningful text-to-text sample can
include *e.g.* a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none
of the ported checkpoints has been fine-tuned on a downstream task it is enough to simply rely on the model tests. In a
final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. It can
happen that you forgot to add some ``.to(self.device)`` statements to internal tensors of the model, which in such a
test would show in an error. In case you have no access to a GPU, the Hugging Face team can take care of running those
tests for you.
**11. Add Docstring**
Now, all the necessary functionality for *brand_new_bert* is added - you're almost done! The only thing left to add is
a nice docstring and a doc page. The Cookiecutter should have added a template file called
``docs/source/model_doc/brand_new_bert.rst`` that you should fill out. Users of your model will usually first look at
this page before using your model. Hence, the documentation must be understandable and concise. It is very useful for
the community to add some *Tips* to show how the model should be used. Don't hesitate to ping the Hugging Face team
regarding the docstrings.
Next, make sure that the docstring added to ``src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`` is
correct and included all necessary inputs and outputs. It is always to good to remind oneself that documentation should
be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact
point of the community with the model.
**Code refactor**
Great, now you have added all the necessary code for *brand_new_bert*. At this point, you should correct some potential
incorrect code style by running:
.. code:: bash
make style
and verify that your coding style passes the quality check:
.. code:: bash
make quality
There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in
the tests of your pull request. This is often because of some missing information in the docstring or some incorrect
naming. The Hugging Face team will surely help you if you're stuck here.
Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. With all
tests passing, now it's a good time to go over the added code again and do some refactoring.
You have now finished the coding part, congratulation! 🎉 You are Awesome! 😎
**12. Upload the models to the model hub**
In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each
uploaded model checkpoint. You should work alongside the Hugging Face team here to decide on a fitting name for each
checkpoint and to get the required access rights to be able to upload the model under the author's organization of
*brand_new_bert*.
It is worth spending some time to create fitting model cards for each checkpoint. The model cards should highlight the
specific characteristics of this particular checkpoint, *e.g.* On which dataset was the checkpoint
pretrained/fine-tuned on? On what down-stream task should the model be used? And also include some code on how to
correctly use the model.
**13. (Optional) Add notebook**
It is very helpful to add a notebook that showcases in-detail how *brand_new_bert* can be used for inference and/or
fine-tuned on a downstream task. This is not mandatory to merge your PR, but very useful for the community.
**14. Submit your finished PR**
You're done programming now and can move to the last step, which is getting your PR merged into master. Usually, the
Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished
PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your
reviewer.
Share your work!!
-----------------------------------------------------------------------------------------------------------------------
Now, it's time to get some credit from the community for your work! Having completed a model addition is a major
contribution to Transformers and the whole NLP community. Your code and the ported pre-trained models will certainly be
used by hundreds and possibly even thousands of developers and researchers. You should be proud of your work and share
your achievement with the community.
**You have made another model that is super easy to access for everyone in the community! 🤯**

View File

@@ -1,53 +0,0 @@
# Community
This page regroups resources around 🤗 Transformers developed by the community.
## Community resources:
| Resource | Description | Author |
|:----------|:-------------|------:|
| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | A set of flashcards based on the [Transformers Docs Glossary](https://huggingface.co/transformers/master/glossary.html) that has been put into a form which can be easily learnt/revised using [Anki ](https://apps.ankiweb.net/) an open source, cross platform app specifically designed for long term knowledge retention. See this [Introductory video on how to use the flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) |
## Community notebooks:
| Notebook | Description | Author | |
|:----------|:-------------|:-------------|------:|
| [Train T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) |
| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) |
| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) |
| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) |
| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | How to train on sequences as long as 500,000 tokens with Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) |
| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | How to fine-tune BART for summarization with fastai using blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) |
| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | How to generate tweets in the style of your favorite Twitter account by fine-tuning a GPT-2 model | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) |
| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | A complete tutorial showcasing W&B integration with Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) |
| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | How to build a "long" version of existing pretrained models | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) |
| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | How to fine-tune longformer model for QA task | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) |
| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | How to evaluate longformer on TriviaQA with `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) |
| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | How to fine-tune T5 for sentiment span extraction using a text-to-text format with PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) |
| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | How to fine-tune DistilBert for multiclass classification with PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)|
|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|How to fine-tune BERT for multi-label classification using PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|
|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|How to fine-tune T5 for summarization in PyTorch and track experiments with WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|
|[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)|
|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| How to train a Reformer model with bi-directional self-attention layers | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)|
|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)|
|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| How to fine tune BlenderBotSmall for summarization on a custom dataset, using the Trainer API. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)|
|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)|
|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | How to fine-tune a non-English GPT-2 Model with Trainer class | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)|
|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | How to fine-tune a DistilBERT Model for Multi Label Classification task | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)|
|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)|
|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune an Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | How to fine-tune DistilBERT for text classification in TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)|
|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | How to fine-tune *TapasForQuestionAnswering* with a *tapas-base* checkpoint on the Sequential Question Answering (SQA) dataset | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)|
|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | How to evaluate a fine-tuned *TapasForSequenceClassification* with a *tapas-base-finetuned-tabfact* checkpoint using a combination of the 🤗 datasets and 🤗 transformers libraries | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)|
|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | How to fine-tune mBART using Seq2SeqTrainer for Hindi to English translation | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)|
|[Fine-tune LayoutLM on FUNSD (a form understanding dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | How to fine-tune *LayoutLMForTokenClassification* on the FUNSD dataset for information extraction from scanned documents | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)|
|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | How to fine-tune DistilGPT2 and generate text | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)|
|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | How to fine-tune LED on pubmed for long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)|
|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | How to effectively evaluate LED on long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)|
|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | How to fine-tune *LayoutLMForSequenceClassification* on the RVL-CDIP dataset for scanned document classification | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)|
|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)|
|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)|
|[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)|

View File

@@ -14,24 +14,21 @@
#
import os
import sys
sys.path.insert(0, os.path.abspath("../../src"))
sys.path.insert(0, os.path.abspath('../../src'))
# -- Project information -----------------------------------------------------
project = "transformers"
copyright = "2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0"
author = "huggingface"
project = u'transformers'
copyright = u'2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0'
author = u'huggingface'
# The short X.Y version
version = ""
version = u''
# The full version, including alpha/beta/rc tags
release = "4.5.0.dev0"
release = u'4.2.0'
# Prefix link to point to master, comment this during version release and uncomment below line
extlinks = {"prefix_link": ("https://github.com/huggingface/transformers/blob/master/%s", "")}
extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/master/%s', '')}
# Prefix link to always point to corresponding version, uncomment this during version release
# extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/v'+ release + '/%s', '')}
@@ -45,28 +42,27 @@ extlinks = {"prefix_link": ("https://github.com/huggingface/transformers/blob/ma
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.extlinks",
"sphinx.ext.coverage",
"sphinx.ext.napoleon",
"recommonmark",
"sphinx.ext.viewcode",
"sphinx_markdown_tables",
"sphinxext.opengraph",
"sphinx_copybutton",
'sphinx.ext.autodoc',
'sphinx.ext.extlinks',
'sphinx.ext.coverage',
'sphinx.ext.napoleon',
'recommonmark',
'sphinx.ext.viewcode',
'sphinx_markdown_tables',
'sphinx_copybutton'
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
source_suffix = [".rst", ".md"]
source_suffix = ['.rst', '.md']
# source_suffix = '.rst'
# The master toctree document.
master_doc = "index"
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -78,7 +74,7 @@ language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
@@ -92,30 +88,20 @@ copybutton_prompt_is_regexp = True
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {"analytics_id": "UA-83738774-2", "navigation_with_keys": True}
# Configuration for OpenGraph and Twitter Card Tags.
# These are responsible for creating nice shareable social images https://ahrefs.com/blog/open-graph-meta-tags/
# https://ogp.me/#type_website
ogp_image = "https://huggingface.co/front/thumbnails/transformers.png"
ogp_description = "State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0. Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone"
ogp_description_length = 160
ogp_custom_meta_tags = [
f'<meta name="twitter:image" content="{ogp_image}">',
f'<meta name="twitter:description" content="{ogp_description}">',
]
html_theme_options = {
'analytics_id': 'UA-83738774-2'
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
@@ -127,17 +113,17 @@ html_static_path = ["_static"]
#
# html_sidebars = {}
# This must be the name of an image file (path relative to the configuration
# directory) that is the favicon of the docs. Modern browsers use this as
# the icon for tabs, windows and bookmarks. It should be a Windows-style
# This must be the name of an image file (path relative to the configuration
# directory) that is the favicon of the docs. Modern browsers use this as
# the icon for tabs, windows and bookmarks. It should be a Windows-style
# icon file (.ico).
html_favicon = "favicon.ico"
html_favicon = 'favicon.ico'
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = "transformersdoc"
htmlhelp_basename = 'transformersdoc'
# -- Options for LaTeX output ------------------------------------------------
@@ -146,12 +132,15 @@ latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
@@ -161,7 +150,8 @@ latex_elements = {
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, "transformers.tex", "transformers Documentation", "huggingface", "manual"),
(master_doc, 'transformers.tex', u'transformers Documentation',
u'huggingface', 'manual'),
]
@@ -169,7 +159,10 @@ latex_documents = [
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, "transformers", "transformers Documentation", [author], 1)]
man_pages = [
(master_doc, 'transformers', u'transformers Documentation',
[author], 1)
]
# -- Options for Texinfo output ----------------------------------------------
@@ -178,15 +171,9 @@ man_pages = [(master_doc, "transformers", "transformers Documentation", [author]
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(
master_doc,
"transformers",
"transformers Documentation",
author,
"transformers",
"One line description of project.",
"Miscellaneous",
),
(master_doc, 'transformers', u'transformers Documentation',
author, 'transformers', 'One line description of project.',
'Miscellaneous'),
]
@@ -205,13 +192,11 @@ epub_title = project
# epub_uid = ''
# A list of files that should not be packed into the epub file.
epub_exclude_files = ["search.html"]
epub_exclude_files = ['search.html']
def setup(app):
app.add_css_file("css/huggingface.css")
app.add_css_file("css/code-snippets.css")
app.add_js_file("js/custom.js")
app.add_css_file('css/huggingface.css')
app.add_css_file('css/code-snippets.css')
app.add_js_file('js/custom.js')
# -- Extension configuration -------------------------------------------------

View File

@@ -28,13 +28,17 @@ BERT
You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google
<https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the
:prefix_link:`convert_bert_original_tf_checkpoint_to_pytorch.py
<src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py>` script.
<src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>` script.
This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated
configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights
from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that
can be imported using ``from_pretrained()`` (see example in :doc:`quicktour` , `run_glue.py
<https://github.com/huggingface/transformers/blob/master/examples/text-classification/run_glue.py>`_\ ).
can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py
<https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ ,
`run_bert_classifier.py
<https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and
`run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\
).
You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\
@@ -62,7 +66,7 @@ ALBERT
Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
:prefix_link:`convert_albert_original_tf_checkpoint_to_pytorch.py
<src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py>` script.
<src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>` script.
The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``\ ) and the accompanying
configuration file (\ ``albert_config.json``\ ), then creates and saves a PyTorch model. To run this conversion you
@@ -164,18 +168,3 @@ Here is an example of the conversion process for a pre-trained XLM model:
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT
[--config XML_CONFIG] \
[--finetuning_task_name XML_FINETUNED_TASK]
T5
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained T5 model:
.. code-block:: shell
export T5=/path/to/t5/uncased_L-12_H-768_A-12
transformers-cli convert --model_type t5 \
--tf_checkpoint $T5/t5_model.ckpt \
--config $T5/t5_config.json \
--pytorch_dump_output $T5/pytorch_model.bin

View File

@@ -15,10 +15,10 @@ Fine-tuning with custom datasets
.. note::
The datasets used in this tutorial are available and can be more easily accessed using the `🤗 Datasets library
<https://github.com/huggingface/datasets>`_. We do not use this library to access the datasets here since this
tutorial meant to illustrate how to work with your own data. A brief of introduction can be found at the end of the
tutorial in the section ":ref:`datasetslib`".
The datasets used in this tutorial are available and can be more easily accessed using the `🤗 NLP library
<https://github.com/huggingface/nlp>`_. We do not use this library to access the datasets here since this tutorial
meant to illustrate how to work with your own data. A brief of introduction can be found at the end of the tutorial
in the section ":ref:`nlplib`".
This tutorial will take you through several examples of using 🤗 Transformers models with your own datasets. The guide
shows one of many valid workflows for using these models and is meant to be illustrative rather than definitive. We
@@ -41,7 +41,7 @@ Sequence Classification with IMDb Reviews
.. note::
This dataset can be explored in the Hugging Face model hub (`IMDb <https://huggingface.co/datasets/imdb>`_), and
can be alternatively downloaded with the 🤗 Datasets library with ``load_dataset("imdb")``.
can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("imdb")``.
In this example, we'll show how to download, tokenize, and train a model on the IMDb reviews dataset. This task takes
the text of a review and requires the model to predict whether the sentiment of the review is positive or negative.
@@ -75,7 +75,7 @@ read this in.
test_texts, test_labels = read_imdb_split('aclImdb/test')
We now have a train and test dataset, but let's also also create a validation set which we can use for for evaluation
and tuning without tainting our test set results. Sklearn has a convenient utility for creating such splits:
and tuning without training our test set results. Sklearn has a convenient utility for creating such splits:
.. code-block:: python
@@ -260,7 +260,7 @@ Token Classification with W-NUT Emerging Entities
.. note::
This dataset can be explored in the Hugging Face model hub (`WNUT-17 <https://huggingface.co/datasets/wnut_17>`_),
and can be alternatively downloaded with the 🤗 Datasets library with ``load_dataset("wnut_17")``.
and can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("wnut_17")``.
Next we will look at token classification. Rather than classifying an entire sequence, this task classifies token by
token. We'll demonstrate how to do this with `Named Entity Recognition
@@ -459,7 +459,7 @@ Question Answering with SQuAD 2.0
.. note::
This dataset can be explored in the Hugging Face model hub (`SQuAD V2
<https://huggingface.co/datasets/squad_v2>`_), and can be alternatively downloaded with the 🤗 Datasets library with
<https://huggingface.co/datasets/squad_v2>`_), and can be alternatively downloaded with the 🤗 NLP library with
``load_dataset("squad_v2")``.
Question answering comes in many forms. In this example, we'll look at the particular type of extractive QA that
@@ -558,14 +558,15 @@ we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method
end_positions = []
for i in range(len(answers)):
start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
# if start position is None, the answer passage has been truncated
if start_positions[-1] is None:
start_positions[-1] = tokenizer.model_max_length
if end_positions[-1] is None:
end_positions[-1] = tokenizer.model_max_length
# if end position is None, the 'char_to_token' function points to the space before the correct token - > add + 1
if end_positions[-1] is None:
end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] + 1)
encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
add_token_positions(train_encodings, train_answers)
@@ -677,23 +678,22 @@ Additional Resources
- :doc:`Preprocessing <preprocessing>`. Docs page on data preprocessing.
- :doc:`Training <training>`. Docs page on training and fine-tuning.
.. _datasetslib:
.. _nlplib:
Using the 🤗 Datasets & Metrics library
Using the 🤗 NLP Datasets & Metrics library
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This tutorial demonstrates how to read in datasets from various raw text formats and prepare them for training with 🤗
Transformers so that you can do the same thing with your own custom datasets. However, we recommend users use the `🤗
Datasets library <https://github.com/huggingface/datasets>`_ for working with the 150+ datasets included in the `hub
NLP library <https://github.com/huggingface/nlp>`_ for working with the 150+ datasets included in the `hub
<https://huggingface.co/datasets>`_, including the three datasets used in this tutorial. As a very brief overview, we
will show how to use the Datasets library to download and prepare the IMDb dataset from the first example,
:ref:`seq_imdb`.
will show how to use the NLP library to download and prepare the IMDb dataset from the first example, :ref:`seq_imdb`.
Start by downloading the dataset:
.. code-block:: python
from datasets import load_dataset
from nlp import load_dataset
train = load_dataset("imdb", split="train")
Each dataset has multiple columns corresponding to different features. Let's see what our columns are.
@@ -725,5 +725,5 @@ dataset elements.
>>> {key: val.shape for key, val in train[0].items()})
{'labels': TensorShape([]), 'input_ids': TensorShape([512]), 'attention_mask': TensorShape([512])}
We now have a fully-prepared dataset. Check out `the 🤗 Datasets docs
<https://huggingface.co/docs/datasets/processing.html>`_ for a more thorough introduction.
We now have a fully-prepared dataset. Check out `the 🤗 NLP docs <https://huggingface.co/nlp/processing.html>`_ for a
more thorough introduction.

View File

@@ -1,62 +0,0 @@
Using tokenizers from 🤗 Tokenizers
=======================================================================================================================
The :class:`~transformers.PreTrainedTokenizerFast` depends on the `tokenizers
<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 Tokenizers library can be
loaded very simply into 🤗 Transformers.
Before getting in the specifics, let's first start by creating a dummy tokenizer in a few lines:
.. code-block::
>>> from tokenizers import Tokenizer
>>> from tokenizers.models import BPE
>>> from tokenizers.trainers import BpeTrainer
>>> from tokenizers.pre_tokenizers import Whitespace
>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
>>> tokenizer.pre_tokenizer = Whitespace()
>>> files = [...]
>>> tokenizer.train(files, trainer)
We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to
a JSON file for future re-use.
Loading directly from the tokenizer object
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
:class:`~transformers.PreTrainedTokenizerFast` class allows for easy instantiation, by accepting the instantiated
`tokenizer` object as an argument:
.. code-block::
>>> from transformers import PreTrainedTokenizerFast
>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
page <main_classes/tokenizer>` for more information.
Loading from a JSON file
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:
.. code-block::
>>> tokenizer.save("tokenizer.json")
The path to which we saved this file can be passed to the :class:`~transformers.PreTrainedTokenizerFast` initialization
method using the :obj:`tokenizer_file` parameter:
.. code-block::
>>> from transformers import PreTrainedTokenizerFast
>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
page <main_classes/tokenizer>` for more information.

View File

@@ -21,25 +21,22 @@ General terms
- CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future
tokens at a certain timestep.
- deep learning: machine learning algorithms which uses neural networks with several layers.
- MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
by masking some tokens randomly, and has to predict the original text.
- multimodal: a task that combines texts with another kind of inputs (for instance images).
- NLG: natural language generation, all tasks related to generating text (for instance talk with transformers,
translation).
- NLG: natural language generation, all tasks related to generating text ( for instance talk with transformers,
translation)
- NLP: natural language processing, a generic way to say "deal with texts".
- NLU: natural language understanding, all tasks related to understanding what is in a text (for instance classifying
the whole text, individual words).
the whole text, individual words)
- pretrained model: a model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods
involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
masking some words and trying to predict them (see MLM).
- RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
- self-attention: each element of the input finds out which other elements of the input they should attend to.
- seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
summarization models (such as :doc:`Bart </model_doc/bart>` or :doc:`T5 </model_doc/t5>`).
- token: a part of a sentence, usually a word, but can also be a subword (non-common words are often split in subwords)
or a punctuation symbol.
- transformer: self-attention based deep learning model architecture.
Model inputs
-----------------------------------------------------------------------------------------------------------------------

Binary file not shown.

Before

Width:  |  Height:  |  Size: 691 KiB

View File

@@ -97,141 +97,108 @@ and conversion utilities for the following models:
5. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
Narayan, Aliaksei Severyn.
6. :doc:`BigBird-RoBERTa <model_doc/bigbird>` (from Google Research) released with the paper `Big Bird: Transformers
for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua
Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
7. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
6. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
8. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
7. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
9. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
<https://arxiv.org/abs/2010.10499>`__ by Adrian de Wynter and Daniel J. Perry.
10. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
11. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou,
Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
12. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
Lav R. Varshney, Caiming Xiong and Richard Socher.
13. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
Chen.
14. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
8. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
9. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
Lav R. Varshney, Caiming Xiong and Richard Socher.
10. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced
BERT with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
Weizhu Chen.
15. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
11. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
16. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
12. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
<https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
<https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
`DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
version of DistilBERT.
17. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
13. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
18. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
14. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
19. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
15. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
20. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
16. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
21. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
17. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
and Ilya Sutskever.
22. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
18. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
Luan, Dario Amodei** and Ilya Sutskever**.
23. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
<https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
24. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
<https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
25. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
19. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
26. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
20. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
<https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
27. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
21. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
28. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
22. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
by Hao Tan and Mohit Bansal.
29. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
Machine Translation <https://arxiv.org/abs/2010.11125>`__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
30. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
23. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
Translator Team.
31. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
24. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
32. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
33. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
25. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
Jianfeng Lu, Tie-Yan Liu.
34. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
26. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
35. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
27. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
Mohammad Saleh and Peter J. Liu.
36. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
28. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
37. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
29. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
38. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
30. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
39. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
`fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
40. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. ultilingual BERT into `DistilmBERT
<https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German version of
DistilBERT.
31. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
Krishna, and Kurt W. Keutzer.
41. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
32. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
42. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
33. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
Francesco Piccinno and Julian Martin Eisenschlos.
43. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
34. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
44. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
45. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
Zhou, Abdelrahman Mohamed, Michael Auli.
46. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
35. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
47. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
36. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
48. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
37. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
Zettlemoyer and Veselin Stoyanov.
49. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
38. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
50. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
.. _bigtable:
@@ -256,8 +223,6 @@ TensorFlow and/or Flax.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BigBird | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BlenderbotSmall | ✅ | ❌ | ✅ | ✅ | ❌ |
@@ -266,14 +231,10 @@ TensorFlow and/or Flax.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DPR | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DeBERTa | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DeBERTa-v2 | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DistilBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| ELECTRA | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -286,20 +247,14 @@ TensorFlow and/or Flax.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| GPT Neo | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| LED | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| LayoutLM | ✅ | ✅ | ✅ | | ❌ |
| LayoutLM | ✅ | ✅ | ✅ | | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Marian | ✅ | ❌ | ✅ | ✅ | ❌ |
@@ -314,7 +269,7 @@ TensorFlow and/or Flax.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| RAG | ✅ | ❌ | ✅ | | ❌ |
| RAG | ✅ | ❌ | ✅ | | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -322,8 +277,6 @@ TensorFlow and/or Flax.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Speech2Text | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| T5 | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -332,10 +285,6 @@ TensorFlow and/or Flax.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| ViT | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Wav2Vec2 | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -378,13 +327,9 @@ TensorFlow and/or Flax.
examples
custom_datasets
notebooks
sagemaker
community
converting_tensorflow_models
migration
contributing
add_new_model
fast_tokenizers
testing
serialization
@@ -410,7 +355,6 @@ TensorFlow and/or Flax.
main_classes/processors
main_classes/tokenizer
main_classes/trainer
main_classes/feature_extractor
.. toctree::
:maxdepth: 2
@@ -423,15 +367,11 @@ TensorFlow and/or Flax.
model_doc/bert
model_doc/bertweet
model_doc/bertgeneration
model_doc/bigbird
model_doc/blenderbot
model_doc/blenderbot_small
model_doc/bort
model_doc/camembert
model_doc/convbert
model_doc/ctrl
model_doc/deberta
model_doc/deberta_v2
model_doc/dialogpt
model_doc/distilbert
model_doc/dpr
@@ -441,20 +381,17 @@ TensorFlow and/or Flax.
model_doc/fsmt
model_doc/funnel
model_doc/herbert
model_doc/ibert
model_doc/layoutlm
model_doc/led
model_doc/longformer
model_doc/lxmert
model_doc/marian
model_doc/m2m_100
model_doc/mbart
model_doc/mobilebert
model_doc/mpnet
model_doc/mt5
model_doc/gpt
model_doc/gpt2
model_doc/gpt_neo
model_doc/pegasus
model_doc/phobert
model_doc/prophetnet
@@ -462,18 +399,14 @@ TensorFlow and/or Flax.
model_doc/reformer
model_doc/retribert
model_doc/roberta
model_doc/speech_to_text
model_doc/squeezebert
model_doc/t5
model_doc/tapas
model_doc/transformerxl
model_doc/vit
model_doc/wav2vec2
model_doc/xlm
model_doc/xlmprophetnet
model_doc/xlmroberta
model_doc/xlnet
model_doc/xlsr_wav2vec2
.. toctree::
:maxdepth: 2
@@ -484,4 +417,3 @@ TensorFlow and/or Flax.
internal/tokenization_utils
internal/trainer_utils
internal/generation_utils
internal/file_utils

View File

@@ -19,7 +19,7 @@ limitations under the License.
🤗 Transformers is tested on Python 3.6+, and PyTorch 1.1.0+ or TensorFlow 2.0+.
You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're
unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going
unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going
to use and activate it.
Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you
@@ -28,8 +28,8 @@ must install it from source.
## Installation with pip
First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available),
[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available),
[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or
[Flax installation page](https://github.com/google/flax#quick-install)
regarding the specific install command for your platform.
@@ -73,27 +73,7 @@ It should download a pretrained model then print something like
## Installing from source
Here is how to quickly install `transformers` from source:
```bash
pip install git+https://github.com/huggingface/transformers
```
Note that this will install not the latest released version, but the bleeding edge `master` version, which you may want to use in case a bug has been fixed since the last official release and a new release hasn't been yet rolled out.
While we strive to keep `master` operational at all times, if you notice some issues, they usually get fixed within a few hours or a day and and you're more than welcome to help us detect any problems by opening an [Issue](https://github.com/huggingface/transformers/issues) and this way, things will get fixed even sooner.
Again, you can run:
```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
```
to check 🤗 Transformers is properly installed.
## Editable install
If you want to constantly use the bleeding edge `master` version of the source code, or if you want to contribute to the library and need to test the changes in the code you're making, you will need an editable install. This is done by cloning the repository and installing with the following commands:
To install from source, clone the repository and install with the following commands:
``` bash
git clone https://github.com/huggingface/transformers.git
@@ -101,22 +81,13 @@ cd transformers
pip install -e .
```
This command performs a magical link between the folder you cloned the repository to and your python library paths, and it'll look inside this folder in addition to the normal library-wide paths. So if normally your python packages get installed into:
```
~/anaconda3/envs/main/lib/python3.7/site-packages/
```
now this editable install will reside where you clone the folder to, e.g. `~/transformers/` and python will search it too.
Again, you can run
Do note that you have to keep that `transformers` folder around and not delete it to continue using the `transfomers` library.
Now, let's get to the real benefit of this installation approach. Say, you saw some new feature has been just committed into `master`. If you have already performed all the steps above, to update your transformers to include all the latest commits, all you need to do is to `cd` into that cloned repository folder and update the clone to the latest version:
```
cd ~/transformers/
git pull
```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
```
There is nothing else to do. Your python environment will find the bleeding edge version of `transformers` on the next run.
to check 🤗 Transformers is properly installed.
## With conda
@@ -129,7 +100,7 @@ Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
conda install -c huggingface transformers
```
Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
## Caching models
@@ -138,7 +109,7 @@ This library provides pretrained models that will be downloaded and cached local
folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the Hugging
Face cache home followed by ``/transformers/``. This is (by order of priority):
* shell environment variable ``HF_HOME``
* shell environment variable ``HF_HOME``
* shell environment variable ``XDG_CACHE_HOME`` + ``/huggingface/``
* default: ``~/.cache/huggingface/``
@@ -151,40 +122,15 @@ environment variable for ``TRANSFORMERS_CACHE``.
### Note on model downloads (Continuous Integration or large-scale deployments)
If you expect to be downloading large volumes of models (more than 10,000) from huggingface.co (for instance through
If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through
your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way
faster, and cheaper. Feel free to contact us privately, we'd love to help with this.
### Offline mode
It's possible to run 🤗 Transformers in a firewalled or a no-network environment.
Setting environment variable `TRANSFORMERS_OFFLINE=1` will tell 🤗 Transformers to use local files only and will not try to look things up.
Most likely you may want to couple this with `HF_DATASETS_OFFLINE=1` that performs the same for 🤗 Datasets if you're using the latter.
Here is an example of how this can be used on a filesystem that is shared between a normally networked and a firewalled to the external world instances.
On the instance with the normal network run your program which will download and cache models (and optionally datasets if you use 🤗 Datasets). For example:
```
python examples/seq2seq/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
```
and then with the same filesystem you can now run the same program on a firewalled instance:
```
HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
python examples/seq2seq/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
```
and it should succeed without any hanging waiting to timeout.
faster, and cheaper. Feel free to contact us privately if you need any help.
## Do you want to run a Transformer model on a mobile device?
You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.
It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`,
It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`,
`DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
At some point in the future, you'll be able to seamlessly move from pretraining or fine-tuning models in PyTorch or

View File

@@ -1,54 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
General Utilities
-----------------------------------------------------------------------------------------------------------------------
This page lists all of Transformers general utility functions that are found in the file ``file_utils.py``.
Most of those are only useful if you are studying the general code in the library.
Enums and namedtuples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils.ExplicitEnum
.. autoclass:: transformers.file_utils.PaddingStrategy
.. autoclass:: transformers.file_utils.TensorType
Special Decorators
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.file_utils.add_start_docstrings
.. autofunction:: transformers.file_utils.add_start_docstrings_to_model_forward
.. autofunction:: transformers.file_utils.add_end_docstrings
.. autofunction:: transformers.file_utils.add_code_sample_docstrings
.. autofunction:: transformers.file_utils.replace_return_docstrings
Special Properties
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils.cached_property
Other Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils._BaseLazyModule

View File

@@ -151,33 +151,6 @@ generation.
.. autoclass:: transformers.HammingDiversityLogitsProcessor
:members: __call__
.. autoclass:: transformers.ForcedBOSTokenLogitsProcessor
:members: __call__
.. autoclass:: transformers.ForcedEOSTokenLogitsProcessor
:members: __call__
.. autoclass:: transformers.InfNanRemoveLogitsProcessor
:members: __call__
StoppingCriteria
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A :class:`~transformers.StoppingCriteria` can be used to change when to stop generation (other than EOS token).
.. autoclass:: transformers.StoppingCriteria
:members: __call__
.. autoclass:: transformers.StoppingCriteriaList
:members: __call__
.. autoclass:: transformers.MaxLengthCriteria
:members: __call__
.. autoclass:: transformers.MaxTimeCriteria
:members: __call__
BeamSearch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -47,4 +47,6 @@ Data format
Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.pipelines.get_framework
.. autoclass:: transformers.pipelines.PipelineException

View File

@@ -38,6 +38,12 @@ SpecialTokensMixin
Enums and namedtuples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum
.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
.. autoclass:: transformers.tokenization_utils_base.TensorType
.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
.. autoclass:: transformers.tokenization_utils_base.CharSpan

View File

@@ -22,7 +22,7 @@ Utilities
.. autoclass:: transformers.EvalPrediction
.. autoclass:: transformers.IntervalStrategy
.. autoclass:: transformers.EvaluationStrategy
.. autofunction:: transformers.set_seed

View File

@@ -74,32 +74,6 @@ TrainerCallback
.. autoclass:: transformers.TrainerCallback
:members:
Here is an example of how to register a custom callback with the PyTorch :class:`~transformers.Trainer`:
.. code-block:: python
class MyCallback(TrainerCallback):
"A callback that prints a message at the beginning of training"
def on_train_begin(self, args, state, control, **kwargs):
print("Starting training")
trainer = Trainer(
model,
args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
callbacks=[MyCallback] # We can either pass the callback class this way or an instance of it (MyCallback())
)
Another way to register a callback is to call ``trainer.add_callback()`` as follows:
.. code-block:: python
trainer = Trainer(...)
trainer.add_callback(MyCallback)
# Alternatively, we can pass an instance of the callback class
trainer.add_callback(MyCallback())
TrainerState
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -1,48 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Feature Extractor
-----------------------------------------------------------------------------------------------------------------------
A feature extractor is in charge of preparing input features for a multi-modal model. This includes feature extraction
from sequences, *e.g.*, pre-processing audio files to Log-Mel Spectrogram features, feature extraction from images
*e.g.* cropping image image files, but also padding, normalization, and conversion to Numpy, PyTorch, and TensorFlow
tensors.
FeatureExtractionMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.feature_extraction_utils.FeatureExtractionMixin
:members: from_pretrained, save_pretrained
SequenceFeatureExtractor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.SequenceFeatureExtractor
:members: pad
BatchFeature
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BatchFeature
:members:
ImageFeatureExtractionMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.image_utils.ImageFeatureExtractionMixin
:members:

View File

@@ -65,10 +65,6 @@ Other functions
.. autofunction:: transformers.logging.get_logger
.. autofunction:: transformers.logging.enable_default_handler
.. autofunction:: transformers.logging.disable_default_handler
.. autofunction:: transformers.logging.enable_explicit_format
.. autofunction:: transformers.logging.reset_format

View File

@@ -60,7 +60,7 @@ ModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils.ModelOutput
:members: to_tuple
:members:
BaseModelOutput

View File

@@ -54,24 +54,15 @@ PreTrainedTokenizer
.. autoclass:: transformers.PreTrainedTokenizer
:special-members: __call__
:members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode,
get_added_vocab, get_special_tokens_mask, num_special_tokens_to_add, prepare_for_tokenization, tokenize,
vocab_size
:members:
PreTrainedTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The :class:`~transformers.PreTrainedTokenizerFast` depend on the `tokenizers
<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 tokenizers library can be
loaded very simply into 🤗 transformers. Take a look at the :doc:`Using tokenizers from 🤗 tokenizers
<../fast_tokenizers>` page to understand how this is done.
.. autoclass:: transformers.PreTrainedTokenizerFast
:special-members: __call__
:members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode,
get_added_vocab, get_special_tokens_mask, num_special_tokens_to_add,
set_truncation_and_padding,tokenize, vocab_size
:members:
BatchEncoding

View File

@@ -21,20 +21,17 @@ Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.T
customization during training.
The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex
<https://github.com/NVIDIA/apex>`__ and Native AMP for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
<https://github.com/NVIDIA/apex>`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain the basic training loop which supports
the above features. To inject custom behavior you can subclass them and override the following methods:
Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain the basic training loop supporting the
previous features. To inject custom behavior you can subclass them and override the following methods:
- **get_train_dataloader**/**get_train_tfdataset** -- Creates the training DataLoader (PyTorch) or TF Dataset.
- **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaluation DataLoader (PyTorch) or TF Dataset.
- **get_test_dataloader**/**get_test_tfdataset** -- Creates the test DataLoader (PyTorch) or TF Dataset.
- **log** -- Logs information on the various objects watching training.
- **create_optimizer_and_scheduler** -- Sets up the optimizer and learning rate scheduler if they were not passed at
init. Note, that you can also subclass or override the ``create_optimizer`` and ``create_scheduler`` methods
separately.
- **create_optimizer** -- Sets up the optimizer if it wasn't passed at init.
- **create_scheduler** -- Sets up the learning rate scheduler if it wasn't passed at init.
- **create_optimizer_and_scheduler** -- Setups the optimizer and learning rate scheduler if they were not passed at
init.
- **compute_loss** - Computes the loss on a batch of training inputs.
- **training_step** -- Performs a training step.
- **prediction_step** -- Performs an evaluation/test step.
@@ -42,35 +39,17 @@ the above features. To inject custom behavior you can subclass them and override
- **evaluate** -- Runs an evaluation loop and returns metrics.
- **predict** -- Returns predictions (with metrics if labels are available) on a test set.
.. warning::
The :class:`~transformers.Trainer` class is optimized for 🤗 Transformers models and can have surprising behaviors
when you use it on other models. When using it on your own model, make sure:
- your model always return tuples or subclasses of :class:`~transformers.file_utils.ModelOutput`.
- your model can compute the loss if a :obj:`labels` argument is provided and that loss is returned as the first
element of the tuple (if your model returns tuples)
- your model can accept multiple label arguments (use the :obj:`label_names` in your
:class:`~transformers.TrainingArguments` to indicate their name to the :class:`~transformers.Trainer`) but none
of them should be named :obj:`"label"`.
Here is an example of how to customize :class:`~transformers.Trainer` using a custom loss function for multi-label
classification:
Here is an example of how to customize :class:`~transformers.Trainer` using a custom loss function:
.. code-block:: python
import torch
from transformers import Trainer
class MultilabelTrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False):
class MyTrainer(Trainer):
def compute_loss(self, model, inputs):
labels = inputs.pop("labels")
outputs = model(**inputs)
logits = outputs.logits
loss_fct = torch.nn.BCEWithLogitsLoss()
loss = loss_fct(logits.view(-1, self.model.config.num_labels),
labels.float().view(-1, self.model.config.num_labels))
return (loss, outputs) if return_outputs else loss
logits = outputs[0]
return my_custom_loss(logits, labels)
Another way to customize the training loop behavior for the PyTorch :class:`~transformers.Trainer` is to use
:doc:`callbacks <callback>` that can inspect the training loop state (for progress reporting, logging on TensorBoard or
@@ -134,125 +113,7 @@ Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, O
This provided support is new and experimental as of this writing.
Installation Notes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
As of this writing, both FairScale and Deepspeed require compilation of CUDA C++ code, before they can be used.
While all installation issues should be dealt with through the corresponding GitHub Issues of `FairScale
<https://github.com/facebookresearch/fairscale/issues>`__ and `Deepspeed
<https://github.com/microsoft/DeepSpeed/issues>`__, there are a few common issues that one may encounter while building
any PyTorch extension that needs to build CUDA extensions.
Therefore, if you encounter a CUDA-related build issue while doing one of the following or both:
.. code-block:: bash
pip install fairscale
pip install deepspeed
please, read the following notes first.
In these notes we give examples for what to do when ``pytorch`` has been built with CUDA ``10.2``. If your situation is
different remember to adjust the version number to the one you are after.
**Possible problem #1:**
While, Pytorch comes with its own CUDA toolkit, to build these two projects you must have an identical version of CUDA
installed system-wide.
For example, if you installed ``pytorch`` with ``cudatoolkit==10.2`` in the Python environment, you also need to have
CUDA ``10.2`` installed system-wide.
The exact location may vary from system to system, but ``/usr/local/cuda-10.2`` is the most common location on many
Unix systems. When CUDA is correctly set up and added to the ``PATH`` environment variable, one can find the
installation location by doing:
.. code-block:: bash
which nvcc
If you don't have CUDA installed system-wide, install it first. You will find the instructions by using your favorite
search engine. For example, if you're on Ubuntu you may want to search for: `ubuntu cuda 10.2 install
<https://www.google.com/search?q=ubuntu+cuda+10.2+install>`__.
**Possible problem #2:**
Another possible common problem is that you may have more than one CUDA toolkit installed system-wide. For example you
may have:
.. code-block:: bash
/usr/local/cuda-10.2
/usr/local/cuda-11.0
Now, in this situation you need to make sure that your ``PATH`` and ``LD_LIBRARY_PATH`` environment variables contain
the correct paths to the desired CUDA version. Typically, package installers will set these to contain whatever the
last version was installed. If you encounter the problem, where the package build fails because it can't find the right
CUDA version despite you having it installed system-wide, it means that you need to adjust the 2 aforementioned
environment variables.
First, you may look at their contents:
.. code-block:: bash
echo $PATH
echo $LD_LIBRARY_PATH
so you get an idea of what is inside.
It's possible that ``LD_LIBRARY_PATH`` is empty.
``PATH`` lists the locations of where executables can be found and ``LD_LIBRARY_PATH`` is for where shared libraries
are to looked for. In both cases, earlier entries have priority over the later ones. ``:`` is used to separate multiple
entries.
Now, to tell the build program where to find the specific CUDA toolkit, insert the desired paths to be listed first by
doing:
.. code-block:: bash
export PATH=/usr/local/cuda-10.2/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH
Note that we aren't overwriting the existing values, but prepending instead.
Of course, adjust the version number, the full path if need be. Check that the directories you assign actually do
exist. ``lib64`` sub-directory is where the various CUDA ``.so`` objects, like ``libcudart.so`` reside, it's unlikely
that your system will have it named differently, but if it is adjust it to reflect your reality.
**Possible problem #3:**
Some older CUDA versions may refuse to build with newer compilers. For example, you my have ``gcc-9`` but it wants
``gcc-7``.
There are various ways to go about it.
If you can install the latest CUDA toolkit it typically should support the newer compiler.
Alternatively, you could install the lower version of the compiler in addition to the one you already have, or you may
already have it but it's not the default one, so the build system can't see it. If you have ``gcc-7`` installed but the
build system complains it can't find it, the following might do the trick:
.. code-block:: bash
sudo ln -s /usr/bin/gcc-7 /usr/local/cuda-10.2/bin/gcc
sudo ln -s /usr/bin/g++-7 /usr/local/cuda-10.2/bin/g++
Here, we are making a symlink to ``gcc-7`` from ``/usr/local/cuda-10.2/bin/gcc`` and since
``/usr/local/cuda-10.2/bin/`` should be in the ``PATH`` environment variable (see the previous problem's solution), it
should find ``gcc-7`` (and ``g++7``) and then the build will succeed.
As always make sure to edit the paths in the example to match your situation.
**If still unsuccessful:**
If after addressing these you still encounter build issues, please, proceed with the GitHub Issue of `FairScale
<https://github.com/facebookresearch/fairscale/issues>`__ and `Deepspeed
<https://github.com/microsoft/DeepSpeed/issues>`__, depending on the project you have the problem with.
You will need at least 2 GPUs to benefit from these features.
FairScale
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -262,10 +123,6 @@ provides support for the following features from `the ZeRO paper <https://arxiv.
1. Optimizer State Sharding
2. Gradient Sharding
3. Model Parameters Sharding (new and very experimental)
4. CPU offload (new and very experimental)
You will need at least two GPUs to use this feature.
To deploy this feature:
@@ -275,349 +132,113 @@ To deploy this feature:
pip install fairscale
or find more details on `the FairScale's GitHub page
or find more details on `the FairScale's github page
<https://github.com/facebookresearch/fairscale/#installation>`__.
2. To use the first version of Sharded data-parallelism, add ``--sharded_ddp simple`` to the command line arguments,
and make sure you have added the distributed launcher ``-m torch.distributed.launch
--nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.
2. Add ``--sharded_ddp`` to the command line arguments, and make sure you have added the distributed launcher ``-m
torch.distributed.launch --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.
For example here is how you could use it for ``run_translation.py`` with 2 GPUs:
For example here is how you could use it for ``finetune_trainer.py`` with 2 GPUs:
.. code-block:: bash
python -m torch.distributed.launch --nproc_per_node=2 examples/seq2seq/run_translation.py \
--model_name_or_path t5-small --per_device_train_batch_size 1 \
cd examples/seq2seq
python -m torch.distributed.launch --nproc_per_node=2 ./finetune_trainer.py \
--model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
--output_dir output_dir --overwrite_output_dir \
--do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro \
--fp16 --sharded_ddp simple
--do_train --n_train 500 --num_train_epochs 1 \
--per_device_train_batch_size 1 --freeze_embeds \
--src_lang en_XX --tgt_lang ro_RO --task translation \
--fp16 --sharded_ddp
Notes:
- This feature requires distributed training (so multiple GPUs).
- It is not implemented for TPUs.
- It works with ``--fp16`` too, to make things even faster.
- One of the main benefits of enabling ``--sharded_ddp simple`` is that it uses a lot less GPU memory, so you should be
able to use significantly larger batch sizes using the same hardware (e.g. 3x and even bigger) which should lead to
- One of the main benefits of enabling ``--sharded_ddp`` is that it uses a lot less GPU memory, so you should be able
to use significantly larger batch sizes using the same hardware (e.g. 3x and even bigger) which should lead to
significantly shorter training time.
3. To use the second version of Sharded data-parallelism, add ``--sharded_ddp zero_dp_2`` or ``--sharded_ddp zero_dp_3`
to the command line arguments, and make sure you have added the distributed launcher ``-m torch.distributed.launch
--nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.
For example here is how you could use it for ``run_translation.py`` with 2 GPUs:
.. code-block:: bash
python -m torch.distributed.launch --nproc_per_node=2 examples/seq2seq/run_translation.py \
--model_name_or_path t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir \
--do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro \
--fp16 --sharded_ddp zero_dp_2
:obj:`zero_dp_2` is an optimized version of the simple wrapper, while :obj:`zero_dp_3` fully shards model weights,
gradients and optimizer states.
Both are compatible with adding :obj:`cpu_offload` to enable ZeRO-offload (activate it like this: :obj:`--sharded_ddp
"zero_dp_2 cpu_offload"`).
Notes:
- This feature requires distributed training (so multiple GPUs).
- It is not implemented for TPUs.
- It works with ``--fp16`` too, to make things even faster.
- The ``cpu_offload`` additional option requires ``--fp16``.
- This is an area of active development, so make sure you have a source install of fairscale to use this feature as
some bugs you encounter may have been fixed there already.
Known caveats:
- This feature is incompatible with :obj:`--predict_with_generate` in the `run_translation.py` script.
- Using :obj:`--sharded_ddp zero_dp_3` requires wrapping each layer of the model in the special container
:obj:`FullyShardedDataParallelism` of fairscale. It should be used with the option :obj:`auto_wrap` if you are not
doing this yourself: :obj:`--sharded_ddp "zero_dp_3 auto_wrap"`.
DeepSpeed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
`DeepSpeed <https://github.com/microsoft/DeepSpeed>`__ implements everything described in the `ZeRO paper
<https://arxiv.org/abs/1910.02054>`__, except ZeRO's stage 3. "Parameter Partitioning (Pos+g+p)". Currently it provides
full support for:
1. Optimizer State Partitioning (ZeRO stage 1)
2. Add Gradient Partitioning (ZeRO stage 2)
3. Custom fp16 handling
4. A range of fast Cuda-extension-based Optimizers
5. ZeRO-Offload
ZeRO-Offload has its own dedicated paper: `ZeRO-Offload: Democratizing Billion-Scale Model Training
<https://arxiv.org/abs/2101.06840>`__.
To deploy this feature:
DeepSpeed is currently used only for training, as all the currently available features are of no use to inference.
Installation
=======================================================================================================================
Install the library via pypi:
.. code-block:: bash
pip install deepspeed
or find more details on `the DeepSpeed's GitHub page <https://github.com/microsoft/deepspeed#installation>`__.
Deployment with multiple GPUs
=======================================================================================================================
To deploy this feature with multiple GPUs adjust the :class:`~transformers.Trainer` command line arguments as
following:
1. replace ``python -m torch.distributed.launch`` with ``deepspeed``.
2. add a new argument ``--deepspeed ds_config.json``, where ``ds_config.json`` is the DeepSpeed configuration file as
documented `here <https://www.deepspeed.ai/docs/config-json/>`__. The file naming is up to you.
Therefore, if your original command line looked as following:
.. code-block:: bash
python -m torch.distributed.launch --nproc_per_node=2 your_program.py <normal cl args>
Now it should be:
.. code-block:: bash
deepspeed --num_gpus=2 your_program.py <normal cl args> --deepspeed ds_config.json
Unlike, ``torch.distributed.launch`` where you have to specify how many GPUs to use with ``--nproc_per_node``, with the
``deepspeed`` launcher you don't have to use the corresponding ``--num_gpus`` if you want all of your GPUs used. The
full details on how to configure various nodes and GPUs can be found `here
<https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node>`__.
In fact, you can continue using ``-m torch.distributed.launch`` with DeepSpeed as long as you don't need to use
``deepspeed`` launcher-specific arguments. Typically if you don't need a multi-node setup you're not required to use
the ``deepspeed`` launcher. But since in the DeepSpeed documentation it'll be used everywhere, for consistency we will
use it here as well.
Here is an example of running ``run_translation.py`` under DeepSpeed deploying all available GPUs:
.. code-block:: bash
deepspeed examples/seq2seq/run_translation.py \
--deepspeed examples/tests/deepspeed/ds_config.json \
--model_name_or_path t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro
Note that in the DeepSpeed documentation you are likely to see ``--deepspeed --deepspeed_config ds_config.json`` - i.e.
two DeepSpeed-related arguments, but for the sake of simplicity, and since there are already so many arguments to deal
with, we combined the two into a single argument.
For some practical usage examples, please, see this `post
<https://github.com/huggingface/transformers/issues/8771#issuecomment-759248400>`__.
Deployment with one GPU
=======================================================================================================================
To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` command line arguments as following:
.. code-block:: bash
deepspeed --num_gpus=1 examples/seq2seq/run_translation.py \
--deepspeed examples/tests/deepspeed/ds_config.json \
--model_name_or_path t5-small --per_device_train_batch_size 1 \
--output_dir output_dir --overwrite_output_dir --fp16 \
--do_train --max_train_samples 500 --num_train_epochs 1 \
--dataset_name wmt16 --dataset_config "ro-en" \
--source_lang en --target_lang ro
This is almost the same as with multiple-GPUs, but here we tell DeepSpeed explicitly to use just one GPU. By default,
DeepSpeed deploys all GPUs it can see. If you have only 1 GPU to start with, then you don't need this argument. The
following `documentation <https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node>`__ discusses the
launcher options.
Why would you want to use DeepSpeed with just one GPU?
1. It has a ZeRO-offload feature which can delegate some computations and memory to the host's CPU and RAM, and thus
leave more GPU resources for model's needs - e.g. larger batch size, or enabling a fitting of a very big model which
normally won't fit.
2. It provides a smart GPU memory management system, that minimizes memory fragmentation, which again allows you to fit
bigger models and data batches.
While we are going to discuss the configuration in details next, the key to getting a huge improvement on a single GPU
with DeepSpeed is to have at least the following configuration in the configuration file:
.. code-block:: json
{
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"overlap_comm": true,
"contiguous_gradients": true,
"cpu_offload": true
},
}
which enables ``cpu_offload`` and some other important features. You may experiment with the buffer sizes, you will
find more details in the discussion below.
For a practical usage example of this type of deployment, please, see this `post
<https://github.com/huggingface/transformers/issues/8771#issuecomment-759176685>`__.
Notes:
- if you need to run on a specific GPU, which is different from GPU 0, you can't use ``CUDA_VISIBLE_DEVICES`` to limit
the visible scope of available GPUs. Instead, you have to use the following syntax:
1. Install the library via pypi:
.. code-block:: bash
deepspeed --include localhost:1 examples/seq2seq/run_translation.py ...
pip install deepspeed
In this example, we tell DeepSpeed to use GPU 1 (second gpu).
or find more details on `the DeepSpeed's github page <https://github.com/microsoft/deepspeed#installation>`__.
2. Adjust the :class:`~transformers.Trainer` command line arguments as following:
1. replace ``python -m torch.distributed.launch`` with ``deepspeed``.
2. add a new argument ``--deepspeed ds_config.json``, where ``ds_config.json`` is the DeepSpeed configuration file
as documented `here <https://www.deepspeed.ai/docs/config-json/>`__. The file naming is up to you.
Deployment in Notebooks
=======================================================================================================================
Therefore, if your original command line looked as following:
The problem with running notebook cells as a script is that there is no normal ``deepspeed`` launcher to rely on, so
under certain setups we have to emulate it.
.. code-block:: bash
Here is how you'd have to adjust your training code in the notebook to use DeepSpeed.
python -m torch.distributed.launch --nproc_per_node=2 your_program.py <normal cl args>
.. code-block:: python
Now it should be:
# DeepSpeed requires a distributed environment even when only one process is used.
# This emulates a launcher in the notebook
import os
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '9994' # modify if RuntimeError: Address already in use
os.environ['RANK'] = "0"
os.environ['LOCAL_RANK'] = "0"
os.environ['WORLD_SIZE'] = "1"
.. code-block:: bash
# Now proceed as normal, plus pass the deepspeed config file
training_args = TrainingArguments(..., deepspeed="ds_config.json")
trainer = Trainer(...)
trainer.train()
deepspeed --num_gpus=2 your_program.py <normal cl args> --deepspeed ds_config.json
Note: `...` stands for the normal arguments that you'd pass to the functions.
Unlike, ``torch.distributed.launch`` where you have to specify how many GPUs to use with ``--nproc_per_node``, with
the ``deepspeed`` launcher you don't have to use the corresponding ``--num_gpus`` if you want all of your GPUs used.
The full details on how to configure various nodes and GPUs can be found `here
<https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node>`__.
If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated
cell with:
Here is an example of running ``finetune_trainer.py`` under DeepSpeed deploying all available GPUs:
.. code-block:: python
.. code-block:: bash
%%bash
cat <<'EOT' > ds_config.json
{
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
cd examples/seq2seq
deepspeed ./finetune_trainer.py --deepspeed ds_config.json \
--model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
--output_dir output_dir --overwrite_output_dir \
--do_train --n_train 500 --num_train_epochs 1 \
--per_device_train_batch_size 1 --freeze_embeds \
--src_lang en_XX --tgt_lang ro_RO --task translation
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true,
"cpu_offload": true
},
Note that in the DeepSpeed documentation you are likely to see ``--deepspeed --deepspeed_config ds_config.json`` -
i.e. two DeepSpeed-related arguments, but for the sake of simplicity, and since there are already so many arguments
to deal with, we combined the two into a single argument.
"optimizer": {
"type": "AdamW",
"params": {
"lr": 3e-5,
"betas": [0.8, 0.999],
"eps": 1e-8,
"weight_decay": 3e-7
}
},
Before you can deploy DeepSpeed, let's discuss its configuration.
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 3e-5,
"warmup_num_steps": 500
}
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
That's said if the script is not in the notebook cells, you can launch ``deepspeed`` normally via shell from a cell
with:
.. code-block::
!deepspeed examples/seq2seq/run_translation.py ...
or with bash magic, where you can write a multi-line code for the shell to run:
.. code-block::
%%bash
cd /somewhere
deepspeed examples/seq2seq/run_translation.py ...
Configuration
=======================================================================================================================
**Configuration:**
For the complete guide to the DeepSpeed configuration options that can be used in its configuration file please refer
to the `following documentation <https://www.deepspeed.ai/docs/config-json/>`__.
You can find dozens of DeepSpeed configuration examples that address various practical needs in `the DeepSpeedExamples
repo <https://github.com/microsoft/DeepSpeedExamples>`__:
While you always have to supply the DeepSpeed configuration file, you can configure the DeepSpeed integration in
several ways:
.. code-block:: bash
git clone https://github.com/microsoft/DeepSpeedExamples
cd DeepSpeedExamples
find . -name '*json'
Continuing the code from above, let's say you're looking to configure the Lamb optimizer. So you can search through the
example ``.json`` files with:
.. code-block:: bash
grep -i Lamb $(find . -name '*json')
Some more examples are to be found in the `main repo <https://github.com/microsoft/DeepSpeed>`__ as well.
When using DeepSpeed you always need to supply a DeepSpeed configuration file, yet some configuration parameters have
to be configured via the command line. You will find the nuances in the rest of this guide.
1. Supply most of the configuration inside the file, and just use a few required command line arguments. This is the
recommended way as it puts most of the configuration params in one place.
2. Supply just the ZeRO configuration params inside the file, and configure the rest using the normal
:class:`~transformers.Trainer` command line arguments.
3. Any variation of the first two ways.
To get an idea of what DeepSpeed configuration file looks like, here is one that activates ZeRO stage 2 features,
enables FP16, uses ``AdamW`` optimizer and ``WarmupLR`` scheduler:
enables FP16, uses AdamW optimizer and WarmupLR scheduler:
.. code-block:: json
@@ -650,6 +271,7 @@ enables FP16, uses ``AdamW`` optimizer and ``WarmupLR`` scheduler:
"weight_decay": 3e-7
}
},
"zero_allow_untested_optimizer": true,
"scheduler": {
"type": "WarmupLR",
@@ -661,36 +283,38 @@ enables FP16, uses ``AdamW`` optimizer and ``WarmupLR`` scheduler:
}
}
If you already have a command line that you have been using with :class:`transformers.Trainer` args, you can continue
using those and the :class:`~transformers.Trainer` will automatically convert them into the corresponding DeepSpeed
configuration at run time. For example, you could use the following configuration file:
.. code-block:: json
{
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"cpu_offload": true
}
}
and the following command line arguments:
.. code-block:: bash
--learning_rate 3e-5 --warmup_steps 500 --adam_beta1 0.8 --adam_beta2 0.999 --adam_epsilon 1e-8 \
--weight_decay 3e-7 --lr_scheduler_type constant_with_warmup --fp16 --fp16_backend amp
to achieve the same configuration as provided by the longer json file in the first example.
When you execute the program, DeepSpeed will log the configuration it received from the :class:`~transformers.Trainer`
to the console, so you can see exactly what was the final configuration passed to it.
to the console, so you can see exactly what the final configuration was passed to it.
Passing Configuration
=======================================================================================================================
As discussed in this document normally the DeepSpeed configuration is passed as a path to a json file, but if you're
not using the command line interface to configure the training, and instead instantiate the
:class:`~transformers.Trainer` via :class:`~transformers.TrainingArguments` then for the ``deepspeed`` argument you can
pass a nested ``dict``. This allows you to create the configuration on the fly and doesn't require you to write it to
the file system before passing it to :class:`~transformers.TrainingArguments`.
To summarize you can do:
.. code-block:: python
TrainingArguments(..., deespeed="/path/to/ds_config.json")
or:
.. code-block:: python
ds_config_dict=dict(scheduler=scheduler_params, optimizer=optimizer_params)
TrainingArguments(..., deespeed=ds_config_dict)
Shared Configuration
=======================================================================================================================
**Shared Configuration:**
Some configuration information is required by both the :class:`~transformers.Trainer` and DeepSpeed to function
correctly, therefore, to prevent conflicting definitions, which could lead to hard to detect errors, we chose to
@@ -714,8 +338,7 @@ Of course, you will need to adjust the values in this example to your situation.
ZeRO
=======================================================================================================================
**ZeRO:**
The ``zero_optimization`` section of the configuration file is the most important part (`docs
<https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training>`__), since that is where you define
@@ -742,52 +365,30 @@ Notes:
- ``"overlap_comm": true`` trades off increased GPU RAM usage to lower all-reduce latency. ``overlap_comm`` uses 4.5x
the ``allgather_bucket_size`` and ``reduce_bucket_size`` values. So if they are set to 5e8, this requires a 9GB
footprint (``5e8 x 2Bytes x 2 x 4.5``). Therefore, if you have a GPU with 8GB or less RAM, to avoid getting
OOM-errors you will need to reduce those parameters to about ``2e8``, which would require 3.6GB. You will want to do
the same on larger capacity GPU as well, if you're starting to hit OOM.
- when reducing these buffers you're trading communication speed to avail more GPU RAM. The smaller the buffer size,
the slower the communication, and the more GPU RAM will be available to other tasks. So if a bigger batch size is
important, getting a slightly slower training time could be a good trade.
OOM-errors you will need to reduce those parameters to about ``2e8``, which would require 3.6GB.
This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides
no equivalent command line arguments.
Optimizer and Scheduler
=======================================================================================================================
As long as you don't enable ``cpu_offload`` you can mix and match DeepSpeed and HuggingFace schedulers and optimizers,
with the exception of using the combination of HuggingFace scheduler and DeepSpeed optimizer:
+--------------+--------------+--------------+
| Combos | HF Scheduler | DS Scheduler |
+--------------+--------------+--------------+
| HF Optimizer | Yes | Yes |
+--------------+--------------+--------------+
| DS Optimizer | No | Yes |
+--------------+--------------+--------------+
If ``cpu_offload`` is enabled you must use both DeepSpeed scheduler and DeepSpeed optimizer.
**Optimizer:**
Optimizer
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are
thus recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
DeepSpeed's main optimizers are Adam, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are thus
recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
<https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__.
If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will
automatically set it to ``AdamW`` and will use the supplied values or the defaults for the following command line
arguments: ``--learning_rate``, ``--adam_beta1``, ``--adam_beta2``, ``--adam_epsilon`` and ``--weight_decay``.
Here is an example of the pre-configured ``optimizer`` entry for ``AdamW``:
Here is an example of the pre-configured ``optimizer`` entry for AdamW:
.. code-block:: json
{
"zero_allow_untested_optimizer": true,
"optimizer": {
"type": "AdamW",
"params": {
@@ -799,43 +400,48 @@ Here is an example of the pre-configured ``optimizer`` entry for ``AdamW``:
}
}
Note that the command line arguments will override the values in the configuration file. This is so that there is one
definitive source of the values and to avoid hard to find errors when for example, the learning rate is set to
different values in different places. Command line rules. The values that get overridden are:
- ``lr`` with the value of ``--learning_rate``
- ``betas`` with the value of ``--adam_beta1 --adam_beta2``
- ``eps`` with the value of ``--adam_epsilon``
- ``weight_decay`` with the value of ``--weight_decay``
Therefore please remember to tune the shared hyperparameters on the command line.
If you want to use another optimizer which is not listed above, you will have to add ``"zero_allow_untested_optimizer":
true`` to the top level configuration.
Since AdamW isn't on the list of tested with DeepSpeed/ZeRO optimizers, we have to add
``zero_allow_untested_optimizer`` flag.
If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and
make sure to adjust the values. e.g. if use Adam you will want ``weight_decay`` around ``0.01``.
Scheduler
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
**Scheduler:**
DeepSpeed supports LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR LR schedulers. The full documentation is `here
<https://www.deepspeed.ai/docs/config-json/#scheduler-parameters>`__.
Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed:
If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use
the value of ``--lr_scheduler_type`` to configure it. Currently the :class:`~transformers.Trainer` supports only 2 LR
schedulers that are also supported by DeepSpeed:
* ``WarmupLR`` via ``--lr_scheduler_type constant_with_warmup``
* ``WarmupDecayLR`` via ``--lr_scheduler_type linear``. This is also the default value for ``--lr_scheduler_type``,
therefore, if you don't configure the scheduler this is scheduler that will get configured by default.
In either case, the values of ``--learning_rate`` and ``--warmup_steps`` will be used for the configuration.
If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use
the values of ``--lr_scheduler_type``, ``--learning_rate`` and ``--warmup_steps`` to configure a 🤗 Transformers version
of it.
In other words, if you don't use the configuration file to set the ``scheduler`` entry, provide either:
Here is an example of the pre-configured ``scheduler`` entry for ``WarmupLR``:
.. code-block:: bash
--lr_scheduler_type constant_with_warmup --learning_rate 3e-5 --warmup_steps 500
or
.. code-block:: bash
--lr_scheduler_type linear --learning_rate 3e-5 --warmup_steps 500
with the desired values. If you don't pass these arguments, reasonable default values will be used instead.
In the case of WarmupDecayLR ``total_num_steps`` gets set either via the ``--max_steps`` command line argument, or if
it is not provided, derived automatically at run time based on the environment and the size of the dataset and other
command line arguments.
Here is an example of the pre-configured ``scheduler`` entry for WarmupLR (``constant_with_warmup`` in the
:class:`~transformers.Trainer` API):
.. code-block:: json
@@ -850,41 +456,7 @@ Here is an example of the pre-configured ``scheduler`` entry for ``WarmupLR``:
}
}
Note that the command line arguments will override the values in the configuration file. This is so that there is one
definitive source of the values and to avoid hard to find errors when for example, the learning rate is set to
different values in different places. Command line rules. The values that get overridden are:
- ``warmup_max_lr`` with the value of ``--learning_rate``
- ``warmup_num_steps`` with the value of ``--warmup_steps``
- ``total_num_steps`` with either the value of ``--max_steps`` or if it is not provided, derived automatically at run
time based on the environment and the size of the dataset and other command line arguments (needed for
``WarmupDecayLR``).
Therefore please remember to tune the shared hyperparameters on the command line.
For example, for ``WarmupDecayLR``, you can use the following entry:
.. code-block:: json
{
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"total_num_steps": 10,
"last_batch_iteration": -1,
"warmup_min_lr": 0,
"warmup_max_lr": 0.001,
"warmup_num_steps": 1000
}
}
}
and ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_steps`` will be corrected at loading time.
Automatic Mixed Precision
=======================================================================================================================
**Automatic Mixed Precision:**
You can work with FP16 in one of the following ways:
@@ -892,7 +464,7 @@ You can work with FP16 in one of the following ways:
2. NVIDIA's apex, as documented `here
<https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options>`__.
If you want to use an equivalent of the Pytorch native amp, you can either configure the ``fp16`` entry in the
If you want to use an equivalent of the pytorch native amp, you can either configure the ``fp16`` entry in the
configuration file, or use the following command line arguments: ``--fp16 --fp16_backend amp``.
Here is an example of the ``fp16`` configuration:
@@ -924,31 +496,8 @@ Here is an example of the ``amp`` configuration:
}
Gradient Accumulation
=======================================================================================================================
While normally DeepSpeed gets gradient accumulation configured with:
.. code-block:: json
{
"gradient_accumulation_steps": 3,
}
in this case, to enable gradient accumulation, pass the command line `--gradient_accumulation_steps` argument as normal
and it will get injected into the DeepSpeed configuration.
If you try to add it directly to the configuration file, you will receive an error from the Trainer - this is because
this setting is needed by the Trainer too, and so this approach ensures that there is a single way of setting this
value and thus avoid potential subtle errors.
Gradient Clipping
=======================================================================================================================
**Gradient Clipping:**
If you don't configure the ``gradient_clipping`` entry in the configuration file, the :class:`~transformers.Trainer`
will use the value of the ``--max_grad_norm`` command line argument to set it.
@@ -963,30 +512,22 @@ Here is an example of the ``gradient_clipping`` configuration:
Notes
=======================================================================================================================
**Notes:**
* DeepSpeed works with the PyTorch :class:`~transformers.Trainer` but not TF :class:`~transformers.TFTrainer`.
* While DeepSpeed has a pip installable PyPI package, it is highly recommended that it gets installed from `source
<https://github.com/microsoft/deepspeed#installation>`__ to best match your hardware and also if you need to enable
certain features, like 1-bit Adam, which aren't available in the pypi distribution.
* You don't have to use the :class:`~transformers.Trainer` to use DeepSpeed with 🤗 Transformers - you can use any model
with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration instructions
<https://www.deepspeed.ai/getting-started/#writing-deepspeed-models>`__.
* You don't have to use the :class:`~transformers.Trainer` to use DeepSpeed with HuggingFace ``transformers`` - you can
use any model with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration
instructions <https://www.deepspeed.ai/getting-started/#writing-deepspeed-models>`__.
Main DeepSpeed Resources
=======================================================================================================================
**Main DeepSpeed Resources:**
- `Project's github <https://github.com/microsoft/deepspeed>`__
- `github <https://github.com/microsoft/deepspeed>`__
- `Usage docs <https://www.deepspeed.ai/getting-started/>`__
- `API docs <https://deepspeed.readthedocs.io/en/latest/index.html>`__
- `Blog posts <https://www.microsoft.com/en-us/research/search/?q=deepspeed>`__
Papers:
- `ZeRO: Memory Optimizations Toward Training Trillion Parameter Models <https://arxiv.org/abs/1910.02054>`__
- `ZeRO-Offload: Democratizing Billion-Scale Model Training <https://arxiv.org/abs/2101.06840>`__
Finally, please, remember that, HuggingFace :class:`~transformers.Trainer` only integrates DeepSpeed, therefore if you
have any problems or questions with regards to DeepSpeed usage, please, file an issue with `DeepSpeed GitHub
have any problems or questions with regards to DeepSpeed usage, please, file an issue with `DeepSpeed github
<https://github.com/microsoft/DeepSpeed/issues>`__.

View File

@@ -189,52 +189,3 @@ FlaxAutoModel
.. autoclass:: transformers.FlaxAutoModel
:members:
FlaxAutoModelForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxAutoModelForPreTraining
:members:
FlaxAutoModelForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxAutoModelForMaskedLM
:members:
FlaxAutoModelForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxAutoModelForSequenceClassification
:members:
FlaxAutoModelForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxAutoModelForQuestionAnswering
:members:
FlaxAutoModelForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxAutoModelForTokenClassification
:members:
FlaxAutoModelForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxAutoModelForMultipleChoice
:members:
FlaxAutoModelForNextSentencePrediction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxAutoModelForNextSentencePrediction
:members:

View File

@@ -130,12 +130,6 @@ BartForQuestionAnswering
.. autoclass:: transformers.BartForQuestionAnswering
:members: forward
BartForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartForCausalLM
:members: forward
TFBartModel

View File

@@ -209,50 +209,8 @@ FlaxBertModel
:members: __call__
FlaxBertForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBertForPreTraining
:members: __call__
FlaxBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBertForMaskedLM
:members: __call__
FlaxBertForNextSentencePrediction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBertForNextSentencePrediction
:members: __call__
FlaxBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBertForSequenceClassification
:members: __call__
FlaxBertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBertForMultipleChoice
:members: __call__
FlaxBertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBertForTokenClassification
:members: __call__
FlaxBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBertForQuestionAnswering
:members: __call__

View File

@@ -1,130 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
BigBird
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The BigBird model was proposed in `Big Bird: Transformers for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by
Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon,
Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a sparse-attention
based transformer which extends Transformer based models, such as BERT to much longer sequences. In addition to sparse
attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it
has been shown that applying sparse, global, and random attention approximates full attention, while being
computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context,
BigBird has shown improved performance on various long document NLP tasks, such as question answering and
summarization, compared to BERT or RoBERTa.
The abstract from the paper is the following:
*Transformers-based models, such as BERT, have been one of the most successful deep learning models for NLP.
Unfortunately, one of their core limitations is the quadratic dependency (mainly in terms of memory) on the sequence
length due to their full attention mechanism. To remedy this, we propose, BigBird, a sparse attention mechanism that
reduces this quadratic dependency to linear. We show that BigBird is a universal approximator of sequence functions and
is Turing complete, thereby preserving these properties of the quadratic, full attention model. Along the way, our
theoretical analysis reveals some of the benefits of having O(1) global tokens (such as CLS), that attend to the entire
sequence as part of the sparse attention mechanism. The proposed sparse attention can handle sequences of length up to
8x of what was previously possible using similar hardware. As a consequence of the capability to handle longer context,
BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also
propose novel applications to genomics data.*
Tips:
- For an in-detail explanation on how BigBird's attention works, see `this blog post
<https://huggingface.co/blog/big-bird>`__.
- BigBird comes with 2 implementations: **original_full** & **block_sparse**. For the sequence length < 1024, using
**original_full** is advised as there is no benefit in using **block_sparse** attention.
- The code currently uses window size of 3 blocks and 2 global blocks.
- Sequence length must be divisible by block size.
- Current implementation supports only **ITC**.
- Current implementation doesn't support **num_random_blocks = 0**
The original code can be found `here <https://github.com/google-research/bigbird>`__.
BigBirdConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BigBirdConfig
:members:
BigBirdTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BigBirdTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
BigBird specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.big_bird.modeling_big_bird.BigBirdForPreTrainingOutput
:members:
BigBirdModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BigBirdModel
:members: forward
BigBirdForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BigBirdForPreTraining
:members: forward
BigBirdForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BigBirdForCausalLM
:members: forward
BigBirdForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BigBirdForMaskedLM
:members: forward
BigBirdForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BigBirdForSequenceClassification
:members: forward
BigBirdForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BigBirdForMultipleChoice
:members: forward
BigBirdForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BigBirdForTokenClassification
:members: forward
BigBirdForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BigBirdForQuestionAnswering
:members: forward

View File

@@ -98,13 +98,6 @@ See :obj:`transformers.BartForConditionalGeneration` for arguments to `forward`
:members: forward
BlenderbotForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BlenderbotForCausalLM
:members: forward
TFBlenderbotModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -70,13 +70,6 @@ BlenderbotSmallForConditionalGeneration
:members: forward
BlenderbotSmallForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BlenderbotSmallForCausalLM
:members: forward
TFBlenderbotSmallModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -1,46 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
BORT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The BORT model was proposed in `Optimal Subarchitecture Extraction for BERT <https://arxiv.org/abs/2010.10499>`__ by
Adrian de Wynter and Daniel J. Perry. It is an optimal subset of architectural parameters for the BERT, which the
authors refer to as "Bort".
The abstract from the paper is the following:
*We extract an optimal subset of architectural parameters for the BERT architecture from Devlin et al. (2018) by
applying recent breakthroughs in algorithms for neural architecture search. This optimal subset, which we refer to as
"Bort", is demonstrably smaller, having an effective (that is, not counting the embedding layer) size of 5.5% the
original BERT-large architecture, and 16% of the net size. Bort is also able to be pretrained in 288 GPU hours, which
is 1.2% of the time required to pretrain the highest-performing BERT parametric architectural variant, RoBERTa-large
(Liu et al., 2019), and about 33% of that of the world-record, in GPU hours, required to train BERT-large on the same
hardware. It is also 7.9x faster on a CPU, as well as being better performing than other compressed variants of the
architecture, and some of the non-compressed variants: it obtains performance improvements of between 0.3% and 31%,
absolute, with respect to BERT-large, on multiple public natural language understanding (NLU) benchmarks.*
Tips:
- BORT's model architecture is based on BERT, so one can refer to :doc:`BERT's documentation page <bert>` for the
model's API as well as usage examples.
- BORT uses the RoBERTa tokenizer instead of the BERT tokenizer, so one can refer to :doc:`RoBERTa's documentation page
<roberta>` for the tokenizer's API as well as usage examples.
- BORT requires a specific fine-tuning algorithm, called `Agora
<https://adewynter.github.io/notes/bort_algorithms_and_applications.html#fine-tuning-with-algebraic-topology>`__ ,
that is sadly not open-sourced yet. It would be very useful for the community, if someone tries to implement the
algorithm to make BORT fine-tuning work.
The original code can be found `here <https://github.com/alexa/bort/>`__.

View File

@@ -1,144 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
ConvBERT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The ConvBERT model was proposed in `ConvBERT: Improving BERT with Span-based Dynamic Convolution
<https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng
Yan.
The abstract from the paper is the following:
*Pre-trained language models like BERT and its variants have recently achieved impressive performance in various
natural language understanding tasks. However, BERT heavily relies on the global self-attention block and thus suffers
large memory footprint and computation cost. Although all its attention heads query on the whole input sequence for
generating the attention map from a global perspective, we observe some heads only need to learn local dependencies,
which means the existence of computation redundancy. We therefore propose a novel span-based dynamic convolution to
replace these self-attention heads to directly model local dependencies. The novel convolution heads, together with the
rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context
learning. We equip BERT with this mixed attention design and build a ConvBERT model. Experiments have shown that
ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and
fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while
using less than 1/4 training cost. Code and pre-trained models will be released.*
ConvBERT training tips are similar to those of BERT. The original implementation can be found here:
https://github.com/yitu-opensource/ConvBert
ConvBertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertConfig
:members:
ConvBertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
ConvBertTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertTokenizerFast
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
ConvBertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertModel
:members: forward
ConvBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertForMaskedLM
:members: forward
ConvBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertForSequenceClassification
:members: forward
ConvBertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertForMultipleChoice
:members: forward
ConvBertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertForTokenClassification
:members: forward
ConvBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ConvBertForQuestionAnswering
:members: forward
TFConvBertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFConvBertModel
:members: call
TFConvBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFConvBertForMaskedLM
:members: call
TFConvBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFConvBertForSequenceClassification
:members: call
TFConvBertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFConvBertForMultipleChoice
:members: call
TFConvBertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFConvBertForTokenClassification
:members: call
TFConvBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFConvBertForQuestionAnswering
:members: call

View File

@@ -60,7 +60,7 @@ DebertaModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaModel
:members: forward
:members:
DebertaPreTrainedModel
@@ -70,29 +70,8 @@ DebertaPreTrainedModel
:members:
DebertaForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaForMaskedLM
:members: forward
DebertaForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaForSequenceClassification
:members: forward
DebertaForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaForTokenClassification
:members: forward
DebertaForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaForQuestionAnswering
:members: forward
:members:

View File

@@ -1,118 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
DeBERTa-v2
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
<https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google's
BERT model released in 2018 and Facebook's RoBERTa model released in 2019.
It builds on RoBERTa with disentangled attention and enhanced mask decoder training with half of the data used in
RoBERTa.
The abstract from the paper is the following:
*Recent progress in pre-trained neural language models has significantly improved the performance of many natural
language processing (NLP) tasks. In this paper we propose a new model architecture DeBERTa (Decoding-enhanced BERT with
disentangled attention) that improves the BERT and RoBERTa models using two novel techniques. The first is the
disentangled attention mechanism, where each word is represented using two vectors that encode its content and
position, respectively, and the attention weights among words are computed using disentangled matrices on their
contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*
The following information is visible directly on the [original implementation
repository](https://github.com/microsoft/DeBERTa). DeBERTa v2 is the second version of the DeBERTa model. It includes
the 1.5B model used for the SuperGLUE single-model submission and achieving 89.9, versus human baseline 89.8. You can
find more details about this submission in the authors'
[blog](https://www.microsoft.com/en-us/research/blog/microsoft-deberta-surpasses-human-performance-on-the-superglue-benchmark/)
New in v2:
- **Vocabulary** In v2 the tokenizer is changed to use a new vocabulary of size 128K built from the training data.
Instead of a GPT2-based tokenizer, the tokenizer is now
[sentencepiece-based](https://github.com/google/sentencepiece) tokenizer.
- **nGiE(nGram Induced Input Encoding)** The DeBERTa-v2 model uses an additional convolution layer aside with the first
transformer layer to better learn the local dependency of input tokens.
- **Sharing position projection matrix with content projection matrix in attention layer** Based on previous
experiments, this can save parameters without affecting the performance.
- **Apply bucket to encode relative postions** The DeBERTa-v2 model uses log bucket to encode relative positions
similar to T5.
- **900M model & 1.5B model** Two additional model sizes are available: 900M and 1.5B, which significantly improves the
performance of downstream tasks.
The original code can be found `here <https://github.com/microsoft/DeBERTa>`__.
DebertaV2Config
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaV2Config
:members:
DebertaV2Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaV2Tokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
DebertaV2Model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaV2Model
:members: forward
DebertaV2PreTrainedModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaV2PreTrainedModel
:members: forward
DebertaV2ForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaV2ForMaskedLM
:members: forward
DebertaV2ForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaV2ForSequenceClassification
:members: forward
DebertaV2ForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaV2ForTokenClassification
:members: forward
DebertaV2ForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaV2ForQuestionAnswering
:members: forward

View File

@@ -48,6 +48,7 @@ modeling. We first concatenate all dialog turns within a dialogue session into a
sequence length), ended by the end-of-text token.* For more information please confer to the original paper.
DialoGPT's architecture is based on the GPT2 model, so one can refer to :doc:`GPT2's documentation page <gpt2>`.
DialoGPT's architecture is based on the GPT2 model, so one can refer to GPT2's `docstring
<https://huggingface.co/transformers/model_doc/gpt2.html>`_.
The original code can be found `here <https://github.com/microsoft/DialoGPT>`_.

View File

@@ -56,7 +56,7 @@ FSMTTokenizer
.. autoclass:: transformers.FSMTTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary
FSMTModel

View File

@@ -50,7 +50,7 @@ The original code can be found `here <https://github.com/openai/finetune-transfo
Note:
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install ``ftfy``
and ``SpaCy``:
and ``SpaCy``::
.. code-block:: bash

View File

@@ -1,65 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
GPT Neo
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The GPTNeo model was released in the `EleutherAI/gpt-neo <https://github.com/EleutherAI/gpt-neo>`__ repository by Sid
Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. It is a GPT2 like causal language model trained on the
`Pile <https://pile.eleuther.ai/>`__ dataset.
The architecture is similar to GPT2 except that GPT Neo uses local attention in every other layer with a window size of
256 tokens.
Generation
_______________________________________________________________________________________________________________________
The :obj:`generate()` method can be used to generate text using GPT Neo model.
.. code-block::
>>> from transformers import GPTNeoForCausalLM, GPT2Tokenizer
>>> model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
>>> tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
>>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
... "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
... "researchers was the fact that the unicorns spoke perfect English."
>>> input_ids = tokenizer(unicorns, return_tensors="pt").input_ids
>>> gen_tokens = model.generate(ids, do_sample=True, temperature=0.9, max_length=100,)
>>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
GPTNeoConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPTNeoConfig
:members:
GPTNeoModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPTNeoModel
:members: forward
GPTNeoForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPTNeoForCausalLM
:members: forward

View File

@@ -1,88 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
I-BERT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The I-BERT model was proposed in `I-BERT: Integer-only BERT Quantization <https://arxiv.org/abs/2101.01321>`__ by
Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney and Kurt Keutzer. It's a quantized version of RoBERTa running
inference up to four times faster.
The abstract from the paper is the following:
*Transformer based models, like BERT and RoBERTa, have achieved state-of-the-art results in many Natural Language
Processing tasks. However, their memory footprint, inference latency, and power consumption are prohibitive for
efficient inference at the edge, and even at the data center. While quantization can be a viable solution for this,
previous work on quantizing Transformer based models use floating-point arithmetic during inference, which cannot
efficiently utilize integer-only logical units such as the recent Turing Tensor Cores, or traditional integer-only ARM
processors. In this work, we propose I-BERT, a novel quantization scheme for Transformer based models that quantizes
the entire inference with integer-only arithmetic. Based on lightweight integer-only approximation methods for
nonlinear operations, e.g., GELU, Softmax, and Layer Normalization, I-BERT performs an end-to-end integer-only BERT
inference without any floating point calculation. We evaluate our approach on GLUE downstream tasks using
RoBERTa-Base/Large. We show that for both cases, I-BERT achieves similar (and slightly higher) accuracy as compared to
the full-precision baseline. Furthermore, our preliminary implementation of I-BERT shows a speedup of 2.4 - 4.0x for
INT8 inference on a T4 GPU system as compared to FP32 inference. The framework has been developed in PyTorch and has
been open-sourced.*
The original code can be found `here <https://github.com/kssteven418/I-BERT>`__.
IBertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.IBertConfig
:members:
IBertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.IBertModel
:members: forward
IBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.IBertForMaskedLM
:members: forward
IBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.IBertForSequenceClassification
:members: forward
IBertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.IBertForMultipleChoice
:members: forward
IBertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.IBertForTokenClassification
:members: forward
IBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.IBertForQuestionAnswering
:members: forward

View File

@@ -130,31 +130,3 @@ LayoutLMForTokenClassification
.. autoclass:: transformers.LayoutLMForTokenClassification
:members:
TFLayoutLMModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLayoutLMModel
:members:
TFLayoutLMForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLayoutLMForMaskedLM
:members:
TFLayoutLMForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLayoutLMForSequenceClassification
:members:
TFLayoutLMForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLayoutLMForTokenClassification
:members:

View File

@@ -1,128 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
M2M100
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The M2M100 model was proposed in `Beyond English-Centric Multilingual Machine Translation
<https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky,
Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy
Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
The abstract from the paper is the following:
*Existing work in translation demonstrated the potential of massively multilingual machine translation by training a
single model able to translate between any pair of languages. However, much of this work is English-Centric by training
only on data which was translated from or to English. While this is supported by large sources of training data, it
does not reflect translation needs worldwide. In this work, we create a true Many-to-Many multilingual translation
model that can translate directly between any pair of 100 languages. We build and open source a training dataset that
covers thousands of language directions with supervised data, created through large-scale mining. Then, we explore how
to effectively increase model capacity through a combination of dense scaling and language-specific sparse parameters
to create high quality models. Our focus on non-English-Centric models brings gains of more than 10 BLEU when directly
translating between non-English directions while performing competitively to the best single systems of WMT. We
open-source our scripts so that others may reproduce the data, evaluation, and final M2M-100 model.*
Training and Generation
_______________________________________________________________________________________________________________________
M2M100 is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation tasks. As the model is
multilingual it expects the sequences in a certain format: A special language id token is used as prefix in both the
source and target text. The source text format is :obj:`[lang_code] X [eos]`, where :obj:`lang_code` is source language
id for source text and target language id for target text, with :obj:`X` being the source or target text.
The :class:`~transformers.M2M100Tokenizer` depends on :obj:`sentencepiece` so be sure to install it before running the
examples. To install :obj:`sentencepiece` run ``pip install sentencepiece``.
- Supervised Training
.. code-block::
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="fr")
src_text = "Life is like a box of chocolates."
tgt_lang = "La vie est comme une boîte de chocolat."
model_inputs = tokenizer(src_text, return_tensors="pt")
with tokenizer.as_target_tokenizer():
labels = tokenizer(tgt_text, return_tensors="pt").input_ids
loss = model(**model_inputs, labels=labels) # forward pass
- Generation
M2M100 uses the :obj:`eos_token_id` as the :obj:`decoder_start_token_id` for generation with the target language id
being forced as the first generated token. To force the target language id as the first generated token, pass the
`forced_bos_token_id` parameter to the `generate` method. The following example shows how to translate between
Hindi to French and Chinese to English using the `facebook/m2m100_418M` checkpoint.
.. code-block::
>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
>>> hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।"
>>> chinese_text = "生活就像一盒巧克力。"
>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
>>> # translate Hindi to French
>>> tokenizer.src_lang = "hi"
>>> encoded_hi = tokenizer(hi_text, return_tensors="pt")
>>> generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("fr"))
>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
"La vie est comme une boîte de chocolat."
>>> # translate Chinese to English
>>> tokenizer.src_lang = "zh"
>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt")
>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
"Life is like a box of chocolate."
M2M100Config
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.M2M100Config
:members:
M2M100Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.M2M100Tokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
M2M100Model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.M2M100Model
:members: forward
M2M100ForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.M2M100ForConditionalGeneration
:members: forward

View File

@@ -76,29 +76,27 @@ require 3 character language codes:
.. code-block:: python
>>> from transformers import MarianMTModel, MarianTokenizer
>>> src_text = [
... '>>fra<< this is a sentence in english that we want to translate to french',
... '>>por<< This should go to portuguese',
... '>>esp<< And this to Spanish'
>>> ]
from transformers import MarianMTModel, MarianTokenizer
src_text = [
'>>fra<< this is a sentence in english that we want to translate to french',
'>>por<< This should go to portuguese',
'>>esp<< And this to Spanish'
]
>>> model_name = 'Helsinki-NLP/opus-mt-en-roa'
>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
>>> print(tokenizer.supported_language_codes)
['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']
>>> model = MarianMTModel.from_pretrained(model_name)
>>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
>>> [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
["c'est une phrase en anglais que nous voulons traduire en français",
'Isto deve ir para o português.',
'Y esto al español']
model_name = 'Helsinki-NLP/opus-mt-en-roa'
tokenizer = MarianTokenizer.from_pretrained(model_name)
print(tokenizer.supported_language_codes)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
# ["c'est une phrase en anglais que nous voulons traduire en français",
# 'Isto deve ir para o português.',
# 'Y esto al español']
Here is the code to see all available pretrained models on the hub:
Code to see available pretrained models:
.. code-block:: python
@@ -149,22 +147,21 @@ Example of translating english to many romance languages, using old-style 2 char
.. code-block::python
>>> from transformers import MarianMTModel, MarianTokenizer
>>> src_text = [
... '>>fr<< this is a sentence in english that we want to translate to french',
... '>>pt<< This should go to portuguese',
... '>>es<< And this to Spanish'
>>> ]
from transformers import MarianMTModel, MarianTokenizer
src_text = [
'>>fr<< this is a sentence in english that we want to translate to french',
'>>pt<< This should go to portuguese',
'>>es<< And this to Spanish'
]
>>> model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
>>> tokenizer = MarianTokenizer.from_pretrained(model_name)
model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
tokenizer = MarianTokenizer.from_pretrained(model_name)
print(tokenizer.supported_language_codes)
>>> model = MarianMTModel.from_pretrained(model_name)
>>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
>>> tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
["c'est une phrase en anglais que nous voulons traduire en français",
'Isto deve ir para o português.',
'Y esto al español']
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
# ["c'est une phrase en anglais que nous voulons traduire en français", 'Isto deve ir para o português.', 'Y esto al español']
@@ -179,7 +176,7 @@ MarianTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.MarianTokenizer
:members: as_target_tokenizer
:members: prepare_seq2seq_batch
MarianModel
@@ -196,13 +193,6 @@ MarianMTModel
:members: forward
MarianForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.MarianForCausalLM
:members: forward
TFMarianModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -10,14 +10,14 @@
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
MBart and MBart-50
MBart
-----------------------------------------------------------------------------------------------------------------------
**DISCLAIMER:** If you see something strange, file a `Github Issue
<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
@patrickvonplaten
Overview of MBart
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The MBart model was presented in `Multilingual Denoising Pre-training for Neural Machine Translation
@@ -31,34 +31,33 @@ on the encoder, decoder, or reconstructing parts of the text.
The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/mbart>`__
Training of MBart
Examples
_______________________________________________________________________________________________________________________
MBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for translation task. As the
model is multilingual it expects the sequences in a different format. A special language id token is added in both the
source and target text. The source text format is :obj:`X [eos, src_lang_code]` where :obj:`X` is the source text. The
target text format is :obj:`[tgt_lang_code] X [eos]`. :obj:`bos` is never used.
- Examples and scripts for fine-tuning mBART and other models for sequence to sequence tasks can be found in
:prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
- Given the large embeddings table, mBART consumes a large amount of GPU RAM, especially for fine-tuning.
:class:`MarianMTModel` is usually a better choice for bilingual machine translation.
The regular :meth:`~transformers.MBartTokenizer.__call__` will encode source text format, and it should be wrapped
inside the context manager :meth:`~transformers.MBartTokenizer.as_target_tokenizer` to encode target text format.
Training
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MBart is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation task. As the model is
multilingual it expects the sequences in a different format. A special language id token is added in both the source
and target text. The source text format is :obj:`X [eos, src_lang_code]` where :obj:`X` is the source text. The target
text format is :obj:`[tgt_lang_code] X [eos]`. :obj:`bos` is never used.
The :meth:`~transformers.MBartTokenizer.prepare_seq2seq_batch` handles this automatically and should be used to encode
the sequences for sequence-to-sequence fine-tuning.
- Supervised training
.. code-block::
>>> from transformers import MBartForConditionalGeneration, MBartTokenizer
>>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
>>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO")
>>> with tokenizer.as_target_tokenizer():
... labels = tokenizer(expected_translation_romanian, return_tensors="pt")
>>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
>>> # forward pass
>>> model(**inputs, labels=batch['labels'])
example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt")
model(input_ids=batch['input_ids'], labels=batch['labels']) # forward pass
- Generation
@@ -67,95 +66,14 @@ inside the context manager :meth:`~transformers.MBartTokenizer.as_target_tokeniz
.. code-block::
>>> from transformers import MBartForConditionalGeneration, MBartTokenizer
>>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX")
>>> article = "UN Chief Says There Is No Military Solution in Syria"
>>> inputs = tokenizer(article, return_tensors="pt")
>>> translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
>>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
"Şeful ONU declară că nu există o soluţie militară în Siria"
Overview of MBart-50
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
MBart-50 was introduced in the `Multilingual Translation with Extensible Multilingual Pretraining and Finetuning
<https://arxiv.org/abs/2008.00401>` paper by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav
Chaudhary, Jiatao Gu, Angela Fan. MBart-50 is created using the original `mbart-large-cc25` checkpoint by extendeding
its embedding layers with randomly initialized vectors for an extra set of 25 language tokens and then pretrained on 50
languages.
According to the abstract
*Multilingual translation models can be created through multilingual finetuning. Instead of finetuning on one
direction, a pretrained model is finetuned on many directions at the same time. It demonstrates that pretrained models
can be extended to incorporate additional languages without loss of performance. Multilingual finetuning improves on
average 1 BLEU over the strongest baselines (being either multilingual from scratch or bilingual finetuning) while
improving 9.3 BLEU on average over bilingual baselines from scratch.*
Training of MBart-50
_______________________________________________________________________________________________________________________
The text format for MBart-50 is slightly different from mBART. For MBart-50 the language id token is used as a prefix
for both source and target text i.e the text format is :obj:`[lang_code] X [eos]`, where :obj:`lang_code` is source
language id for source text and target language id for target text, with :obj:`X` being the source or target text
respectively.
MBart-50 has its own tokenizer :class:`~transformers.MBart50Tokenizer`.
- Supervised training
.. code-block::
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
src_text = " UN Chief Says There Is No Military Solution in Syria"
tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
model_inputs = tokenizer(src_text, return_tensors="pt")
with tokenizer.as_target_tokenizer():
labels = tokenizer(tgt_text, return_tensors="pt").input_ids
model(**model_inputs, labels=labels) # forward pass
- Generation
To generate using the mBART-50 multilingual translation models, :obj:`eos_token_id` is used as the
:obj:`decoder_start_token_id` and the target language id is forced as the first generated token. To force the
target language id as the first generated token, pass the `forced_bos_token_id` parameter to the `generate` method.
The following example shows how to translate between Hindi to French and Arabic to English using the
`facebook/mbart-50-large-many-to-many` checkpoint.
.. code-block::
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# translate Hindi to French
tokenizer.src_lang = "hi_IN"
encoded_hi = tokenizer(article_hi, return_tensors="pt")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire en Syria."
# translate Arabic to English
tokenizer.src_lang = "ar_AR"
encoded_ar = tokenizer(article_ar, return_tensors="pt")
generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "The Secretary-General of the United Nations says there is no military solution in Syria."
from transformers import MBartForConditionalGeneration, MBartTokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
article = "UN Chief Says There Is No Military Solution in Syria"
batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX", return_tensors="pt")
translated_tokens = model.generate(**batch, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria"
MBartConfig
@@ -169,7 +87,7 @@ MBartTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.MBartTokenizer
:members: as_target_tokenizer, build_inputs_with_special_tokens
:members: build_inputs_with_special_tokens, prepare_seq2seq_batch
MBartTokenizerFast
@@ -179,20 +97,6 @@ MBartTokenizerFast
:members:
MBart50Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.MBart50Tokenizer
:members:
MBart50TokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.MBart50TokenizerFast
:members:
MBartModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -220,13 +124,6 @@ MBartForSequenceClassification
.. autoclass:: transformers.MBartForSequenceClassification
MBartForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.MBartForCausalLM
:members: forward
TFMBartModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -51,8 +51,8 @@ All the `checkpoints <https://huggingface.co/models?search=pegasus>`__ are fine-
Examples
_______________________________________________________________________________________________________________________
- :prefix_link:`Script <examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh>` to fine-tune pegasus
on the XSUM dataset. Data download instructions at :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
- :prefix_link:`Script <examples/seq2seq/finetune_pegasus_xsum.sh>` to fine-tune pegasus on the XSUM dataset. Data
download instructions at :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
- FP16 is not supported (help/ideas on this appreciated!).
- The adafactor optimizer is recommended for pegasus fine-tuning.
@@ -78,20 +78,20 @@ Usage Example
.. code-block:: python
>>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer
>>> import torch
>>> src_text = [
... """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
>>> ]
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
src_text = [
""" PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
]
>>> model_name = 'google/pegasus-xsum'
>>> device = 'cuda' if torch.cuda.is_available() else 'cpu'
>>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
>>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
>>> batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
>>> translated = model.generate(**batch)
>>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
>>> assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
model_name = 'google/pegasus-xsum'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
@@ -107,7 +107,7 @@ PegasusTokenizer
warning: ``add_tokens`` does not work at the moment.
.. autoclass:: transformers.PegasusTokenizer
:members:
:members: __call__, prepare_seq2seq_batch
PegasusTokenizerFast
@@ -131,13 +131,6 @@ PegasusForConditionalGeneration
:members: forward
PegasusForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PegasusForCausalLM
:members: forward
TFPegasusModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@@ -56,7 +56,7 @@ RagTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.RagTokenizer
:members:
:members: prepare_seq2seq_batch
Rag specific outputs
@@ -94,24 +94,3 @@ RagTokenForGeneration
.. autoclass:: transformers.RagTokenForGeneration
:members: forward, generate
TFRagModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFRagModel
:members: call
TFRagSequenceForGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFRagSequenceForGeneration
:members: call, generate
TFRagTokenForGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFRagTokenForGeneration
:members: call, generate

View File

@@ -1,152 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Speech2Text
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Speech2Text model was proposed in `fairseq S2T: Fast Speech-to-Text Modeling with fairseq
<https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. It's a
transformer-based seq2seq (encoder-decoder) model designed for end-to-end Automatic Speech Recognition (ASR) and Speech
Translation (ST). It uses a convolutional downsampler to reduce the length of speech inputs by 3/4th before they are
fed into the encoder. The model is trained with standard autoregressive cross-entropy loss and generates the
transcripts/translations autoregressively. Speech2Text has been fine-tuned on several datasets for ASR and ST:
`LibriSpeech <http://www.openslr.org/12>`__, `CoVoST 2 <https://github.com/facebookresearch/covost>`__, `MuST-C
<https://ict.fbk.eu/must-c/>`__.
The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text>`__.
Inference
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Speech2Text is a speech model that accepts a float tensor of log-mel filter-bank features extracted from the speech
signal. It's a transformer-based seq2seq model, so the transcripts/translations are generated autoregressively. The
:obj:`generate()` method can be used for inference.
The :class:`~transformers.Speech2TextFeatureExtractor` class is responsible for extracting the log-mel filter-bank
features. The :class:`~transformers.Speech2TextProcessor` wraps :class:`~transformers.Speech2TextFeatureExtractor` and
:class:`~transformers.Speech2TextTokenizer` into a single instance to both extract the input features and decode the
predicted token ids.
The feature extractor depends on :obj:`torchaudio` and the tokenizer depends on :obj:`sentencepiece` so be sure to
install those packages before running the examples. You could either install those as extra speech dependancies with
``pip install transformers"[speech, sentencepiece]"`` or install the packages seperatly with ``pip install torchaudio
sentencepiece``. Also ``torchaudio`` requires the development version of the `libsndfile
<http://www.mega-nerd.com/libsndfile/>`__ package which can be installed via a system package manager. On Ubuntu it can
be installed as follows: ``apt install libsndfile1-dev``
- ASR and Speech Translation
.. code-block::
>>> import torch
>>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
>>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
>>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask"])
>>> transcription = processor.batch_decode(generated_ids)
- Multilingual speech translation
For multilingual speech translation models, :obj:`eos_token_id` is used as the :obj:`decoder_start_token_id` and
the target language id is forced as the first generated token. To force the target language id as the first
generated token, pass the :obj:`forced_bos_token_id` parameter to the :obj:`generate()` method. The following
example shows how to transate English speech to French text using the `facebook/s2t-medium-mustc-multilingual-st`
checkpoint.
.. code-block::
>>> import torch
>>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
>>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
>>> def map_to_array(batch):
... speech, _ = sf.read(batch["file"])
... batch["speech"] = speech
... return batch
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
>>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask], forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"])
>>> translation = processor.batch_decode(generated_ids)
See the `model hub <https://huggingface.co/models?filter=speech_to_text>`__ to look for Speech2Text checkpoints.
Speech2TextConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Speech2TextConfig
:members:
Speech2TextTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Speech2TextTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
Speech2TextFeatureExtractor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Speech2TextFeatureExtractor
:members: __call__
Speech2TextProcessor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Speech2TextProcessor
:members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
Speech2TextModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Speech2TextModel
:members: forward
Speech2TextForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Speech2TextForConditionalGeneration
:members: forward

View File

@@ -104,7 +104,7 @@ T5Tokenizer
.. autoclass:: transformers.T5Tokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary
T5TokenizerFast

View File

@@ -1,102 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Vision Transformer (ViT)
-----------------------------------------------------------------------------------------------------------------------
.. note::
This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight
breaking changes to fix it in the future. If you see something strange, file a `Github Issue
<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Vision Transformer (ViT) model was proposed in `An Image is Worth 16x16 Words: Transformers for Image Recognition
at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk
Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob
Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining
very good results compared to familiar convolutional architectures.
The abstract from the paper is the following:
*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its
applications to computer vision remain limited. In vision, attention is either applied in conjunction with
convolutional networks, or used to replace certain components of convolutional networks while keeping their overall
structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to
sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of
data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.),
Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring
substantially fewer computational resources to train.*
Tips:
- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches,
which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be
used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of
vectors to a standard Transformer encoder.
- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to
use a higher resolution than pre-training `(Touvron et al., 2019) <https://arxiv.org/abs/1906.06423>`__, `(Kolesnikov
et al., 2020) <https://arxiv.org/abs/1912.11370>`__. The authors report the best results with a resolution of 384x384
during fine-tuning.
- As the Vision Transformer expects each image to be of the same size (resolution), one can use
:class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images for the model.
- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of
each checkpoint. For example, :obj:`google/vit-base-patch16-224` refers to a base-sized architecture with patch
resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the `hub
<https://huggingface.co/models?search=vit>`__.
- The available checkpoints are either (1) pre-trained on `ImageNet-21k <http://www.image-net.org/>`__ (a collection of
14 million images and 21k classes) only, or (2) also fine-tuned on `ImageNet
<http://www.image-net.org/challenges/LSVRC/2012/>`__ (also referred to as ILSVRC 2012, a collection of 1.3 million
images and 1,000 classes).
- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed
an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked
language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
improvement of 2% to training from scratch, but still 4% behind supervised pre-training.
The original code (written in JAX) can be found `here <https://github.com/google-research/vision_transformer>`__.
Note that we converted the weights from Ross Wightman's `timm library
<https://github.com/rwightman/pytorch-image-models>`__, who already converted the weights from JAX to PyTorch. Credits
go to him!
ViTConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ViTConfig
:members:
ViTFeatureExtractor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ViTFeatureExtractor
:members: __call__
ViTModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ViTModel
:members: forward
ViTForImageClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ViTForImageClassification
:members: forward

View File

@@ -1,79 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Wav2Vec2
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Wav2Vec2 model was proposed in `wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations
<https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
The abstract from the paper is the following:
*We show for the first time that learning powerful representations from speech audio alone followed by fine-tuning on
transcribed speech can outperform the best semi-supervised methods while being conceptually simpler. wav2vec 2.0 masks
the speech input in the latent space and solves a contrastive task defined over a quantization of the latent
representations which are jointly learned. Experiments using all labeled data of Librispeech achieve 1.8/3.3 WER on the
clean/other test sets. When lowering the amount of labeled data to one hour, wav2vec 2.0 outperforms the previous state
of the art on the 100 hour subset while using 100 times less labeled data. Using just ten minutes of labeled data and
pre-training on 53k hours of unlabeled data still achieves 4.8/8.2 WER. This demonstrates the feasibility of speech
recognition with limited amounts of labeled data.*
Tips:
- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
using :class:`~transformers.Wav2Vec2CTCTokenizer`.
Wav2Vec2Config
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Wav2Vec2Config
:members:
Wav2Vec2CTCTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Wav2Vec2CTCTokenizer
:members: __call__, save_vocabulary
Wav2Vec2FeatureExtractor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Wav2Vec2FeatureExtractor
:members: __call__
Wav2Vec2Processor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Wav2Vec2Processor
:members: __call__, pad, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
Wav2Vec2Model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Wav2Vec2Model
:members: forward
Wav2Vec2ForCTC
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Wav2Vec2ForCTC
:members: forward

View File

@@ -1,45 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
XLSR-Wav2Vec2
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The XLSR-Wav2Vec2 model was proposed in `Unsupervised Cross-Lingual Representation Learning For Speech Recognition
<https://arxiv.org/abs/2006.13979>`__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael
Auli.
The abstract from the paper is the following:
*This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw
waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over
masked latent speech representations and jointly learns a quantization of the latents shared across languages. The
resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly
outperforms monolingual pretraining. On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction
of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to
a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong
individual models. Analysis shows that the latent discrete speech representations are shared across languages with
increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing
XLSR-53, a large model pretrained in 53 languages.*
Tips:
- XLSR-Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
- XLSR-Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be
decoded using :class:`~transformers.Wav2Vec2CTCTokenizer`.
XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to :doc:`Wav2Vec2's documentation page
<wav2vec2>`.
The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec>`__.

View File

@@ -60,7 +60,7 @@ Basic steps
In order to upload a model, you'll need to first create a git repo. This repo will live on the model hub, allowing
users to clone it and you (and your organization members) to push to it.
You can create a model repo directly from `the /new page on the website <https://huggingface.co/new>`__.
You can create a model repo **directly from `the /new page on the website <https://huggingface.co/new>`__.**
Alternatively, you can use the ``transformers-cli``. The next steps describe that process:

View File

@@ -330,36 +330,6 @@ the same probabilities as the larger model. The actual objective is a combinatio
The library provides a version of the model for masked language modeling, token classification, sentence classification
and question answering.
ConvBERT
-----------------------------------------------------------------------------------------------------------------------
.. raw:: html
<a href="https://huggingface.co/models?filter=convbert">
<img alt="Models" src="https://img.shields.io/badge/All_model_pages-convbert-blueviolet">
</a>
<a href="model_doc/convbert.html">
<img alt="Doc" src="https://img.shields.io/badge/Model_documentation-convbert-blueviolet">
</a>
`ConvBERT: Improving BERT with Span-based Dynamic Convolution <https://arxiv.org/abs/1910.01108>`_, Zihang Jiang,
Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
Pre-trained language models like BERT and its variants have recently achieved impressive performance in various natural
language understanding tasks. However, BERT heavily relies on the global self-attention block and thus suffers large
memory footprint and computation cost. Although all its attention heads query on the whole input sequence for
generating the attention map from a global perspective, we observe some heads only need to learn local dependencies,
which means the existence of computation redundancy. We therefore propose a novel span-based dynamic convolution to
replace these self-attention heads to directly model local dependencies. The novel convolution heads, together with the
rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context
learning. We equip BERT with this mixed attention design and build a ConvBERT model. Experiments have shown that
ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and
fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while
using less than 1/4 training cost.
The library provides a version of the model for masked language modeling, token classification, sentence classification
and question answering.
XLM
-----------------------------------------------------------------------------------------------------------------------

View File

@@ -18,8 +18,8 @@ that the metric applies specifically to classical language models (sometimes cal
models) and is not well defined for masked language models like BERT (see :doc:`summary of the models
<model_summary>`).
Perplexity is defined as the exponentiated average negative log-likelihood of a sequence. If we have a tokenized
sequence :math:`X = (x_0, x_1, \dots, x_t)`, then the perplexity of :math:`X` is,
Perplexity is defined as the exponentiated average log-likelihood of a sequence. If we have a tokenized sequence
:math:`X = (x_0, x_1, \dots, x_t)`, then the perplexity of :math:`X` is,
.. math::

View File

@@ -139,12 +139,6 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
| | ``gpt2-xl`` | | 48-layer, 1600-hidden, 25-heads, 1558M parameters. |
| | | | OpenAI's XL-sized GPT-2 English model |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| GPTNeo | ``EleutherAI/gpt-neo-1.3B`` | | 24-layer, 2048-hidden, 16-heads, 1.3B parameters. |
| | | | EleutherAI's GPT-3 like language model. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``EleutherAI/gpt-neo-2.7B`` | | 32-layer, 2560-hidden, 20-heads, 2.7B parameters. |
| | | | EleutherAI's GPT-3 like language model. |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Transformer-XL | ``transfo-xl-wt103`` | | 18-layer, 1024-hidden, 16-heads, 257M parameters. |
| | | | English model trained on wikitext-103 |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
@@ -371,12 +365,6 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
| | ``reformer-crime-and-punishment`` | | 6-layer, 256-hidden, 2-heads, 3M parameters |
| | | | Trained on English text: Crime and Punishment novel by Fyodor Dostoyevsky. |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| M2M100 | ``facebook/m2m100_418M`` | | 24-layer, 1024-hidden, 16-heads, 418M parameters |
| | | | multilingual machine translation model for 100 languages |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``facebook/m2m100_1.2B`` | | 48-layer, 1024-hidden, 16-heads, 1.2B parameters |
| | | | multilingual machine translation model for 100 languages |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| MarianMT | ``Helsinki-NLP/opus-mt-{src}-{tgt}`` | | 12-layer, 512-hidden, 8-heads, ~74M parameter Machine translation models. Parameter counts vary depending on vocab size. |
| | | | (see `model list <https://huggingface.co/Helsinki-NLP>`_) |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
@@ -393,15 +381,6 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``facebook/mbart-large-en-ro`` | | 24-layer, 1024-hidden, 16-heads, 610M parameters |
| | | | mbart-large-cc25 model finetuned on WMT english romanian translation. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``facebook/mbart-large-50`` | | 24-layer, 1024-hidden, 16-heads, |
| | | | mBART model trained on 50 languages' monolingual corpus. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``facebook/mbart-large-50-one-to-many-mmt`` | | 24-layer, 1024-hidden, 16-heads, |
| | | | mbart-50-large model finetuned for one (English) to many multilingual machine translation covering 50 languages. |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``facebook/mbart-large-50-many-to-many-mmt`` | | 24-layer, 1024-hidden, 16-heads, |
| | | | mbart-50-large model finetuned for many to many multilingual machine translation covering 50 languages. |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| Lxmert | ``lxmert-base-uncased`` | | 9-language layers, 9-relationship layers, and 12-cross-modality layers |
| | | | 768-hidden, 12-heads (for each layer) ~ 228M parameters |
@@ -455,30 +434,15 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
| | | |
| | | (see `details <https://github.com/microsoft/unilm/tree/master/layoutlm>`__) |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| DeBERTa | ``microsoft/deberta-base`` | | 12-layer, 768-hidden, 12-heads, ~140M parameters |
| DeBERTa | ``microsoft/deberta-base`` | | 12-layer, 768-hidden, 12-heads, ~125M parameters |
| | | | DeBERTa using the BERT-base architecture |
| | | |
| | | (see `details <https://github.com/microsoft/DeBERTa>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``microsoft/deberta-large`` | | 24-layer, 1024-hidden, 16-heads, ~400M parameters |
| | ``microsoft/deberta-large`` | | 24-layer, 1024-hidden, 16-heads, ~390M parameters |
| | | | DeBERTa using the BERT-large architecture |
| | | |
| | | (see `details <https://github.com/microsoft/DeBERTa>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``microsoft/deberta-xlarge`` | | 48-layer, 1024-hidden, 16-heads, ~750M parameters |
| | | | DeBERTa XLarge with similar BERT architecture |
| | | |
| | | (see `details <https://github.com/microsoft/DeBERTa>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``microsoft/deberta-xlarge-v2`` | | 24-layer, 1536-hidden, 24-heads, ~900M parameters |
| | | | DeBERTa XLarge V2 with similar BERT architecture |
| | | |
| | | (see `details <https://github.com/microsoft/DeBERTa>`__) |
| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| | ``microsoft/deberta-xxlarge-v2`` | | 48-layer, 1536-hidden, 24-heads, ~1.5B parameters |
| | | | DeBERTa XXLarge V2 with similar BERT architecture |
| | | |
| | | (see `details <https://github.com/microsoft/DeBERTa>`__) |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
| SqueezeBERT | ``squeezebert/squeezebert-uncased`` | | 12-layer, 768-hidden, 12-heads, 51M parameters, 4.3x faster than bert-base-uncased on a smartphone. |
| | | | SqueezeBERT architecture pretrained from scratch on masked language model (MLM) and sentence order prediction (SOP) tasks. |

View File

@@ -1,393 +0,0 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Run training on Amazon SageMaker
Hugging Face and Amazon are introducing new [Hugging Face Deep Learning Containers (DLCs)](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers) to make it easier than ever to train Hugging Face Transformer models in [Amazon SageMaker](https://aws.amazon.com/sagemaker/).
To learn how to access and use the new Hugging Face DLCs with the Amazon SageMaker Python SDK, check out the guides and resources below.
---
## Deep Learning Container (DLC) overview
The Deep Learning Container are in every available where Amazon SageMaker is available. You can see the [AWS region table](https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/) for all AWS global infrastructure. To get an detailed overview of all included packages look [here in the release notes](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html).
| 🤗 Transformers version | 🤗 Datasets version | PyTorch/TensorFlow version | type | device | Python Version | Example `image_uri` |
| ----------------------- | ------------------- | -------------------------- | -------- | ------ | -------------- | --------------------------------------------------------------------------------------------------------------------------------- |
| 4.4.2 | 1.5.0 | PyTorch 1.6.0 | training | GPU | 3.6 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04` |
| 4.4.2 | 1.5.0 | TensorFlow 2.4.1 | training | GPU | 3.7 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04` |
---
## Getting Started: Train a 🤗 Transformers Model
To train a 🤗 Transformers model by using the `HuggingFace` SageMaker Python SDK you need to:
- [Prepare a training script](#prepare-a-transformers-fine-tuning-script)
- [Create a `HuggingFace` Estimator](#create-an-huggingface-estimator)
- [Run training by calling the `fit` method](#execute-training)
- [Access you model](#access-trained-model)
### Setup & Installation
Before you can train a transformers models with Amazon SageMaker you need to sign up for an AWS account. If you do not have an AWS account yet learn more [here](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-set-up.html).
After you complete these tasks you can get started using either [SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-studio-onboard.html), [SageMaker Notebook Instances](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-console.html), or a local environment. To start training locally you need configure the right [IAM permission](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).
Upgrade to the latest `sagemaker` version.
```bash
pip install sagemaker --upgrade
```
**SageMaker environment**
_Note: The execution role is intended to be available only when running a notebook within SageMaker. If you run `get_execution_role` in a notebook not on SageMaker, expect a "region" error._
```python
import sagemaker
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
```
**Local environment**
```python
import sagemaker
import boto3
iam_client = boto3.client('iam')
role = iam_client.get_role(RoleName='role-name-of-your-iam-role-with-right-permissions')['Role']['Arn']
sess = sagemaker.Session()
```
### Prepare a 🤗 Transformers fine-tuning script.
The training script is very similar to a training script you might run outside of SageMaker, but you can access useful properties about the training environment through various environment variables, including the following:
- `SM_MODEL_DIR`: A string that represents the path where the training job writes the model artifacts to. After training, artifacts in this directory are uploaded to S3 for model hosting. `SM_MODEL_DIR` is always set to `/opt/ml/model`.
- `SM_NUM_GPUS`: An integer representing the number of GPUs available to the host.
- `SM_CHANNEL_XXXX:` A string that represents the path to the directory that contains the input data for the specified channel. For example, if you specify two input channels in the HuggingFace estimators fit call, named `train` and `test`, the environment variables `SM_CHANNEL_TRAIN` and `SM_CHANNEL_TEST` are set.
You can find a full list of the exposed environment variables [here](https://github.com/aws/sagemaker-training-toolkit/blob/master/ENVIRONMENT_VARIABLES.md).
Later we define `hyperparameters` in the [HuggingFace Estimator](#create-an-huggingface-estimator), which are passed in as named arguments and and can be processed with the `ArgumentParser()`.
```python
import transformers
import datasets
import argparse
import os
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# hyperparameters sent by the client are passed as command-line arguments to the script.
parser.add_argument("--epochs", type=int, default=3)
parser.add_argument("--per_device_train_batch_size", type=int, default=32)
parser.add_argument("--model_name_or_path", type=str)
# Data, model, and output directories
parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
```
_Note that SageMaker doesnt support argparse actions. For example, if you want to use a boolean hyperparameter, specify `type` as `bool` in your script and provide an explicit `True` or `False` value._
For a complete example of a 🤗 Transformers training script, see [train.py](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/scripts/train.py)
### Create an HuggingFace Estimator
You run 🤗 Transformers training scripts on SageMaker by creating `HuggingFace` Estimators. The Estimator handles end-to-end Amazon SageMaker training. The training of your script is invoked when you call `fit` on a `HuggingFace` Estimator. In the Estimator you define, which fine-tuning script should be used as `entry_point`, which `instance_type` should be used, which `hyperparameters` are passed in, you can find all possible `HuggingFace` Parameter [here](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/sagemaker.huggingface.html#huggingface-estimator). and an example of a fine-tuning script [here](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/scripts/train.py).
You can find all useable `instance_types` [here](https://aws.amazon.com/de/sagemaker/pricing/).
The following code sample shows how you train a custom `HuggingFace` script `train.py`, passing in three hyperparameters (`epochs`, `per_device_train_batch_size`, and `model_name_or_path`).
```python
from sagemaker.huggingface import HuggingFace
# hyperparameters, which are passed into the training job
hyperparameters={'epochs': 1,
'per_device_train_batch_size': 32,
'model_name_or_path': 'distilbert-base-uncased'
}
# create the Estimator
huggingface_estimator = HuggingFace(
entry_point='train.py',
source_dir='./scripts',
instance_type='ml.p3.2xlarge',
instance_count=1,
role=role,
transformers_version='4.4',
pytorch_version='1.6',
py_version='py36',
hyperparameters = hyperparameters
)
```
To run the `TrainingJob` locally you can define `instance_type='local'` or `instance_type='local-gpu'` for gpu usage. _Note: this does not working within SageMaker Studio_
### Execute Training
You start your `TrainingJob` by calling `fit` on a `HuggingFace` Estimator. In the `fit` method you specify your input training data, like a string S3 URI `s3://my-bucket/my-training-data` or a `FileSystemInput` for [EFS or FSx Lustre](https://sagemaker.readthedocs.io/en/stable/overview.html?highlight=FileSystemInput#use-file-systems-as-training-inputs), see [here](https://sagemaker.readthedocs.io/en/stable/overview.html?highlight=FileSystemInput#use-file-systems-as-training-inputs).
```python
huggingface_estimator.fit(
{'train': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/train',
'test': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/test'}
)
```
SageMaker takes care of starting and managing all the required ec2 instances for ands starts the training job by running.
```bash
/opt/conda/bin/python train.py --epochs 1 --model_name_or_path distilbert-base-uncased --per_device_train_batch_size 32
```
### Access trained model
After training is done you can access your model either through the [AWS console](https://console.aws.amazon.com/console/home?nc2=h_ct&src=header-signin) or downloading it directly from S3.
```python
from sagemaker.s3 import S3Downloader
S3Downloader.download(
s3_uri=huggingface_estimator.model_data, # s3 uri where the trained model is located
local_path='.', # local path where *.targ.gz is saved
sagemaker_session=sess # sagemaker session used for training the model
)
```
---
## Sample Notebooks
You can find here a list of the official notebooks provided by Hugging Face.
| Notebook | Description |
| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
| [Getting Started Pytorch](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb) | End-to-End binary Text-Classification example using `Trainer` and `imdb` dataset |
| [Getting Started Tensorflow](https://github.com/huggingface/notebooks/blob/master/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb) | End-to-End binary Text-Classification example using `Keras` and `imdb` dataset |
| [Distributed Training Data Parallelism](https://github.com/huggingface/notebooks/blob/master/sagemaker/03_distributed_training_data_parallelism/sagemaker-notebook.ipynb) | End-to-End distributed Question-Answering example using `Trainer` and 🤗 Transformers example script for `SQAuD` |
| [Distributed Training Model Parallelism](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb) | End-to-End model parallelism example using `SageMakerTrainer` and `run_glue.py` script |
| [Spot Instances and continues training](https://github.com/huggingface/notebooks/blob/master/sagemaker/05_spot_instances/sagemaker-notebook.ipynb) | End-to-End to Text-Classification example using spot instances with continued training. |
| [SageMaker Metrics](https://github.com/huggingface/notebooks/blob/master/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb) | End-to-End to Text-Classification example using SageMaker Metrics to extract and log metrics during training |
| [Distributed Training Data Parallelism Tensorflow](https://github.com/huggingface/notebooks/blob/master/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb) | End-to-End distributed binary Text-Classification example using `Keras` and `TensorFlow`
| [Distributed Seq2Seq Training with Data Parallelism and BART](https://github.com/huggingface/notebooks/blob/master/sagemaker/08_distributed_summarization_bart_t5/sagemaker-notebook.ipynb) | End-to-End distributed summarization example `BART-large` and 🤗 Transformers example script for `summarization` |
---
## Advanced Features
In addition to the Deep Learning Container and the SageMaker SDK, we have implemented other additional features.
### Distributed Training: Data-Parallel
You can use [SageMaker Data Parallelism Library](https://aws.amazon.com/blogs/aws/managed-data-parallelism-in-amazon-sagemaker-simplifies-training-on-large-datasets/) out of the box for distributed training. We added the functionality of Data Parallelism directly into the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html). If your `train.py` uses the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) API you only need to define the distribution parameter in the HuggingFace Estimator.
- [Example Notebook PyTorch](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)
- [Example Notebook TensorFlow](https://github.com/huggingface/notebooks/blob/master/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb)
```python
# configuration for running training on smdistributed Data Parallel
distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}
# create the Estimator
huggingface_estimator = HuggingFace(
entry_point='train.py',
source_dir='./scripts',
instance_type='ml.p3dn.24xlarge',
instance_count=2,
role=role,
transformers_version='4.4.2',
pytorch_version='1.6.0',
py_version='py36',
hyperparameters = hyperparameters
distribution = distribution
)
```
### Distributed Training: Model-Parallel
You can use [SageMaker Model Parallelism Library](https://aws.amazon.com/blogs/aws/amazon-sagemaker-simplifies-training-deep-learning-models-with-billions-of-parameters/) out of the box for distributed training. We added the functionality of Model Parallelism directly into the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html). If your `train.py` uses the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) API you only need to define the distribution parameter in the HuggingFace Estimator.
For detailed information about the adjustments take a look [here](https://sagemaker.readthedocs.io/en/stable/api/training/smd_model_parallel_general.html?highlight=modelparallel#required-sagemaker-python-sdk-parameters).
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)
```python
# configuration for running training on smdistributed Model Parallel
mpi_options = {
"enabled" : True,
"processes_per_host" : 8
}
smp_options = {
"enabled":True,
"parameters": {
"microbatches": 4,
"placement_strategy": "spread",
"pipeline": "interleaved",
"optimize": "speed",
"partitions": 4,
"ddp": True,
}
}
distribution={
"smdistributed": {"modelparallel": smp_options},
"mpi": mpi_options
}
# create the Estimator
huggingface_estimator = HuggingFace(
entry_point='train.py',
source_dir='./scripts',
instance_type='ml.p3dn.24xlarge',
instance_count=2,
role=role,
transformers_version='4.4.2',
pytorch_version='1.6.0',
py_version='py36',
hyperparameters = hyperparameters,
distribution = distribution
)
```
### Spot Instances
With the creation of HuggingFace Framework extension for the SageMaker Python SDK we can also leverage the benefit of [fully-managed EC2 spot instances](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html) and save up to 90% of our training cost.
_Note: Unless your training job completes quickly, we recommend you use [checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/model-checkpoints.html) with managed spot training, therefore you need to define the `checkpoint_s3_uri`._
To use spot instances with the `HuggingFace` Estimator we have to set the `use_spot_instances` parameter to `True` and define your `max_wait` and `max_run` time. You can read more about the [managed spot training lifecycle here](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html).
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/05_spot_instances/sagemaker-notebook.ipynb)
```python
# hyperparameters, which are passed into the training job
hyperparameters={'epochs': 1,
'train_batch_size': 32,
'model_name':'distilbert-base-uncased',
'output_dir':'/opt/ml/checkpoints'
}
# create the Estimator
huggingface_estimator = HuggingFace(
entry_point='train.py',
source_dir='./scripts',
instance_type='ml.p3.2xlarge',
instance_count=1,
checkpoint_s3_uri=f's3://{sess.default_bucket()}/checkpoints'
use_spot_instances=True,
max_wait=3600, # This should be equal to or greater than max_run in seconds'
max_run=1000,
role=role,
transformers_version='4.4',
pytorch_version='1.6',
py_version='py36',
hyperparameters = hyperparameters
)
# Training seconds: 874
# Billable seconds: 262
# Managed Spot Training savings: 70.0%
```
### Git Repository
When you create a `HuggingFace` Estimator, you can specify a [training script that is stored in a GitHub repository](https://sagemaker.readthedocs.io/en/stable/overview.html#use-scripts-stored-in-a-git-repository) as the entry point for the estimator, so that you dont have to download the scripts locally. If Git support is enabled, the `entry_point` and `source_dir` should be relative paths in the Git repo if provided.
If you are using `git_config` to run the [🤗 Transformers examples scripts](https://github.com/huggingface/transformers/tree/master/examples) keep in mind that you need to configure the right `'branch'` for you `transformers_version`, e.g. if you use `transformers_version='4.4.2` you have to use `'branch':'v4.4.2'`.
As an example to use `git_config` with an [example script from the transformers repository](https://github.com/huggingface/transformers/tree/master/examples/text-classification).
_Tip: define `output_dir` as `/opt/ml/model` in the hyperparameter for the script to save your model to S3 after training._
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb)
```python
# configure git settings
git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.4.2'} # v4.4.2 is referring to the `transformers_version you use in the estimator.
# create the Estimator
huggingface_estimator = HuggingFace(
entry_point='run_glue.py',
source_dir='./examples/text-classification',
git_config=git_config,
instance_type='ml.p3.2xlarge',
instance_count=1,
role=role,
transformers_version='4.4',
pytorch_version='1.6',
py_version='py36',
hyperparameters=hyperparameters
)
```
### SageMaker Metrics
[SageMaker Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html#define-train-metrics) can automatically parse the logs for metrics and send those metrics to CloudWatch. If you want SageMaker to parse logs you have to specify the metrics that you want SageMaker to send to CloudWatch when you configure the training job. You specify the name of the metrics that you want to send and the regular expressions that SageMaker uses to parse the logs that your algorithm emits to find those metrics.
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb)
```python
# define metrics definitions
metric_definitions = [
{"Name": "train_runtime", "Regex": "train_runtime.*=\D*(.*?)$"},
{"Name": "eval_accuracy", "Regex": "eval_accuracy.*=\D*(.*?)$"},
{"Name": "eval_loss", "Regex": "eval_loss.*=\D*(.*?)$"},
]
# create the Estimator
huggingface_estimator = HuggingFace(
entry_point='train.py',
source_dir='./scripts',
instance_type='ml.p3.2xlarge',
instance_count=1,
role=role,
transformers_version='4.4',
pytorch_version='1.6',
py_version='py36',
metric_definitions=metric_definitions,
hyperparameters = hyperparameters)
```
## Additional Resources
- [Announcement Blog Post](https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face)
- [AWS and Hugging Face collaborate to simplify and accelerate adoption of natural language processing](https://aws.amazon.com/blogs/machine-learning/aws-and-hugging-face-collaborate-to-simplify-and-accelerate-adoption-of-natural-language-processing-models/)
- [Amazon SageMaker documentation for Hugging Face](https://docs.aws.amazon.com/sagemaker/latest/dg/hugging-face.html)
- [SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html)

View File

@@ -54,11 +54,12 @@ Sequence Classification
Sequence classification is the task of classifying sequences according to a given number of classes. An example of
sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune a
model on a GLUE sequence classification task, you may leverage the :prefix_link:`run_glue.py
<examples/text-classification/run_glue.py>`, :prefix_link:`run_tf_glue.py
<examples/text-classification/run_tf_glue.py>`, :prefix_link:`run_tf_text_classification.py
<examples/text-classification/run_tf_text_classification.py>` or :prefix_link:`run_xnli.py
<examples/text-classification/run_xnli.py>` scripts.
model on a GLUE sequence classification task, you may leverage the `run_glue.py
<https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_glue.py>`__ and
`run_pl_glue.py
<https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_pl_glue.py>`__ or
`run_tf_glue.py
<https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_tf_glue.py>`__ scripts.
Here is an example of using pipelines to do sentiment analysis: identifying if a sequence is positive or negative. It
leverages a fine-tuned model on sst2, which is a GLUE task.
@@ -167,8 +168,9 @@ Extractive Question Answering
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune a
model on a SQuAD task, you may leverage the `run_qa.py
<https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_qa.py>`__ and `run_tf_squad.py
model on a SQuAD task, you may leverage the `run_squad.py
<https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_squad.py>`__ and
`run_tf_squad.py
<https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_tf_squad.py>`__ scripts.
@@ -240,6 +242,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
... inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
... input_ids = inputs["input_ids"].tolist()[0]
...
... text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
... outputs = model(**inputs)
... answer_start_scores = outputs.start_logits
... answer_end_scores = outputs.end_logits
@@ -283,6 +286,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
... inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf")
... input_ids = inputs["input_ids"].numpy()[0]
...
... text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
... outputs = model(inputs)
... answer_start_scores = outputs.start_logits
... answer_end_scores = outputs.end_logits
@@ -324,9 +328,7 @@ Masked language modeling is the task of masking tokens in a sequence with a mask
fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis for
downstream tasks requiring bi-directional context, such as SQuAD (question answering, see `Lewis, Lui, Goyal et al.
<https://arxiv.org/abs/1910.13461>`__, part 4.2). If you would like to fine-tune a model on a masked language modeling
task, you may leverage the `run_mlm.py
<https://github.com/huggingface/transformers/tree/master/examples/language-modeling/run_mlm.py>`__ script.
<https://arxiv.org/abs/1910.13461>`__, part 4.2).
Here is an example of using pipelines to replace a mask from a sequence:
@@ -434,8 +436,7 @@ Causal Language Modeling
Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
for generation tasks. If you would like to fine-tune a model on a causal language modeling task, you may leverage the
`run_clm.py <https://github.com/huggingface/transformers/tree/master/examples/language-modeling/run_clm.py>`__ script.
for generation tasks.
Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the
input sequence.
@@ -454,7 +455,7 @@ of tokens.
>>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
>>> model = AutoModelWithLMHead.from_pretrained("gpt2")
>>> sequence = f"Hugging Face is based in DUMBO, New York City, and"
>>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
>>> input_ids = tokenizer.encode(sequence, return_tensors="pt")
@@ -603,7 +604,11 @@ Named Entity Recognition (NER) is the task of classifying tokens according to a
as a person, an organisation or a location. An example of a named entity recognition dataset is the CoNLL-2003 dataset,
which is entirely based on that task. If you would like to fine-tune a model on an NER task, you may leverage the
`run_ner.py <https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_ner.py>`__
script.
(PyTorch), `run_pl_ner.py
<https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_pl_ner.py>`__ (leveraging
pytorch-lightning) or the `run_tf_ner.py
<https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_tf_ner.py>`__ (TensorFlow)
scripts.
Here is an example of using pipelines to do named entity recognition, specifically, trying to identify tokens as
belonging to one of 9 classes:
@@ -741,9 +746,7 @@ token. The following array should be the output:
Summarization
-----------------------------------------------------------------------------------------------------------------------
Summarization is the task of summarizing a document or an article into a shorter text. If you would like to fine-tune a
model on a summarization task, you may leverage the `run_summarization.py
<https://github.com/huggingface/transformers/tree/master/examples/seq2seq/run_summarization.py>`__ script.
Summarization is the task of summarizing a document or an article into a shorter text.
An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was
created for the task of summarization. If you would like to fine-tune a model on a summarization task, various
@@ -821,9 +824,7 @@ CNN / Daily Mail), it yields very good results.
Translation
-----------------------------------------------------------------------------------------------------------------------
Translation is the task of translating a text from one language to another. If you would like to fine-tune a model on a
translation task, you may leverage the `run_translation.py
<https://github.com/huggingface/transformers/tree/master/examples/seq2seq/run_translation.py>`__ script.
Translation is the task of translating a text from one language to another.
An example of a translation dataset is the WMT English to German dataset, which has sentences in English as the input
data and the corresponding sentences in German as the target data. If you would like to fine-tune a model on a

View File

@@ -151,6 +151,7 @@ As mentioned earlier you can see what tests are contained inside the ``Optimizat
pytest tests/test_optimization.py::OptimizationTest --collect-only -q
You can run tests by keyword expressions.
To run only tests whose name contains ``adam``:
@@ -159,9 +160,6 @@ To run only tests whose name contains ``adam``:
pytest -k adam tests/test_optimization.py
Logical ``and`` and ``or`` can be used to indicate whether all keywords should match or either. ``not`` can be used to
negate.
To run all tests except those whose name contains ``adam``:
.. code-block:: bash
@@ -170,24 +168,11 @@ To run all tests except those whose name contains ``adam``:
And you can combine the two patterns in one:
.. code-block:: bash
pytest -k "ada and not adam" tests/test_optimization.py
For example to run both ``test_adafactor`` and ``test_adam_w`` you can use:
.. code-block:: bash
pytest -k "test_adam_w or test_adam_w" tests/test_optimization.py
Note that we use ``or`` here, since we want either of the keywords to match to include both.
If you want to include only tests that include both patterns, ``and`` is to be used:
.. code-block:: bash
pytest -k "test and ada" tests/test_optimization.py
Run only modified tests

View File

@@ -1,5 +1,6 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
@@ -15,13 +16,8 @@ limitations under the License.
# Examples
This folder contains actively maintained examples of use of 🤗 Transformers organized along NLP tasks. If you are looking for an example that used to be in this folder, it may have moved to our [research projects](https://github.com/huggingface/transformers/tree/master/examples/research_projects) subfolder (which contains frozen snapshots of research projects) or to the [legacy](https://github.com/huggingface/transformers/tree/master/examples/legacy) subfolder.
While we strive to present as many use cases as possible, the scripts in this folder are just examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, all the PyTorch versions of the examples fully expose the preprocessing of the data. This way, you can easily tweak them.
This is similar if you want the scripts to report another metric than the one they currently use: look at the `compute_metrics` function inside the script. It takes the full arrays of predictions and labels and has to return a dictionary of string keys and float values. Just change it to add (or replace) your own metric to the ones already reported.
Please discuss on the [forum](https://discuss.huggingface.co/) or in an [issue](https://github.com/huggingface/transformers/issues) a feature you would like to implement in an example before submitting a PR: we welcome bug fixes but since we want to keep the examples as simple as possible, it's unlikely we will merge a pull request adding more functionality at the cost of readability.
This folder contains actively maintained examples of use of 🤗 Transformers organized along NLP tasks. If you are looking for an example that used to
be in this folder, it may have moved to our [research projects](https://github.com/huggingface/transformers/tree/master/examples/research_projects) subfolder (which contains frozen snapshots of research projects).
## Important note
@@ -38,43 +34,10 @@ Then cd in the example folder of your choice and run
pip install -r requirements.txt
```
To browse the examples corresponding to released versions of 🤗 Transformers, click on the line below and then on your desired version of the library:
<details>
<summary>Examples for older versions of 🤗 Transformers</summary>
- [v4.3.3](https://github.com/huggingface/transformers/tree/v4.3.3/examples)
- [v4.2.2](https://github.com/huggingface/transformers/tree/v4.2.2/examples)
- [v4.1.1](https://github.com/huggingface/transformers/tree/v4.1.1/examples)
- [v4.0.1](https://github.com/huggingface/transformers/tree/v4.0.1/examples)
- [v3.5.1](https://github.com/huggingface/transformers/tree/v3.5.1/examples)
- [v3.4.0](https://github.com/huggingface/transformers/tree/v3.4.0/examples)
- [v3.3.1](https://github.com/huggingface/transformers/tree/v3.3.1/examples)
- [v3.2.0](https://github.com/huggingface/transformers/tree/v3.2.0/examples)
- [v3.1.0](https://github.com/huggingface/transformers/tree/v3.1.0/examples)
- [v3.0.2](https://github.com/huggingface/transformers/tree/v3.0.2/examples)
- [v2.11.0](https://github.com/huggingface/transformers/tree/v2.11.0/examples)
- [v2.10.0](https://github.com/huggingface/transformers/tree/v2.10.0/examples)
- [v2.9.1](https://github.com/huggingface/transformers/tree/v2.9.1/examples)
- [v2.8.0](https://github.com/huggingface/transformers/tree/v2.8.0/examples)
- [v2.7.0](https://github.com/huggingface/transformers/tree/v2.7.0/examples)
- [v2.6.0](https://github.com/huggingface/transformers/tree/v2.6.0/examples)
- [v2.5.1](https://github.com/huggingface/transformers/tree/v2.5.1/examples)
- [v2.4.0](https://github.com/huggingface/transformers/tree/v2.4.0/examples)
- [v2.3.0](https://github.com/huggingface/transformers/tree/v2.3.0/examples)
- [v2.2.0](https://github.com/huggingface/transformers/tree/v2.2.0/examples)
- [v2.1.1](https://github.com/huggingface/transformers/tree/v2.1.0/examples)
- [v2.0.0](https://github.com/huggingface/transformers/tree/v2.0.0/examples)
- [v1.2.0](https://github.com/huggingface/transformers/tree/v1.2.0/examples)
- [v1.1.0](https://github.com/huggingface/transformers/tree/v1.1.0/examples)
- [v1.0.0](https://github.com/huggingface/transformers/tree/v1.0.0/examples)
</details>
Alternatively, you can find switch your cloned 🤗 Transformers to a specific version (for instance with v3.5.1) with
Alternatively, you can run the version of the examples as they were for your current version of Transformers via (for instance with v3.5.1):
```bash
git checkout tags/v3.5.1
```
and run the example command as usual afterward.
## The Big Table of Tasks
@@ -90,47 +53,21 @@ Coming soon!
| Task | Example datasets | Trainer support | TFTrainer support | 🤗 Datasets | Colab
|---|---|:---:|:---:|:---:|:---:|
| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling) | WikiText-2 | ✅ | - | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling.ipynb)
| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice) | SWAG | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multiple_choice.ipynb)
| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering) | SQuAD | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb)
| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq) | XSum | ✅ | - | | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/summarization.ipynb)
| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification) | GLUE | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb)
| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling) | Raw text | ✅ | - | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)
| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice) | SWAG, RACE, ARC | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering) | SQuAD | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb)
| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq) | CNN/Daily Mail | ✅ | - | - | -
| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification) | GLUE, XNLI | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb)
| [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation) | - | n/a | n/a | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
| [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb)
| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq) | WMT | ✅ | - | | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/translation.ipynb)
| [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb)
| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq) | WMT | ✅ | - | - | -
## Running quick tests
Most examples are equipped with a mechanism to truncate the number of dataset samples to the desired length. This is useful for debugging purposes, for example to quickly check that all stages of the programs can complete, before running the same setup on the full dataset which may take hours to complete.
For example here is how to truncate all three splits to just 50 samples each:
```
examples/token-classification/run_ner.py \
--max_train_samples 50 \
--max_val_samples 50 \
--max_test_samples 50 \
[...]
```
Most example scripts should have the first two command line arguments and some have the third one. You can quickly check if a given example supports any of these by passing a `-h` option, e.g.:
```
examples/token-classification/run_ner.py -h
```
## Resuming training
You can resume training from a previous checkpoint like this:
1. Pass `--output_dir previous_output_dir` without `--overwrite_output_dir` to resume training from the latest checkpoint in `output_dir` (what you would use if the training was interrupted, for instance).
2. Pass `--model_name_or_path path_to_a_specific_checkpoint` to resume training from that checkpoint folder.
Should you want to turn an example into a notebook where you'd no longer have access to the command
line, 🤗 Trainer supports resuming from a checkpoint via `trainer.train(resume_from_checkpoint)`.
1. If `resume_from_checkpoint` is `True` it will look for the last checkpoint in the value of `output_dir` passed via `TrainingArguments`.
2. If `resume_from_checkpoint` is a path to a specific checkpoint it will use that saved checkpoint folder to resume the training from.
<!--
## One-click Deploy to Cloud (wip)
**Coming soon!**
-->
## Distributed training and mixed precision
@@ -141,7 +78,7 @@ use the following command:
```bash
python -m torch.distributed.launch \
--nproc_per_node number_of_gpu_you_have path_to_script.py \
--all_arguments_of_the_script
--all_arguments_of_the_script
```
As an example, here is how you would fine-tune the BERT large model (with whole word masking) on the text
@@ -185,7 +122,7 @@ regular training script with its arguments (this is similar to the `torch.distri
```bash
python xla_spawn.py --num_cores num_tpu_you_have \
path_to_script.py \
--all_arguments_of_the_script
--all_arguments_of_the_script
```
As an example, here is how you would fine-tune the BERT large model (with whole word masking) on the text
@@ -210,7 +147,7 @@ python xla_spawn.py --num_cores 8 \
You can easily log and monitor your runs code. The following are currently supported:
* [TensorBoard](https://www.tensorflow.org/tensorboard)
* [Weights & Biases](https://docs.wandb.ai/integrations/huggingface)
* [Weights & Biases](https://docs.wandb.com/library/integrations/huggingface)
* [Comet ML](https://www.comet.ml/docs/python-sdk/huggingface/)
### Weights & Biases
@@ -234,23 +171,9 @@ import wandb
wandb.login()
```
To enable logging to W&B, include `"wandb"` in the `report_to` of your `TrainingArguments` or script. Or just pass along `--report_to all` if you have `wandb` installed.
Whenever you use `Trainer` or `TFTrainer` classes, your losses, evaluation metrics, model topology and gradients (for `Trainer` only) will automatically be logged.
Advanced configuration is possible by setting environment variables:
| Environment Variable | Value |
|---|---|
| WANDB_LOG_MODEL | Log the model as artifact (log the model as artifact at the end of training (`false` by default) |
| WANDB_WATCH | one of `gradients` (default) to log histograms of gradients, `all` to log histograms of both gradients and parameters, or `false` for no histogram logging |
| WANDB_PROJECT | Organize runs by project |
Set run names with `run_name` argument present in scripts or as part of `TrainingArguments`.
Additional configuration options are available through generic [wandb environment variables](https://docs.wandb.com/library/environment-variables).
Refer to related [documentation & examples](https://docs.wandb.ai/integrations/huggingface).
When using 🤗 Transformers with PyTorch Lightning, runs can be tracked through `WandbLogger`. Refer to related [documentation & examples](https://docs.wandb.com/library/integrations/lightning).
### Comet.ml

View File

@@ -2,7 +2,7 @@ tensorboard
scikit-learn
seqeval
psutil
sacrebleu >= 1.4.12
sacrebleu
rouge-score
tensorflow_datasets
matplotlib

1
examples/benchmarking/run_benchmark.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.

1
examples/benchmarking/run_benchmark_tf.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.

View File

@@ -22,11 +22,12 @@ ALBERT, BERT, DistilBERT, RoBERTa, XLNet... GPT and GPT-2 are trained or fine-tu
loss. XLNet uses permutation language modeling (PLM), you can find more information about the differences between those
objectives in our [model summary](https://huggingface.co/transformers/model_summary.html).
There are two sets of scripts provided. The first set leverages the Trainer API. The second set with `no_trainer` in the suffix uses a custom training loop and leverages the 🤗 Accelerate library . Both sets use the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.
These scripts leverage the 🤗 Datasets library and the Trainer API. You can easily customize them to your needs if you
need extra processing on your datasets.
**Note:** The old script `run_language_modeling.py` is still available [here](https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py).
The following examples, will run on datasets hosted on our [hub](https://huggingface.co/datasets) or with your own
The following examples, will run on a datasets hosted on our [hub](https://huggingface.co/datasets) or with your own
text files for training and validation. We give examples of both below.
### GPT-2/GPT and causal language modeling
@@ -59,15 +60,6 @@ python run_clm.py \
--output_dir /tmp/test-clm
```
This uses the built in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_clm_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below:
```bash
python run_clm_no_trainer.py \
--dataset_name wikitext \
--dataset_config_name wikitext-2-raw-v1 \
--model_name_or_path gpt2 \
--output_dir /tmp/test-clm
```
### RoBERTa/BERT/DistilBERT and masked language modeling
@@ -103,33 +95,88 @@ python run_mlm.py \
If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script
concatenates all texts and then splits them in blocks of the same length).
This uses the built in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_mlm_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below:
```bash
python run_mlm_no_trainer.py \
--dataset_name wikitext \
--dataset_config_name wikitext-2-raw-v1 \
--model_name_or_path roberta-base \
--output_dir /tmp/test-mlm
```
**Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make
sure all your batches have the same length.
### Whole word masking
This part was moved to `examples/research_projects/mlm_wwm`.
The BERT authors released a new version of BERT using Whole Word Masking in May 2019. Instead of masking randomly
selected tokens (which may be part of words), they mask randomly selected words (masking all the tokens corresponding
to that word). This technique has been refined for Chinese in [this paper](https://arxiv.org/abs/1906.08101).
To fine-tune a model using whole word masking, use the following script:
```bash
python run_mlm_wwm.py \
--model_name_or_path roberta-base \
--dataset_name wikitext \
--dataset_config_name wikitext-2-raw-v1 \
--do_train \
--do_eval \
--output_dir /tmp/test-mlm-wwm
```
For Chinese models, we need to generate a reference files (which requires the ltp library), because it's tokenized at
the character level.
**Q :** Why a reference file?
**A :** Suppose we have a Chinese sentence like: `我喜欢你` The original Chinese-BERT will tokenize it as
`['我','喜','欢','你']` (character level). But `喜欢` is a whole word. For whole word masking proxy, we need a result
like `['我','喜','##欢','你']`, so we need a reference file to tell the model which position of the BERT original token
should be added `##`.
**Q :** Why LTP ?
**A :** Cause the best known Chinese WWM BERT is [Chinese-BERT-wwm](https://github.com/ymcui/Chinese-BERT-wwm) by HIT.
It works well on so many Chines Task like CLUE (Chinese GLUE). They use LTP, so if we want to fine-tune their model,
we need LTP.
Now LTP only only works well on `transformers==3.2.0`. So we don't add it to requirements.txt.
You need to create a separate environment with this version of Transformers to run the `run_chinese_ref.py` script that
will create the reference files. The script is in `examples/contrib`. Once in the proper environment, run the
following:
```bash
export TRAIN_FILE=/path/to/dataset/wiki.train.raw
export LTP_RESOURCE=/path/to/ltp/tokenizer
export BERT_RESOURCE=/path/to/bert/tokenizer
export SAVE_PATH=/path/to/data/ref.txt
python examples/contrib/run_chinese_ref.py \
--file_name=path_to_train_or_eval_file \
--ltp=path_to_ltp_tokenizer \
--bert=path_to_bert_tokenizer \
--save_path=path_to_reference_file
```
Then you can run the script like this:
```bash
python run_mlm_wwm.py \
--model_name_or_path roberta-base \
--train_file path_to_train_file \
--validation_file path_to_validation_file \
--train_ref_file path_to_train_chinese_ref_file \
--validation_ref_file path_to_validation_chinese_ref_file \
--do_train \
--do_eval \
--output_dir /tmp/test-mlm-wwm
```
**Note:** On TPU, you should the flag `--pad_to_max_length` to make sure all your batches have the same length.
### XLNet and permutation language modeling
XLNet uses a different training objective, which is permutation language modeling. It is an autoregressive method
to learn bidirectional contexts by maximizing the expected likelihood over all permutations of the input
XLNet uses a different training objective, which is permutation language modeling. It is an autoregressive method
to learn bidirectional contexts by maximizing the expected likelihood over all permutations of the input
sequence factorization order.
We use the `--plm_probability` flag to define the ratio of length of a span of masked tokens to surrounding
We use the `--plm_probability` flag to define the ratio of length of a span of masked tokens to surrounding
context length for permutation language modeling.
The `--max_span_length` flag may also be used to limit the length of a span of masked tokens used
The `--max_span_length` flag may also be used to limit the length of a span of masked tokens used
for permutation language modeling.
Here is how to fine-tune XLNet on wikitext-2:

127
examples/language-modeling/run_clm.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
@@ -43,13 +42,9 @@ from transformers import (
default_data_collator,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version
from transformers.trainer_utils import is_main_process
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.5.0")
logger = logging.getLogger(__name__)
@@ -118,21 +113,6 @@ class DataTrainingArguments:
default=None,
metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_val_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
},
)
block_size: Optional[int] = field(
default=None,
metadata={
@@ -180,28 +160,23 @@ def main():
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty."
"Use --overwrite_output_dir to overcome."
)
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
)
logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
# Log on each process the small summary:
logger.warning(
@@ -213,7 +188,7 @@ def main():
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}")
logger.info("Training/evaluation parameters %s", training_args)
# Set seed before initializing model.
set_seed(training_args.seed)
@@ -247,11 +222,7 @@ def main():
data_files["train"] = data_args.train_file
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = (
data_args.train_file.split(".")[-1]
if data_args.train_file is not None
else data_args.validation_file.split(".")[-1]
)
extension = data_args.train_file.split(".")[-1]
if extension == "txt":
extension = "text"
datasets = load_dataset(extension, data_files=data_files)
@@ -365,7 +336,6 @@ def main():
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
lm_datasets = tokenized_datasets.map(
group_texts,
batched=True,
@@ -373,26 +343,12 @@ def main():
load_from_cache_file=not data_args.overwrite_cache,
)
if training_args.do_train:
if "train" not in tokenized_datasets:
raise ValueError("--do_train requires a train dataset")
train_dataset = lm_datasets["train"]
if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples))
if training_args.do_eval:
if "validation" not in tokenized_datasets:
raise ValueError("--do_eval requires a validation dataset")
eval_dataset = lm_datasets["validation"]
if data_args.max_val_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
# Initialize our Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset if training_args.do_train else None,
eval_dataset=eval_dataset if training_args.do_eval else None,
train_dataset=lm_datasets["train"] if training_args.do_train else None,
eval_dataset=lm_datasets["validation"] if training_args.do_eval else None,
tokenizer=tokenizer,
# Data collator will default to DataCollatorWithPadding, so we change it.
data_collator=default_data_collator,
@@ -400,39 +356,44 @@ def main():
# Training
if training_args.do_train:
if last_checkpoint is not None:
checkpoint = last_checkpoint
elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint)
model_path = (
model_args.model_name_or_path
if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
else None
)
train_result = trainer.train(model_path=model_path)
trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics
output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
if trainer.is_world_process_zero():
with open(output_train_file, "w") as writer:
logger.info("***** Train results *****")
for key, value in sorted(train_result.metrics.items()):
logger.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")
max_train_samples = (
data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
# Need to save the state, since Trainer.save_model saves only the tokenizer with the model
trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))
# Evaluation
results = {}
if training_args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate()
eval_output = trainer.evaluate()
max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
perplexity = math.exp(metrics["eval_loss"])
metrics["perplexity"] = perplexity
perplexity = math.exp(eval_output["eval_loss"])
results["perplexity"] = perplexity
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
output_eval_file = os.path.join(training_args.output_dir, "eval_results_clm.txt")
if trainer.is_world_process_zero():
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key, value in sorted(results.items()):
logger.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")
return results
def _mp_fn(index):

View File

@@ -1,456 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for causal language modeling (BERT, ALBERT, RoBERTa...)
on a text file or a dataset without using HuggingFace Trainer.
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=causal-lm
"""
# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
import argparse
import logging
import math
import os
import random
import datasets
import torch
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
AdamW,
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
SchedulerType,
default_data_collator,
get_scheduler,
set_seed,
)
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
def parse_args():
parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task")
parser.add_argument(
"--dataset_name",
type=str,
default=None,
help="The name of the dataset to use (via the datasets library).",
)
parser.add_argument(
"--dataset_config_name",
type=str,
default=None,
help="The configuration name of the dataset to use (via the datasets library).",
)
parser.add_argument(
"--train_file", type=str, default=None, help="A csv or a json file containing the training data."
)
parser.add_argument(
"--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
)
parser.add_argument(
"--validation_split_percentage",
default=5,
help="The percentage of the train set used as validation set in case there's no validation split",
)
parser.add_argument(
"--model_name_or_path",
type=str,
help="Path to pretrained model or model identifier from huggingface.co/models.",
required=True,
)
parser.add_argument(
"--config_name",
type=str,
default=None,
help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--tokenizer_name",
type=str,
default=None,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--use_slow_tokenizer",
action="store_true",
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
)
parser.add_argument(
"--per_device_train_batch_size",
type=int,
default=8,
help="Batch size (per device) for the training dataloader.",
)
parser.add_argument(
"--per_device_eval_batch_size",
type=int,
default=8,
help="Batch size (per device) for the evaluation dataloader.",
)
parser.add_argument(
"--learning_rate",
type=float,
default=5e-5,
help="Initial learning rate (after the potential warmup period) to use.",
)
parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
parser.add_argument(
"--max_train_steps",
type=int,
default=None,
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
"--lr_scheduler_type",
type=SchedulerType,
default="linear",
help="The scheduler type to use.",
choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
)
parser.add_argument(
"--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
)
parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--model_type",
type=str,
default=None,
help="Model type to use if training from scratch.",
choices=MODEL_TYPES,
)
parser.add_argument(
"--block_size",
type=int,
default=None,
help="Optional input sequence length after tokenization. The training dataset will be truncated in block of this size for training. Default to the model max input length for single sentence inputs (take into account special tokens).",
)
parser.add_argument(
"--preprocessing_num_workers",
type=int,
default=None,
help="The number of processes to use for the preprocessing.",
)
parser.add_argument(
"--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
)
args = parser.parse_args()
# Sanity checks
if args.dataset_name is None and args.train_file is None and args.validation_file is None:
raise ValueError("Need either a dataset name or a training/validation file.")
else:
if args.train_file is not None:
extension = args.train_file.split(".")[-1]
assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
if args.validation_file is not None:
extension = args.validation_file.split(".")[-1]
assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
def main():
args = parse_args()
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
accelerator = Accelerator()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state)
# Setup logging, we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
# If passed along, set the training seed now.
if args.seed is not None:
set_seed(args.seed)
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
#
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
# 'text' is found. You can easily tweak this behavior (see below).
#
# In distributed training, the load_dataset function guarantee that only one local process can concurrently
# download the dataset.
if args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
args.dataset_name,
args.dataset_config_name,
split=f"train[:{args.validation_split_percentage}%]",
)
raw_datasets["train"] = load_dataset(
args.dataset_name,
args.dataset_config_name,
split=f"train[{args.validation_split_percentage}%:]",
)
else:
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
# Load pretrained model and tokenizer
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
if args.config_name:
config = AutoConfig.from_pretrained(args.config_name)
elif args.model_name_or_path:
config = AutoConfig.from_pretrained(args.model_name_or_path)
else:
config = CONFIG_MAPPING[args.model_type]()
logger.warning("You are instantiating a new config instance from scratch.")
if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
elif args.model_name_or_path:
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
)
if args.model_name_or_path:
model = AutoModelForCausalLM.from_pretrained(
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
)
else:
logger.info("Training new model from scratch")
model = AutoModelForCausalLM.from_config(config)
model.resize_token_embeddings(len(tokenizer))
# Preprocessing the datasets.
# First we tokenize all the texts.
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]
def tokenize_function(examples):
return tokenizer(examples[text_column_name])
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
)
if args.block_size is None:
block_size = tokenizer.model_max_length
if block_size > 1024:
logger.warn(
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
"Picking 1024 instead. You can change that default value by passing --block_size xxx."
)
block_size = 1024
else:
if args.block_size > tokenizer.model_max_length:
logger.warn(
f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
)
block_size = min(args.block_size, tokenizer.model_max_length)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
total_length = (total_length // block_size) * block_size
# Split by chunks of max_len.
result = {
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
for k, t in concatenated_examples.items()
}
result["labels"] = result["input_ids"].copy()
return result
# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
# for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
# to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
lm_datasets = tokenized_datasets.map(
group_texts,
batched=True,
num_proc=args.preprocessing_num_workers,
load_from_cache_file=not args.overwrite_cache,
)
train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
# DataLoaders creation:
train_dataloader = DataLoader(
train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size
)
eval_dataloader = DataLoader(
eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size
)
# Optimizer
# Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": args.weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
# shorter in multiprocess)
# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
else:
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
lr_scheduler = get_scheduler(
name=args.lr_scheduler_type,
optimizer=optimizer,
num_warmup_steps=args.num_warmup_steps,
num_training_steps=args.max_train_steps,
)
# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
for epoch in range(args.num_train_epochs):
model.train()
for step, batch in enumerate(train_dataloader):
outputs = model(**batch)
loss = outputs.loss
loss = loss / args.gradient_accumulation_steps
accelerator.backward(loss)
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
completed_steps += 1
if completed_steps >= args.max_train_steps:
break
model.eval()
losses = []
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size)))
losses = torch.cat(losses)
losses = losses[: len(eval_dataset)]
perplexity = math.exp(torch.mean(losses))
logger.info(f"epoch {epoch}: perplexity: {perplexity}")
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if __name__ == "__main__":
main()

149
examples/language-modeling/run_mlm.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
@@ -43,13 +42,9 @@ from transformers import (
TrainingArguments,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version
from transformers.trainer_utils import is_main_process
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.5.0")
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -150,20 +145,6 @@ class DataTrainingArguments:
"If False, will pad the samples dynamically when batching to the maximum length in the batch."
},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_val_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
},
)
def __post_init__(self):
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -190,28 +171,23 @@ def main():
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty."
"Use --overwrite_output_dir to overcome."
)
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
)
logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
# Log on each process the small summary:
logger.warning(
@@ -223,7 +199,7 @@ def main():
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}")
logger.info("Training/evaluation parameters %s", training_args)
# Set seed before initializing model.
set_seed(training_args.seed)
@@ -321,22 +297,6 @@ def main():
column_names = datasets["validation"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]
if data_args.max_seq_length is None:
max_seq_length = tokenizer.model_max_length
if max_seq_length > 1024:
logger.warn(
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
"Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
)
max_seq_length = 1024
else:
if data_args.max_seq_length > tokenizer.model_max_length:
logger.warn(
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
if data_args.line_by_line:
# When using line_by_line, we just tokenize each nonempty line.
padding = "max_length" if data_args.pad_to_max_length else False
@@ -348,7 +308,7 @@ def main():
examples["text"],
padding=padding,
truncation=True,
max_length=max_seq_length,
max_length=data_args.max_seq_length,
# We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
# receives the `special_tokens_mask`.
return_special_tokens_mask=True,
@@ -376,6 +336,16 @@ def main():
load_from_cache_file=not data_args.overwrite_cache,
)
if data_args.max_seq_length is None:
max_seq_length = tokenizer.model_max_length
else:
if data_args.max_seq_length > tokenizer.model_max_length:
logger.warn(
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples):
@@ -398,7 +368,6 @@ def main():
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
tokenized_datasets = tokenized_datasets.map(
group_texts,
batched=True,
@@ -406,20 +375,6 @@ def main():
load_from_cache_file=not data_args.overwrite_cache,
)
if training_args.do_train:
if "train" not in tokenized_datasets:
raise ValueError("--do_train requires a train dataset")
train_dataset = tokenized_datasets["train"]
if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples))
if training_args.do_eval:
if "validation" not in tokenized_datasets:
raise ValueError("--do_eval requires a validation dataset")
eval_dataset = tokenized_datasets["validation"]
if data_args.max_val_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
# Data collator
# This one will take care of randomly masking the tokens.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
@@ -428,46 +383,52 @@ def main():
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset if training_args.do_train else None,
eval_dataset=eval_dataset if training_args.do_eval else None,
train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
tokenizer=tokenizer,
data_collator=data_collator,
)
# Training
if training_args.do_train:
if last_checkpoint is not None:
checkpoint = last_checkpoint
elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics
max_train_samples = (
data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
model_path = (
model_args.model_name_or_path
if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
else None
)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
train_result = trainer.train(model_path=model_path)
trainer.save_model() # Saves the tokenizer too for easy upload
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
if trainer.is_world_process_zero():
with open(output_train_file, "w") as writer:
logger.info("***** Train results *****")
for key, value in sorted(train_result.metrics.items()):
logger.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")
# Need to save the state, since Trainer.save_model saves only the tokenizer with the model
trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))
# Evaluation
results = {}
if training_args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate()
eval_output = trainer.evaluate()
max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
perplexity = math.exp(metrics["eval_loss"])
metrics["perplexity"] = perplexity
perplexity = math.exp(eval_output["eval_loss"])
results["perplexity"] = perplexity
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm.txt")
if trainer.is_world_process_zero():
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key, value in sorted(results.items()):
logger.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")
return results
def _mp_fn(index):

13
examples/language-modeling/run_mlm_flax.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
@@ -307,7 +306,7 @@ def create_learning_rate_scheduler(
progress = jnp.maximum(0.0, (step - warmup_steps) / float(steps_per_cycle))
ret *= jnp.maximum(0.0, 0.5 * (1.0 + jnp.cos(jnp.pi * (progress % 1.0))))
else:
raise ValueError(f"Unknown factor {name}.")
raise ValueError("Unknown factor %s." % name)
return jnp.asarray(ret, dtype=jnp.float32)
return step_fn
@@ -332,7 +331,9 @@ def accuracy(logits, targets, weights=None):
Tuple of scalar loss and batch normalizing factor.
"""
if logits.ndim != targets.ndim + 1:
raise ValueError(f"Incorrect shapes. Got shape {logits.shape} logits and {targets.shape} targets")
raise ValueError(
"Incorrect shapes. Got shape %s logits and %s targets" % (str(logits.shape), str(targets.shape))
)
loss = jnp.equal(jnp.argmax(logits, axis=-1), targets)
loss *= weights
@@ -351,7 +352,9 @@ def cross_entropy(logits, targets, weights=None, label_smoothing=0.0):
Tuple of scalar loss and batch normalizing factor.
"""
if logits.ndim != targets.ndim + 1:
raise ValueError(f"Incorrect shapes. Got shape {logits.shape} logits and {targets.shape} targets")
raise ValueError(
"Incorrect shapes. Got shape %s logits and %s targets" % (str(logits.shape), str(targets.shape))
)
vocab_size = logits.shape[-1]
confidence = 1.0 - label_smoothing
@@ -459,7 +462,7 @@ if __name__ == "__main__":
)
# Set the verbosity to info of the Transformers logger (on main process only):
logger.info(f"Training/evaluation parameters {training_args}")
logger.info("Training/evaluation parameters %s", training_args)
# Set seed before initializing model.
set_seed(training_args.seed)

View File

@@ -1,500 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...)
on a text file or a dataset without using HuggingFace Trainer.
Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
https://huggingface.co/models?filter=masked-lm
"""
# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
import argparse
import logging
import math
import os
import random
import datasets
import torch
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
AdamW,
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
SchedulerType,
get_scheduler,
set_seed,
)
logger = logging.getLogger(__name__)
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
def parse_args():
parser = argparse.ArgumentParser(description="Finetune a transformers model on a Masked Language Modeling task")
parser.add_argument(
"--dataset_name",
type=str,
default=None,
help="The name of the dataset to use (via the datasets library).",
)
parser.add_argument(
"--dataset_config_name",
type=str,
default=None,
help="The configuration name of the dataset to use (via the datasets library).",
)
parser.add_argument(
"--train_file", type=str, default=None, help="A csv or a json file containing the training data."
)
parser.add_argument(
"--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
)
parser.add_argument(
"--validation_split_percentage",
default=5,
help="The percentage of the train set used as validation set in case there's no validation split",
)
parser.add_argument(
"--pad_to_max_length",
action="store_true",
help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
)
parser.add_argument(
"--model_name_or_path",
type=str,
help="Path to pretrained model or model identifier from huggingface.co/models.",
required=True,
)
parser.add_argument(
"--config_name",
type=str,
default=None,
help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--tokenizer_name",
type=str,
default=None,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--use_slow_tokenizer",
action="store_true",
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
)
parser.add_argument(
"--per_device_train_batch_size",
type=int,
default=8,
help="Batch size (per device) for the training dataloader.",
)
parser.add_argument(
"--per_device_eval_batch_size",
type=int,
default=8,
help="Batch size (per device) for the evaluation dataloader.",
)
parser.add_argument(
"--learning_rate",
type=float,
default=5e-5,
help="Initial learning rate (after the potential warmup period) to use.",
)
parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
parser.add_argument(
"--max_train_steps",
type=int,
default=None,
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
"--lr_scheduler_type",
type=SchedulerType,
default="linear",
help="The scheduler type to use.",
choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
)
parser.add_argument(
"--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
)
parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--model_type",
type=str,
default=None,
help="Model type to use if training from scratch.",
choices=MODEL_TYPES,
)
parser.add_argument(
"--max_seq_length",
type=int,
default=None,
help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated.",
)
parser.add_argument(
"--line_by_line",
type=bool,
default=False,
help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
)
parser.add_argument(
"--preprocessing_num_workers",
type=int,
default=None,
help="The number of processes to use for the preprocessing.",
)
parser.add_argument(
"--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
)
parser.add_argument(
"--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
)
args = parser.parse_args()
# Sanity checks
if args.dataset_name is None and args.train_file is None and args.validation_file is None:
raise ValueError("Need either a dataset name or a training/validation file.")
else:
if args.train_file is not None:
extension = args.train_file.split(".")[-1]
assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
if args.validation_file is not None:
extension = args.validation_file.split(".")[-1]
assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
def main():
args = parse_args()
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
accelerator = Accelerator()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state)
# Setup logging, we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
# If passed along, set the training seed now.
if args.seed is not None:
set_seed(args.seed)
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
#
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
# 'text' is found. You can easily tweak this behavior (see below).
#
# In distributed training, the load_dataset function guarantee that only one local process can concurrently
# download the dataset.
if args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
args.dataset_name,
args.dataset_config_name,
split=f"train[:{args.validation_split_percentage}%]",
)
raw_datasets["train"] = load_dataset(
args.dataset_name,
args.dataset_config_name,
split=f"train[{args.validation_split_percentage}%:]",
)
else:
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
# Load pretrained model and tokenizer
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
if args.config_name:
config = AutoConfig.from_pretrained(args.config_name)
elif args.model_name_or_path:
config = AutoConfig.from_pretrained(args.model_name_or_path)
else:
config = CONFIG_MAPPING[args.model_type]()
logger.warning("You are instantiating a new config instance from scratch.")
if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
elif args.model_name_or_path:
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
)
if args.model_name_or_path:
model = AutoModelForMaskedLM.from_pretrained(
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
)
else:
logger.info("Training new model from scratch")
model = AutoModelForMaskedLM.from_config(config)
model.resize_token_embeddings(len(tokenizer))
# Preprocessing the datasets.
# First we tokenize all the texts.
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]
if args.max_seq_length is None:
max_seq_length = tokenizer.model_max_length
if max_seq_length > 1024:
logger.warn(
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
"Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
)
max_seq_length = 1024
else:
if args.max_seq_length > tokenizer.model_max_length:
logger.warn(
f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)
if args.line_by_line:
# When using line_by_line, we just tokenize each nonempty line.
padding = "max_length" if args.pad_to_max_length else False
def tokenize_function(examples):
# Remove empty lines
examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
return tokenizer(
examples["text"],
padding=padding,
truncation=True,
max_length=max_seq_length,
# We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
# receives the `special_tokens_mask`.
return_special_tokens_mask=True,
)
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=[text_column_name],
load_from_cache_file=not args.overwrite_cache,
)
else:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
# efficient when it receives the `special_tokens_mask`.
def tokenize_function(examples):
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
tokenized_datasets = raw_datasets.map(
tokenize_function,
batched=True,
num_proc=args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not args.overwrite_cache,
)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
total_length = (total_length // max_seq_length) * max_seq_length
# Split by chunks of max_len.
result = {
k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
for k, t in concatenated_examples.items()
}
return result
# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
tokenized_datasets = tokenized_datasets.map(
group_texts,
batched=True,
num_proc=args.preprocessing_num_workers,
load_from_cache_file=not args.overwrite_cache,
)
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
# Data collator
# This one will take care of randomly masking the tokens.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=args.mlm_probability)
# DataLoaders creation:
train_dataloader = DataLoader(
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
# Optimizer
# Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": args.weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
# shorter in multiprocess)
# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
else:
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
lr_scheduler = get_scheduler(
name=args.lr_scheduler_type,
optimizer=optimizer,
num_warmup_steps=args.num_warmup_steps,
num_training_steps=args.max_train_steps,
)
# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
for epoch in range(args.num_train_epochs):
model.train()
for step, batch in enumerate(train_dataloader):
outputs = model(**batch)
loss = outputs.loss
loss = loss / args.gradient_accumulation_steps
accelerator.backward(loss)
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
completed_steps += 1
if completed_steps >= args.max_train_steps:
break
model.eval()
losses = []
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size)))
losses = torch.cat(losses)
losses = losses[: len(eval_dataset)]
perplexity = math.exp(torch.mean(losses))
logger.info(f"epoch {epoch}: perplexity: {perplexity}")
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if __name__ == "__main__":
main()

View File

@@ -44,7 +44,7 @@ from transformers import (
TrainingArguments,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.trainer_utils import is_main_process
logger = logging.getLogger(__name__)
@@ -184,28 +184,23 @@ def main():
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty."
"Use --overwrite_output_dir to overcome."
)
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
)
logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
# Log on each process the small summary:
logger.warning(
@@ -337,10 +332,6 @@ def main():
tokenized_datasets["validation"] = add_chinese_references(
tokenized_datasets["validation"], data_args.validation_ref_file
)
# If we have ref files, need to avoid it removed by trainer
has_ref = data_args.train_ref_file or data_args.validation_ref_file
if has_ref:
training_args.remove_unused_columns = False
# Data collator
# This one will take care of randomly masking the tokens.
@@ -358,13 +349,12 @@ def main():
# Training
if training_args.do_train:
if last_checkpoint is not None:
checkpoint = last_checkpoint
elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint)
model_path = (
model_args.model_name_or_path
if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
else None
)
train_result = trainer.train(model_path=model_path)
trainer.save_model() # Saves the tokenizer too for easy upload
output_train_file = os.path.join(training_args.output_dir, "train_results.txt")

137
examples/language-modeling/run_plm.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
@@ -39,13 +38,9 @@ from transformers import (
XLNetLMHeadModel,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version
from transformers.trainer_utils import is_main_process
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.5.0")
logger = logging.getLogger(__name__)
@@ -147,20 +142,6 @@ class DataTrainingArguments:
"If False, will pad the samples dynamically when batching to the maximum length in the batch."
},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_val_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
},
)
def __post_init__(self):
if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -187,28 +168,23 @@ def main():
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty."
"Use --overwrite_output_dir to overcome."
)
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
)
logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
# Log on each process the small summary:
logger.warning(
@@ -220,7 +196,7 @@ def main():
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}")
logger.info("Training/evaluation parameters %s", training_args)
# Set seed before initializing model.
set_seed(training_args.seed)
@@ -318,13 +294,6 @@ def main():
column_names = datasets["validation"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]
if data_args.max_seq_length > tokenizer.model_max_length:
logger.warn(
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
if data_args.line_by_line:
# When using line_by_line, we just tokenize each nonempty line.
padding = "max_length" if data_args.pad_to_max_length else False
@@ -332,7 +301,7 @@ def main():
def tokenize_function(examples):
# Remove empty lines
examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)
tokenized_datasets = datasets.map(
tokenize_function,
@@ -354,6 +323,13 @@ def main():
load_from_cache_file=not data_args.overwrite_cache,
)
if data_args.max_seq_length > tokenizer.model_max_length:
logger.warn(
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples):
@@ -376,7 +352,6 @@ def main():
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
tokenized_datasets = tokenized_datasets.map(
group_texts,
batched=True,
@@ -384,20 +359,6 @@ def main():
load_from_cache_file=not data_args.overwrite_cache,
)
if training_args.do_train:
if "train" not in tokenized_datasets:
raise ValueError("--do_train requires a train dataset")
train_dataset = tokenized_datasets["train"]
if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples))
if training_args.do_eval:
if "validation" not in tokenized_datasets:
raise ValueError("--do_eval requires a validation dataset")
eval_dataset = tokenized_datasets["validation"]
if data_args.max_val_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
# Data collator
data_collator = DataCollatorForPermutationLanguageModeling(
tokenizer=tokenizer,
@@ -409,46 +370,52 @@ def main():
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset if training_args.do_train else None,
eval_dataset=eval_dataset if training_args.do_eval else None,
train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
tokenizer=tokenizer,
data_collator=data_collator,
)
# Training
if training_args.do_train:
if last_checkpoint is not None:
checkpoint = last_checkpoint
elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics
max_train_samples = (
data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
model_path = (
model_args.model_name_or_path
if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
else None
)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
train_result = trainer.train(model_path=model_path)
trainer.save_model() # Saves the tokenizer too for easy upload
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
if trainer.is_world_process_zero():
with open(output_train_file, "w") as writer:
logger.info("***** Train results *****")
for key, value in sorted(train_result.metrics.items()):
logger.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")
# Need to save the state, since Trainer.save_model saves only the tokenizer with the model
trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))
# Evaluation
results = {}
if training_args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate()
eval_output = trainer.evaluate()
max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
perplexity = math.exp(metrics["eval_loss"])
metrics["perplexity"] = perplexity
perplexity = math.exp(eval_output["eval_loss"])
results["perplexity"] = perplexity
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt")
if trainer.is_world_process_zero():
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key, value in sorted(results.items()):
logger.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")
return results
def _mp_fn(index):

View File

@@ -28,7 +28,6 @@ from transformers import (
AutoConfig,
AutoModelForMultipleChoice,
AutoTokenizer,
DataCollatorWithPadding,
EvalPrediction,
HfArgumentParser,
Trainer,
@@ -189,9 +188,6 @@ def main():
preds = np.argmax(p.predictions, axis=1)
return {"acc": simple_accuracy(preds, p.label_ids)}
# Data collator
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None
# Initialize our Trainer
trainer = Trainer(
model=model,
@@ -199,7 +195,6 @@ def main():
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=compute_metrics,
data_collator=data_collator,
)
# Training

View File

@@ -23,14 +23,7 @@ from dataclasses import dataclass, field
from typing import Optional
import transformers
from transformers import (
AutoConfig,
AutoModelForQuestionAnswering,
AutoTokenizer,
DataCollatorWithPadding,
HfArgumentParser,
SquadDataset,
)
from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer, HfArgumentParser, SquadDataset
from transformers import SquadDataTrainingArguments as DataTrainingArguments
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import is_main_process
@@ -152,16 +145,12 @@ def main():
else None
)
# Data collator
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None
# Initialize our Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
data_collator=data_collator,
)
# Training

1
examples/legacy/run_camembert.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
import torch
from transformers import CamembertForMaskedLM, CamembertTokenizer

1
examples/legacy/run_chinese_ref.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
import argparse
import json
from typing import List

1
examples/legacy/run_language_modeling.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.

1
examples/legacy/run_openai_gpt.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.

1
examples/legacy/run_swag.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.

1
examples/legacy/run_transfo_xl.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.

View File

@@ -1,334 +0,0 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Sequence-to-Sequence Training and Evaluation
This directory contains examples for finetuning and evaluating transformers on summarization and translation tasks.
For deprecated `bertabs` instructions, see [`bertabs/README.md`](https://github.com/huggingface/transformers/blob/master/examples/research_projects/bertabs/README.md).
### Supported Architectures
- `BartForConditionalGeneration`
- `MarianMTModel`
- `PegasusForConditionalGeneration`
- `MBartForConditionalGeneration`
- `FSMTForConditionalGeneration`
- `T5ForConditionalGeneration`
### Downlowd the Datasets
#### XSUM
```bash
cd examples/legacy/seq2seq
wget https://cdn-datasets.huggingface.co/summarization/xsum.tar.gz
tar -xzvf xsum.tar.gz
export XSUM_DIR=${PWD}/xsum
```
this should make a directory called `xsum/` with files like `test.source`.
To use your own data, copy that files format. Each article to be summarized is on its own line.
#### CNN/DailyMail
```bash
cd examples/legacy/seq2seq
wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz
tar -xzvf cnn_dm_v2.tgz # empty lines removed
mv cnn_cln cnn_dm
export CNN_DIR=${PWD}/cnn_dm
```
this should make a directory called `cnn_dm/` with 6 files.
#### WMT16 English-Romanian Translation Data
download with this command:
```bash
wget https://cdn-datasets.huggingface.co/translation/wmt_en_ro.tar.gz
tar -xzvf wmt_en_ro.tar.gz
export ENRO_DIR=${PWD}/wmt_en_ro
```
this should make a directory called `wmt_en_ro/` with 6 files.
#### WMT English-German
```bash
wget https://cdn-datasets.huggingface.co/translation/wmt_en_de.tgz
tar -xzvf wmt_en_de.tgz
export DATA_DIR=${PWD}/wmt_en_de
```
#### FSMT datasets (wmt)
Refer to the scripts starting with `eval_` under:
https://github.com/huggingface/transformers/tree/master/scripts/fsmt
#### Pegasus (multiple datasets)
Multiple eval datasets are available for download from:
https://github.com/stas00/porting/tree/master/datasets/pegasus
#### Your Data
If you are using your own data, it must be formatted as one directory with 6 files:
```
train.source
train.target
val.source
val.target
test.source
test.target
```
The `.source` files are the input, the `.target` files are the desired output.
### Potential issues
- native AMP (`--fp16` and no apex) may lead to a huge memory leak and require 10x gpu memory. This has been fixed in pytorch-nightly and the minimal official version to have this fix will be pytorch-1.7.1. Until then if you have to use mixed precision please use AMP only with pytorch-nightly or NVIDIA's apex. Reference: https://github.com/huggingface/transformers/issues/8403
### Tips and Tricks
General Tips:
- since you need to run from `examples/legacy/seq2seq`, and likely need to modify code, the easiest workflow is fork transformers, clone your fork, and run `pip install -e .` before you get started.
- try `--freeze_encoder` or `--freeze_embeds` for faster training/larger batch size. (3hr per epoch with bs=8, see the "xsum_shared_task" command below)
- `fp16_opt_level=O1` (the default works best).
- In addition to the pytorch-lightning .ckpt checkpoint, a transformers checkpoint will be saved.
Load it with `BartForConditionalGeneration.from_pretrained(f'{output_dir}/best_tfmr)`.
- At the moment, `--do_predict` does not work in a multi-gpu setting. You need to use `evaluate_checkpoint` or the `run_eval.py` code.
- This warning can be safely ignored:
> "Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-xsum and are newly initialized: ['final_logits_bias']"
- Both finetuning and eval are 30% faster with `--fp16`. For that you need to [install apex](https://github.com/NVIDIA/apex#quick-start).
- Read scripts before you run them!
Summarization Tips:
- (summ) 1 epoch at batch size 1 for bart-large takes 24 hours and requires 13GB GPU RAM with fp16 on an NVIDIA-V100.
- If you want to run experiments on improving the summarization finetuning process, try the XSUM Shared Task (below). It's faster to train than CNNDM because the summaries are shorter.
- For CNN/DailyMail, the default `val_max_target_length` and `test_max_target_length` will truncate the ground truth labels, resulting in slightly higher rouge scores. To get accurate rouge scores, you should rerun calculate_rouge on the `{output_dir}/test_generations.txt` file saved by `trainer.test()`
- `--max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 ` is a reasonable setting for XSUM.
- `wandb` can be used by specifying `--logger_name wandb`. It is useful for reproducibility. Specify the environment variable `WANDB_PROJECT='hf_xsum'` to do the XSUM shared task.
- If you are finetuning on your own dataset, start from `distilbart-cnn-12-6` if you want long summaries and `distilbart-xsum-12-6` if you want short summaries.
(It rarely makes sense to start from `bart-large` unless you are a researching finetuning methods).
**Update 2018-07-18**
Datasets: `LegacySeq2SeqDataset` will be used for all tokenizers without a `prepare_seq2seq_batch` method. Otherwise, `Seq2SeqDataset` will be used.
Future work/help wanted: A new dataset to support multilingual tasks.
### Fine-tuning using Seq2SeqTrainer
To use `Seq2SeqTrainer` for fine-tuning you should use the `finetune_trainer.py` script. It subclasses `Trainer` to extend it for seq2seq training. Except the `Trainer`-related `TrainingArguments`, it shares the same argument names as that of `finetune.py` file. One notable difference is that calculating generative metrics (BLEU, ROUGE) is optional and is controlled using the `--predict_with_generate` argument.
With PyTorch 1.6+ it'll automatically use `native AMP` when `--fp16` is set.
To see all the possible command line options, run:
```bash
python finetune_trainer.py --help
```
For multi-gpu training use `torch.distributed.launch`, e.g. with 2 gpus:
```bash
python -m torch.distributed.launch --nproc_per_node=2 finetune_trainer.py ...
```
**At the moment, `Seq2SeqTrainer` does not support *with teacher* distillation.**
All `Seq2SeqTrainer`-based fine-tuning scripts are included in the `builtin_trainer` directory.
#### TPU Training
`Seq2SeqTrainer` supports TPU training with few caveats
1. As `generate` method does not work on TPU at the moment, `predict_with_generate` cannot be used. You should use `--prediction_loss_only` to only calculate loss, and do not set `--do_predict` and `--predict_with_generate`.
2. All sequences should be padded to be of equal length to avoid extremely slow training. (`finetune_trainer.py` does this automatically when running on TPU.)
We provide a very simple launcher script named `xla_spawn.py` that lets you run our example scripts on multiple TPU cores without any boilerplate. Just pass a `--num_cores` flag to this script, then your regular training script with its arguments (this is similar to the `torch.distributed.launch` helper for `torch.distributed`).
`builtin_trainer/finetune_tpu.sh` script provides minimal arguments needed for TPU training.
The following command fine-tunes `sshleifer/student_marian_en_ro_6_3` on TPU V3-8 and should complete one epoch in ~5-6 mins.
```bash
./builtin_trainer/train_distil_marian_enro_tpu.sh
```
## Evaluation Commands
To create summaries for each article in dataset, we use `run_eval.py`, here are a few commands that run eval for different tasks and models.
If 'translation' is in your task name, the computed metric will be BLEU. Otherwise, ROUGE will be used.
For t5, you need to specify --task translation_{src}_to_{tgt} as follows:
```bash
export DATA_DIR=wmt_en_ro
./run_eval.py t5-base \
$DATA_DIR/val.source t5_val_generations.txt \
--reference_path $DATA_DIR/val.target \
--score_path enro_bleu.json \
--task translation_en_to_ro \
--n_obs 100 \
--device cuda \
--fp16 \
--bs 32
```
This command works for MBART, although the BLEU score is suspiciously low.
```bash
export DATA_DIR=wmt_en_ro
./run_eval.py facebook/mbart-large-en-ro $DATA_DIR/val.source mbart_val_generations.txt \
--reference_path $DATA_DIR/val.target \
--score_path enro_bleu.json \
--task translation \
--n_obs 100 \
--device cuda \
--fp16 \
--bs 32
```
Summarization (xsum will be very similar):
```bash
export DATA_DIR=cnn_dm
./run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_generations.txt \
--reference_path $DATA_DIR/val.target \
--score_path cnn_rouge.json \
--task summarization \
--n_obs 100 \
th 56 \
--fp16 \
--bs 32
```
### Multi-GPU Evaluation
here is a command to run xsum evaluation on 8 GPUS. It is more than linearly faster than run_eval.py in some cases
because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have
`{type_path}.source` and `{type_path}.target`. Run `./run_distributed_eval.py --help` for all clargs.
```bash
python -m torch.distributed.launch --nproc_per_node=8 run_distributed_eval.py \
--model_name sshleifer/distilbart-large-xsum-12-3 \
--save_dir xsum_generations \
--data_dir xsum \
--fp16 # you can pass generate kwargs like num_beams here, just like run_eval.py
```
Contributions that implement this command for other distributed hardware setups are welcome!
#### Single-GPU Eval: Tips and Tricks
When using `run_eval.py`, the following features can be useful:
* if you running the script multiple times and want to make it easier to track what arguments produced that output, use `--dump-args`. Along with the results it will also dump any custom params that were passed to the script. For example if you used: `--num_beams 8 --early_stopping true`, the output will be:
```
{'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True}
```
`--info` is an additional argument available for the same purpose of tracking the conditions of the experiment. It's useful to pass things that weren't in the argument list, e.g. a language pair `--info "lang:en-ru"`. But also if you pass `--info` without a value it will fallback to the current date/time string, e.g. `2020-09-13 18:44:43`.
If using `--dump-args --info`, the output will be:
```
{'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': '2020-09-13 18:44:43'}
```
If using `--dump-args --info "pair:en-ru chkpt=best`, the output will be:
```
{'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': 'pair=en-ru chkpt=best'}
```
* if you need to perform a parametric search in order to find the best ones that lead to the highest BLEU score, let `run_eval_search.py` to do the searching for you.
The script accepts the exact same arguments as `run_eval.py`, plus an additional argument `--search`. The value of `--search` is parsed, reformatted and fed to ``run_eval.py`` as additional args.
The format for the `--search` value is a simple string with hparams and colon separated values to try, e.g.:
```
--search "num_beams=5:10 length_penalty=0.8:1.0:1.2 early_stopping=true:false"
```
which will generate `12` `(2*3*2)` searches for a product of each hparam. For example the example that was just used will invoke `run_eval.py` repeatedly with:
```
--num_beams 5 --length_penalty 0.8 --early_stopping true
--num_beams 5 --length_penalty 0.8 --early_stopping false
[...]
--num_beams 10 --length_penalty 1.2 --early_stopping false
```
On completion, this function prints a markdown table of the results sorted by the best BLEU score and the winning arguments.
```
bleu | num_beams | length_penalty | early_stopping
----- | --------- | -------------- | --------------
26.71 | 5 | 1.1 | 1
26.66 | 5 | 0.9 | 1
26.66 | 5 | 0.9 | 0
26.41 | 5 | 1.1 | 0
21.94 | 1 | 0.9 | 1
21.94 | 1 | 0.9 | 0
21.94 | 1 | 1.1 | 1
21.94 | 1 | 1.1 | 0
Best score args:
stas/wmt19-en-ru data/en-ru/val.source data/en-ru/test_translations.txt --reference_path data/en-ru/val.target --score_path data/en-ru/test_bleu.json --bs 8 --task translation --num_beams 5 --length_penalty 1.1 --early_stopping True
```
If you pass `--info "some experiment-specific info"` it will get printed before the results table - this is useful for scripting and multiple runs, so one can tell the different sets of results from each other.
### Contributing
- follow the standard contributing guidelines and code of conduct.
- add tests to `test_seq2seq_examples.py`
- To run only the seq2seq tests, you must be in the root of the repository and run:
```bash
pytest examples/seq2seq/
```
### Converting pytorch-lightning checkpoints
pytorch lightning ``-do_predict`` often fails, after you are done training, the best way to evaluate your model is to convert it.
This should be done for you, with a file called `{save_dir}/best_tfmr`.
If that file doesn't exist but you have a lightning `.ckpt` file, you can run
```bash
python convert_pl_checkpoint_to_hf.py PATH_TO_CKPT randomly_initialized_hf_model_path save_dir/best_tfmr
```
Then either `run_eval` or `run_distributed_eval` with `save_dir/best_tfmr` (see previous sections)
# Experimental Features
These features are harder to use and not always useful.
### Dynamic Batch Size for MT
`finetune.py` has a command line arg `--max_tokens_per_batch` that allows batches to be dynamically sized.
This feature can only be used:
- with fairseq installed
- on 1 GPU
- without sortish sampler
- after calling `./save_len_file.py $tok $data_dir`
For example,
```bash
./save_len_file.py Helsinki-NLP/opus-mt-en-ro wmt_en_ro
./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
```
splits `wmt_en_ro/train` into 11,197 uneven lengthed batches and can finish 1 epoch in 8 minutes on a v100.
For comparison,
```bash
./dynamic_bs_example.sh --sortish_sampler --train_batch_size 48
```
uses 12,723 batches of length 48 and takes slightly more time 9.5 minutes.
The feature is still experimental, because:
+ we can make it much more robust if we have memory mapped/preprocessed datasets.
+ The speedup over sortish sampler is not that large at the moment.

View File

@@ -1,20 +0,0 @@
tensorboard
scikit-learn
seqeval
psutil
sacrebleu
rouge-score
tensorflow_datasets
matplotlib
git-python==1.0.3
faiss-cpu
streamlit
elasticsearch
nltk
pandas
datasets >= 1.1.3
fire
pytest
conllu
sentencepiece != 0.1.92
protobuf

View File

@@ -129,71 +129,6 @@ On the test dataset the following results could be achieved:
10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085
```
#### Run the Tensorflow 2 version
To start training, just run:
```bash
python3 run_tf_ner.py --data_dir ./ \
--labels ./labels.txt \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_device_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict
```
Such as the Pytorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
#### Evaluation
Evaluation on development dataset outputs the following for our example:
```bash
precision recall f1-score support
LOCderiv 0.7619 0.6154 0.6809 52
PERpart 0.8724 0.8997 0.8858 4057
OTHpart 0.9360 0.9466 0.9413 711
ORGpart 0.7015 0.6989 0.7002 269
LOCpart 0.7668 0.8488 0.8057 496
LOC 0.8745 0.9191 0.8963 235
ORGderiv 0.7723 0.8571 0.8125 91
OTHderiv 0.4800 0.6667 0.5581 18
OTH 0.5789 0.6875 0.6286 16
PERderiv 0.5385 0.3889 0.4516 18
PER 0.5000 0.5000 0.5000 2
ORG 0.0000 0.0000 0.0000 3
micro avg 0.8574 0.8862 0.8715 5968
macro avg 0.8575 0.8862 0.8713 5968
```
On the test dataset the following results could be achieved:
```bash
precision recall f1-score support
PERpart 0.8847 0.8944 0.8896 9397
OTHpart 0.9376 0.9353 0.9365 1639
ORGpart 0.7307 0.7044 0.7173 697
LOC 0.9133 0.9394 0.9262 561
LOCpart 0.8058 0.8157 0.8107 1150
ORG 0.0000 0.0000 0.0000 8
OTHderiv 0.5882 0.4762 0.5263 42
PERderiv 0.6571 0.5227 0.5823 44
OTH 0.4906 0.6667 0.5652 39
ORGderiv 0.7016 0.7791 0.7383 172
LOCderiv 0.8256 0.6514 0.7282 109
PER 0.0000 0.0000 0.0000 11
micro avg 0.8722 0.8774 0.8748 13869
macro avg 0.8712 0.8774 0.8740 13869
```
### Emerging and Rare Entities task: WNUT17 (English NER) dataset
Description of the WNUT17 task from the [shared task website](http://noisy-text.github.io/2017/index.html):

View File

@@ -30,7 +30,6 @@ from transformers import (
AutoConfig,
AutoModelForTokenClassification,
AutoTokenizer,
DataCollatorWithPadding,
EvalPrediction,
HfArgumentParser,
Trainer,
@@ -238,9 +237,6 @@ def main():
"f1": f1_score(out_label_list, preds_list),
}
# Data collator
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None
# Initialize our Trainer
trainer = Trainer(
model=model,
@@ -248,7 +244,6 @@ def main():
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=compute_metrics,
data_collator=data_collator,
)
# Training

View File

@@ -14,13 +14,11 @@ See the License for the specific language governing permissions and
limitations under the License.
-->
# Multiple Choice
## Multiple Choice
Based on the script [`run_swag.py`]().
## PyTorch script: fine-tuning on SWAG
`run_swag` allows you to fine-tune any model from our [hub](https://huggingface.co/models) (as long as its architecture as a `ForMultipleChoice` version in the library) on the SWAG dataset or your own csv/jsonlines files as long as they are structured the same way. To make it works on another dataset, you will need to tweak the `preprocess_function` inside the script.
#### Fine-tuning on SWAG
```bash
python examples/multiple-choice/run_swag.py \
@@ -41,73 +39,6 @@ eval_acc = 0.8338998300509847
eval_loss = 0.44457291918821606
```
## PyTorch version, no Trainer
Based on the script [run_ner_no_trainer.py](https://github.com/huggingface/transformers/blob/master/examples/multiple-choice/run_swag_no_trainer.py).
Like `run_swag.py`, this script allows you to fine-tune any of the models on the [hub](https://huggingface.co/models) (as long as its architecture as a `ForMultipleChoice` version in the library) on
the SWAG dataset or your own data in a csv or a JSON file. The main difference is that this
script exposes the bare training loop, to allow you to quickly experiment and add any customization you would like.
It offers less options than the script with `Trainer` (but you can easily change the options for the optimizer
or the dataloaders directly in the script) but still run in a distributed setup, on TPU and supports mixed precision by
the mean of the [🤗 `Accelerate`](https://github.com/huggingface/accelerate) library. You can use the script normally
after installing it:
```bash
pip install accelerate
```
then
```bash
export DATASET_NAME=swag
python run_swag_no_trainer.py \
--model_name_or_path bert-base-cased \
--dataset_name $DATASET_NAME \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--output_dir /tmp/$DATASET_NAME/
```
You can then use your usual launchers to run in it in a distributed environment, but the easiest way is to run
```bash
accelerate config
```
and reply to the questions asked. Then
```bash
accelerate test
```
that will check everything is ready for training. Finally, you can launch training with
```bash
export DATASET_NAME=swag
accelerate launch run_swag_no_trainer.py \
--model_name_or_path bert-base-cased \
--dataset_name $DATASET_NAME \
--max_seq_length 128 \
--per_device_train_batch_size 32 \
--learning_rate 2e-5 \
--num_train_epochs 3 \
--output_dir /tmp/$DATASET_NAME/
```
This command is the same and will work for:
- a CPU-only setup
- a setup with one GPU
- a distributed training with several GPUs (single or multi node)
- a training on TPUs
Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it.
## Tensorflow
@@ -129,3 +60,6 @@ python ./examples/multiple-choice/run_tf_multiple_choice.py \
--gradient_accumulation_steps 2 \
--overwrite_output
```
# Run it in colab
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)

View File

@@ -1,19 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
accelerate launch run_swag_no_trainer.py \
--model_name_or_path bert-base-uncased \
--dataset_name swag \
--output_dir /tmp/test-swag-no-trainer \
--pad_to_max_length

159
examples/multiple-choice/run_swag.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
#
@@ -39,15 +38,10 @@ from transformers import (
default_data_collator,
set_seed,
)
from transformers.file_utils import PaddingStrategy
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase
from transformers.trainer_utils import is_main_process
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.5.0")
logger = logging.getLogger(__name__)
@@ -120,20 +114,6 @@ class DataTrainingArguments:
"efficient on GPU but very bad for TPU."
},
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_val_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
"value if set."
},
)
def __post_init__(self):
if self.train_file is not None:
@@ -152,7 +132,7 @@ class DataCollatorForMultipleChoice:
Args:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
@@ -214,28 +194,23 @@ def main():
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty."
"Use --overwrite_output_dir to overcome."
)
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
)
logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
# Log on each process the small summary:
logger.warning(
@@ -245,9 +220,7 @@ def main():
# Set the verbosity to info of the Transformers logger (on main process only):
if is_main_process(training_args.local_rank):
transformers.utils.logging.set_verbosity_info()
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
logger.info(f"Training/evaluation parameters {training_args}")
logger.info("Training/evaluation parameters %s", training_args)
# Set seed before initializing model.
set_seed(training_args.seed)
@@ -307,22 +280,6 @@ def main():
context_name = "sent1"
question_header_name = "sent2"
if data_args.max_seq_length is None:
max_seq_length = tokenizer.model_max_length
if max_seq_length > 1024:
logger.warn(
f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
"Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
)
max_seq_length = 1024
else:
if data_args.max_seq_length > tokenizer.model_max_length:
logger.warn(
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
# Preprocessing the datasets.
def preprocess_function(examples):
first_sentences = [[context] * 4 for context in examples[context_name]]
@@ -340,43 +297,22 @@ def main():
first_sentences,
second_sentences,
truncation=True,
max_length=max_seq_length,
max_length=data_args.max_seq_length,
padding="max_length" if data_args.pad_to_max_length else False,
)
# Un-flatten
return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
if training_args.do_train:
train_dataset = datasets["train"]
if "train" not in datasets:
raise ValueError("--do_train requires a train dataset")
if data_args.max_train_samples is not None:
train_dataset = train_dataset.select(range(data_args.max_train_samples))
train_dataset = train_dataset.map(
preprocess_function,
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
)
if training_args.do_eval:
if "validation" not in datasets:
raise ValueError("--do_eval requires a validation dataset")
eval_dataset = datasets["validation"]
if data_args.max_val_samples is not None:
eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
eval_dataset = eval_dataset.map(
preprocess_function,
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
)
tokenized_datasets = datasets.map(
preprocess_function,
batched=True,
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=not data_args.overwrite_cache,
)
# Data collator
data_collator = (
default_data_collator
if data_args.pad_to_max_length
else DataCollatorForMultipleChoice(tokenizer=tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
default_data_collator if data_args.pad_to_max_length else DataCollatorForMultipleChoice(tokenizer=tokenizer)
)
# Metric
@@ -389,8 +325,8 @@ def main():
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset if training_args.do_train else None,
eval_dataset=eval_dataset if training_args.do_eval else None,
train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
@@ -398,35 +334,38 @@ def main():
# Training
if training_args.do_train:
if last_checkpoint is not None:
checkpoint = last_checkpoint
elif os.path.isdir(model_args.model_name_or_path):
checkpoint = model_args.model_name_or_path
else:
checkpoint = None
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model() # Saves the tokenizer too for easy upload
metrics = train_result.metrics
max_train_samples = (
data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
train_result = trainer.train(
model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
trainer.save_model() # Saves the tokenizer too for easy upload
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
if trainer.is_world_process_zero():
with open(output_train_file, "w") as writer:
logger.info("***** Train results *****")
for key, value in sorted(train_result.metrics.items()):
logger.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")
# Need to save the state, since Trainer.save_model saves only the tokenizer with the model
trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))
# Evaluation
results = {}
if training_args.do_eval:
logger.info("*** Evaluate ***")
metrics = trainer.evaluate()
max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
results = trainer.evaluate()
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
output_eval_file = os.path.join(training_args.output_dir, "eval_results_swag.txt")
if trainer.is_world_process_zero():
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key, value in sorted(results.items()):
logger.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")
return results
def _mp_fn(index):

View File

@@ -1,488 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning a 🤗 Transformers model on multiple choice relying on the accelerate library without using a Trainer.
"""
# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.
import argparse
import logging
import math
import os
import random
from dataclasses import dataclass
from typing import Optional, Union
import datasets
import torch
from datasets import load_dataset, load_metric
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator
from transformers import (
CONFIG_MAPPING,
MODEL_MAPPING,
AdamW,
AutoConfig,
AutoModelForMultipleChoice,
AutoTokenizer,
PreTrainedTokenizerBase,
SchedulerType,
default_data_collator,
get_scheduler,
set_seed,
)
from transformers.file_utils import PaddingStrategy
logger = logging.getLogger(__name__)
# You should update this to your particular problem to have better documentation of `model_type`
MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
def parse_args():
parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
parser.add_argument(
"--dataset_name",
type=str,
default=None,
help="The name of the dataset to use (via the datasets library).",
)
parser.add_argument(
"--dataset_config_name",
type=str,
default=None,
help="The configuration name of the dataset to use (via the datasets library).",
)
parser.add_argument(
"--train_file", type=str, default=None, help="A csv or a json file containing the training data."
)
parser.add_argument(
"--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
)
parser.add_argument(
"--max_length",
type=int,
default=128,
help=(
"The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
" sequences shorter will be padded if `--pad_to_max_lengh` is passed."
),
)
parser.add_argument(
"--pad_to_max_length",
action="store_true",
help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
)
parser.add_argument(
"--model_name_or_path",
type=str,
help="Path to pretrained model or model identifier from huggingface.co/models.",
required=True,
)
parser.add_argument(
"--config_name",
type=str,
default=None,
help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--tokenizer_name",
type=str,
default=None,
help="Pretrained tokenizer name or path if not the same as model_name",
)
parser.add_argument(
"--use_slow_tokenizer",
action="store_true",
help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
)
parser.add_argument(
"--per_device_train_batch_size",
type=int,
default=8,
help="Batch size (per device) for the training dataloader.",
)
parser.add_argument(
"--per_device_eval_batch_size",
type=int,
default=8,
help="Batch size (per device) for the evaluation dataloader.",
)
parser.add_argument(
"--learning_rate",
type=float,
default=5e-5,
help="Initial learning rate (after the potential warmup period) to use.",
)
parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
parser.add_argument(
"--max_train_steps",
type=int,
default=None,
help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
)
parser.add_argument(
"--gradient_accumulation_steps",
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
"--lr_scheduler_type",
type=SchedulerType,
default="linear",
help="The scheduler type to use.",
choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
)
parser.add_argument(
"--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
)
parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
parser.add_argument(
"--model_type",
type=str,
default=None,
help="Model type to use if training from scratch.",
choices=MODEL_TYPES,
)
parser.add_argument(
"--debug",
action="store_true",
help="Activate debug mode and run training only with a subset of data.",
)
args = parser.parse_args()
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
@dataclass
class DataCollatorForMultipleChoice:
"""
Data collator that will dynamically pad the inputs for multiple choice received.
Args:
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
The tokenizer used for encoding the data.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
"""
tokenizer: PreTrainedTokenizerBase
padding: Union[bool, str, PaddingStrategy] = True
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
def __call__(self, features):
label_name = "label" if "label" in features[0].keys() else "labels"
labels = [feature.pop(label_name) for feature in features]
batch_size = len(features)
num_choices = len(features[0]["input_ids"])
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
flattened_features = sum(flattened_features, [])
batch = self.tokenizer.pad(
flattened_features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors="pt",
)
# Un-flatten
batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
# Add back labels
batch["labels"] = torch.tensor(labels, dtype=torch.int64)
return batch
def main():
args = parse_args()
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
accelerator = Accelerator()
# Make one log on every process with the configuration for debugging.
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)
logger.info(accelerator.state)
# Setup logging, we only want one process per machine to log things on the screen.
# accelerator.is_local_main_process is only True for one process per machine.
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
if accelerator.is_local_main_process:
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
else:
datasets.utils.logging.set_verbosity_error()
transformers.utils.logging.set_verbosity_error()
# If passed along, set the training seed now.
if args.seed is not None:
set_seed(args.seed)
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
#
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
# 'text' is found. You can easily tweak this behavior (see below).
#
# In distributed training, the load_dataset function guarantee that only one local process can concurrently
# download the dataset.
if args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
else:
data_files = {}
if args.train_file is not None:
data_files["train"] = args.train_file
if args.validation_file is not None:
data_files["validation"] = args.validation_file
extension = args.train_file.split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files)
# Trim a number of training examples
if args.debug:
for split in raw_datasets.keys():
raw_datasets[split] = raw_datasets[split].select(range(100))
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.
if raw_datasets["train"] is not None:
column_names = raw_datasets["train"].column_names
else:
column_names = raw_datasets["validation"].column_names
# When using your own dataset or a different dataset from swag, you will probably need to change this.
ending_names = [f"ending{i}" for i in range(4)]
context_name = "sent1"
question_header_name = "sent2"
label_column_name = "label" if "label" in column_names else "labels"
# Load pretrained model and tokenizer
#
# In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.
if args.config_name:
config = AutoConfig.from_pretrained(args.model_name_or_path)
elif args.model_name_or_path:
config = AutoConfig.from_pretrained(args.model_name_or_path)
else:
config = CONFIG_MAPPING[args.model_type]()
logger.warning("You are instantiating a new config instance from scratch.")
if args.tokenizer_name:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
elif args.model_name_or_path:
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
)
if args.model_name_or_path:
model = AutoModelForMultipleChoice.from_pretrained(
args.model_name_or_path,
from_tf=bool(".ckpt" in args.model_name_or_path),
config=config,
)
else:
logger.info("Training new model from scratch")
model = AutoModelForMultipleChoice.from_config(config)
model.resize_token_embeddings(len(tokenizer))
# Preprocessing the datasets.
# First we tokenize all the texts.
padding = "max_length" if args.pad_to_max_length else False
def preprocess_function(examples):
first_sentences = [[context] * 4 for context in examples[context_name]]
question_headers = examples[question_header_name]
second_sentences = [
[f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
]
labels = examples[label_column_name]
# Flatten out
first_sentences = sum(first_sentences, [])
second_sentences = sum(second_sentences, [])
# Tokenize
tokenized_examples = tokenizer(
first_sentences,
second_sentences,
max_length=args.max_length,
padding=padding,
truncation=True,
)
# Un-flatten
tokenized_inputs = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
tokenized_inputs["labels"] = labels
return tokenized_inputs
processed_datasets = raw_datasets.map(
preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
)
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]
# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
# DataLoaders creation:
if args.pad_to_max_length:
# If padding was already done ot max length, we use the default data collator that will just convert everything
# to tensors.
data_collator = default_data_collator
else:
# Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
# the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
# of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
data_collator = DataCollatorForMultipleChoice(
tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
)
train_dataloader = DataLoader(
train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
# Optimizer
# Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": args.weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
# Use the device given by the `accelerator` object.
device = accelerator.device
model.to(device)
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
model, optimizer, train_dataloader, eval_dataloader
)
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
# shorter in multiprocess)
# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if args.max_train_steps is None:
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
else:
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
lr_scheduler = get_scheduler(
name=args.lr_scheduler_type,
optimizer=optimizer,
num_warmup_steps=args.num_warmup_steps,
num_training_steps=args.max_train_steps,
)
# Metrics
metric = load_metric("accuracy")
# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
logger.info("***** Running training *****")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Num Epochs = {args.num_train_epochs}")
logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
logger.info(f" Total optimization steps = {args.max_train_steps}")
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
for epoch in range(args.num_train_epochs):
model.train()
for step, batch in enumerate(train_dataloader):
outputs = model(**batch)
loss = outputs.loss
loss = loss / args.gradient_accumulation_steps
accelerator.backward(loss)
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
completed_steps += 1
if completed_steps >= args.max_train_steps:
break
model.eval()
for step, batch in enumerate(eval_dataloader):
with torch.no_grad():
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1)
metric.add_batch(
predictions=accelerator.gather(predictions),
references=accelerator.gather(batch["labels"]),
)
eval_metric = metric.compute()
accelerator.print(f"epoch {epoch}: {eval_metric}")
if args.output_dir is not None:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if __name__ == "__main__":
main()

15
examples/multiple-choice/run_tf_multiple_choice.py Executable file → Normal file
View File

@@ -1,4 +1,3 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
@@ -116,10 +115,12 @@ def main():
level=logging.INFO,
)
logger.warning(
f"device: {training_args.device}, n_replicas: {training_args.n_replicas}, "
f"16-bits training: {training_args.fp16}"
"device: %s, n_replicas: %s, 16-bits training: %s",
training_args.device,
training_args.n_replicas,
training_args.fp16,
)
logger.info(f"Training/evaluation parameters {training_args}")
logger.info("Training/evaluation parameters %s", training_args)
# Set seed
set_seed(training_args.seed)
@@ -129,7 +130,7 @@ def main():
label_list = processor.get_labels()
num_labels = len(label_list)
except KeyError:
raise ValueError(f"Task not found: {data_args.task_name}")
raise ValueError("Task not found: %s" % (data_args.task_name))
# Load pretrained model and tokenizer
#
@@ -208,8 +209,8 @@ def main():
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results *****")
for key, value in result.items():
logger.info(f" {key} = {value}")
writer.write(f"{key} = {value}\n")
logger.info(" %s = %s", key, value)
writer.write("%s = %s\n" % (key, value))
results.update(result)

View File

@@ -99,7 +99,13 @@ if is_torch_available():
processor = processors[task]()
cached_features_file = os.path.join(
data_dir, f"cached_{mode.value}_{tokenizer.__class__.__name__}_{max_seq_length}_{task}"
data_dir,
"cached_{}_{}_{}_{}".format(
mode.value,
tokenizer.__class__.__name__,
str(max_seq_length),
task,
),
)
# Make sure only the first process in distributed training processes the dataset,
@@ -119,14 +125,14 @@ if is_torch_available():
examples = processor.get_test_examples(data_dir)
else:
examples = processor.get_train_examples(data_dir)
logger.info(f"Training examples: {len(examples)}")
logger.info("Training examples: %s", len(examples))
self.features = convert_examples_to_features(
examples,
label_list,
max_seq_length,
tokenizer,
)
logger.info(f"Saving features into cached file {cached_features_file}")
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(self.features, cached_features_file)
def __len__(self):
@@ -166,7 +172,7 @@ if is_tf_available():
examples = processor.get_test_examples(data_dir)
else:
examples = processor.get_train_examples(data_dir)
logger.info(f"Training examples: {len(examples)}")
logger.info("Training examples: %s", len(examples))
self.features = convert_examples_to_features(
examples,
@@ -178,7 +184,7 @@ if is_tf_available():
def gen():
for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
if ex_index % 10000 == 0:
logger.info(f"Writing example {ex_index} of {len(examples)}")
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
yield (
{
@@ -249,7 +255,7 @@ class RaceProcessor(DataProcessor):
def get_train_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} train")
logger.info("LOOKING AT {} train".format(data_dir))
high = os.path.join(data_dir, "train/high")
middle = os.path.join(data_dir, "train/middle")
high = self._read_txt(high)
@@ -258,7 +264,7 @@ class RaceProcessor(DataProcessor):
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} dev")
logger.info("LOOKING AT {} dev".format(data_dir))
high = os.path.join(data_dir, "dev/high")
middle = os.path.join(data_dir, "dev/middle")
high = self._read_txt(high)
@@ -267,7 +273,7 @@ class RaceProcessor(DataProcessor):
def get_test_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} test")
logger.info("LOOKING AT {} test".format(data_dir))
high = os.path.join(data_dir, "test/high")
middle = os.path.join(data_dir, "test/middle")
high = self._read_txt(high)
@@ -292,7 +298,7 @@ class RaceProcessor(DataProcessor):
"""Creates examples for the training and dev sets."""
examples = []
for (_, data_raw) in enumerate(lines):
race_id = f"{set_type}-{data_raw['race_id']}"
race_id = "%s-%s" % (set_type, data_raw["race_id"])
article = data_raw["article"]
for i in range(len(data_raw["answers"])):
truth = str(ord(data_raw["answers"][i]) - ord("A"))
@@ -316,17 +322,17 @@ class SynonymProcessor(DataProcessor):
def get_train_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} train")
logger.info("LOOKING AT {} train".format(data_dir))
return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} dev")
logger.info("LOOKING AT {} dev".format(data_dir))
return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} dev")
logger.info("LOOKING AT {} dev".format(data_dir))
return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")
@@ -362,17 +368,17 @@ class SwagProcessor(DataProcessor):
def get_train_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} train")
logger.info("LOOKING AT {} train".format(data_dir))
return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} dev")
logger.info("LOOKING AT {} dev".format(data_dir))
return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} dev")
logger.info("LOOKING AT {} dev".format(data_dir))
raise ValueError(
"For swag testing, the input file does not contain a label column. It can not be tested in current code"
"setting!"
@@ -413,16 +419,16 @@ class ArcProcessor(DataProcessor):
def get_train_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} train")
logger.info("LOOKING AT {} train".format(data_dir))
return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info(f"LOOKING AT {data_dir} dev")
logger.info("LOOKING AT {} dev".format(data_dir))
return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
def get_test_examples(self, data_dir):
logger.info(f"LOOKING AT {data_dir} test")
logger.info("LOOKING AT {} test".format(data_dir))
return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
def get_labels(self):
@@ -444,7 +450,7 @@ class ArcProcessor(DataProcessor):
elif truth in "1234":
return int(truth) - 1
else:
logger.info(f"truth ERROR! {truth}")
logger.info("truth ERROR! %s", str(truth))
return None
examples = []
@@ -490,11 +496,11 @@ class ArcProcessor(DataProcessor):
if type == "train":
assert len(examples) > 1
assert examples[0].label is not None
logger.info(f"len examples: {len(examples)}")
logger.info(f"Three choices: {three_choice}")
logger.info(f"Five choices: {five_choice}")
logger.info(f"Other choices: {other_choices}")
logger.info(f"four choices: {four_choice}")
logger.info("len examples: %s}", str(len(examples)))
logger.info("Three choices: %s", str(three_choice))
logger.info("Five choices: %s", str(five_choice))
logger.info("Other choices: %s", str(other_choices))
logger.info("four choices: %s", str(four_choice))
return examples
@@ -514,7 +520,7 @@ def convert_examples_to_features(
features = []
for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
if ex_index % 10000 == 0:
logger.info(f"Writing example {ex_index} of {len(examples)}")
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
choices_inputs = []
for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
text_a = context
@@ -564,7 +570,7 @@ def convert_examples_to_features(
for f in features[:2]:
logger.info("*** Example ***")
logger.info("feature: {f}")
logger.info("feature: %s" % f)
return features

Some files were not shown because too many files have changed in this diff Show More