Indent code block in the documentation (#11233)
* Indent code block * Indent code blocks version 2 * Quality
This commit is contained in:
@@ -33,38 +33,38 @@ Example of using a model with MeCab and WordPiece tokenization:
|
||||
|
||||
.. code-block::
|
||||
|
||||
>>> import torch
|
||||
>>> from transformers import AutoModel, AutoTokenizer
|
||||
>>> import torch
|
||||
>>> from transformers import AutoModel, AutoTokenizer
|
||||
|
||||
>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
|
||||
>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
|
||||
|
||||
>>> ## Input Japanese Text
|
||||
>>> line = "吾輩は猫である。"
|
||||
>>> ## Input Japanese Text
|
||||
>>> line = "吾輩は猫である。"
|
||||
|
||||
>>> inputs = tokenizer(line, return_tensors="pt")
|
||||
>>> inputs = tokenizer(line, return_tensors="pt")
|
||||
|
||||
>>> print(tokenizer.decode(inputs['input_ids'][0]))
|
||||
[CLS] 吾輩 は 猫 で ある 。 [SEP]
|
||||
>>> print(tokenizer.decode(inputs['input_ids'][0]))
|
||||
[CLS] 吾輩 は 猫 で ある 。 [SEP]
|
||||
|
||||
>>> outputs = bertjapanese(**inputs)
|
||||
>>> outputs = bertjapanese(**inputs)
|
||||
|
||||
Example of using a model with Character tokenization:
|
||||
|
||||
.. code-block::
|
||||
|
||||
>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
|
||||
>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
|
||||
|
||||
>>> ## Input Japanese Text
|
||||
>>> line = "吾輩は猫である。"
|
||||
>>> ## Input Japanese Text
|
||||
>>> line = "吾輩は猫である。"
|
||||
|
||||
>>> inputs = tokenizer(line, return_tensors="pt")
|
||||
>>> inputs = tokenizer(line, return_tensors="pt")
|
||||
|
||||
>>> print(tokenizer.decode(inputs['input_ids'][0]))
|
||||
[CLS] 吾 輩 は 猫 で あ る 。 [SEP]
|
||||
>>> print(tokenizer.decode(inputs['input_ids'][0]))
|
||||
[CLS] 吾 輩 は 猫 で あ る 。 [SEP]
|
||||
|
||||
>>> outputs = bertjapanese(**inputs)
|
||||
>>> outputs = bertjapanese(**inputs)
|
||||
|
||||
Tips:
|
||||
|
||||
|
||||
@@ -38,22 +38,22 @@ Usage:
|
||||
|
||||
.. code-block::
|
||||
|
||||
# leverage checkpoints for Bert2Bert model...
|
||||
# use BERT's cls token as BOS token and sep token as EOS token
|
||||
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
|
||||
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
|
||||
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
|
||||
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
|
||||
# leverage checkpoints for Bert2Bert model...
|
||||
# use BERT's cls token as BOS token and sep token as EOS token
|
||||
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
|
||||
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
|
||||
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
|
||||
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
|
||||
|
||||
# create tokenizer...
|
||||
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
|
||||
# create tokenizer...
|
||||
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
|
||||
|
||||
input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
|
||||
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
|
||||
input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
|
||||
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
|
||||
|
||||
# train...
|
||||
loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
|
||||
loss.backward()
|
||||
# train...
|
||||
loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
|
||||
loss.backward()
|
||||
|
||||
|
||||
- Pretrained :class:`~transformers.EncoderDecoderModel` checkpoints are also directly available in the model hub, e.g.,
|
||||
@@ -61,15 +61,15 @@ Usage:
|
||||
|
||||
.. code-block::
|
||||
|
||||
# instantiate sentence fusion model
|
||||
sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
|
||||
tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
|
||||
# instantiate sentence fusion model
|
||||
sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
|
||||
tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
|
||||
|
||||
input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids
|
||||
input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids
|
||||
|
||||
outputs = sentence_fuser.generate(input_ids)
|
||||
outputs = sentence_fuser.generate(input_ids)
|
||||
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
print(tokenizer.decode(outputs[0]))
|
||||
|
||||
|
||||
Tips:
|
||||
|
||||
@@ -31,28 +31,28 @@ Example of use:
|
||||
|
||||
.. code-block::
|
||||
|
||||
import torch
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
import torch
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
|
||||
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
|
||||
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
|
||||
|
||||
# For transformers v4.x+:
|
||||
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
|
||||
# For transformers v4.x+:
|
||||
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
|
||||
|
||||
# For transformers v3.x:
|
||||
# tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
|
||||
# For transformers v3.x:
|
||||
# tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
|
||||
|
||||
# INPUT TWEET IS ALREADY NORMALIZED!
|
||||
line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"
|
||||
# INPUT TWEET IS ALREADY NORMALIZED!
|
||||
line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"
|
||||
|
||||
input_ids = torch.tensor([tokenizer.encode(line)])
|
||||
input_ids = torch.tensor([tokenizer.encode(line)])
|
||||
|
||||
with torch.no_grad():
|
||||
features = bertweet(input_ids) # Models outputs are now tuples
|
||||
with torch.no_grad():
|
||||
features = bertweet(input_ids) # Models outputs are now tuples
|
||||
|
||||
## With TensorFlow 2.0+:
|
||||
# from transformers import TFAutoModel
|
||||
# bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
|
||||
## With TensorFlow 2.0+:
|
||||
# from transformers import TFAutoModel
|
||||
# bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
|
||||
|
||||
|
||||
The original code can be found `here <https://github.com/VinAIResearch/BERTweet>`__.
|
||||
|
||||
@@ -40,20 +40,20 @@ Examples of use:
|
||||
|
||||
.. code-block::
|
||||
|
||||
from transformers import HerbertTokenizer, RobertaModel
|
||||
from transformers import HerbertTokenizer, RobertaModel
|
||||
|
||||
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
|
||||
model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
|
||||
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
|
||||
model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
|
||||
|
||||
encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
|
||||
outputs = model(encoded_input)
|
||||
encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
|
||||
outputs = model(encoded_input)
|
||||
|
||||
# HerBERT can also be loaded using AutoTokenizer and AutoModel:
|
||||
import torch
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
# HerBERT can also be loaded using AutoTokenizer and AutoModel:
|
||||
import torch
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
|
||||
model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")
|
||||
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
|
||||
model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")
|
||||
|
||||
|
||||
The original code can be found `here <https://github.com/allegro/HerBERT>`__.
|
||||
|
||||
@@ -56,24 +56,24 @@ Tips:
|
||||
|
||||
.. code-block::
|
||||
|
||||
def normalize_bbox(bbox, width, height):
|
||||
return [
|
||||
int(1000 * (bbox[0] / width)),
|
||||
int(1000 * (bbox[1] / height)),
|
||||
int(1000 * (bbox[2] / width)),
|
||||
int(1000 * (bbox[3] / height)),
|
||||
]
|
||||
def normalize_bbox(bbox, width, height):
|
||||
return [
|
||||
int(1000 * (bbox[0] / width)),
|
||||
int(1000 * (bbox[1] / height)),
|
||||
int(1000 * (bbox[2] / width)),
|
||||
int(1000 * (bbox[3] / height)),
|
||||
]
|
||||
|
||||
Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token
|
||||
occurs. Those can be obtained using the Python Imaging Library (PIL), for example, as follows:
|
||||
|
||||
.. code-block::
|
||||
|
||||
from PIL import Image
|
||||
from PIL import Image
|
||||
|
||||
image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
|
||||
image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
|
||||
|
||||
width, height = image.size
|
||||
width, height = image.size
|
||||
|
||||
- For a demo which shows how to fine-tune :class:`LayoutLMForTokenClassification` on the `FUNSD dataset
|
||||
<https://guillaumejaume.github.io/FUNSD/>`__ (a collection of annotated forms), see `this notebook
|
||||
|
||||
@@ -53,15 +53,15 @@ BERT-345M-uncased::
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip
|
||||
-O megatron_bert_345m_v0_1_uncased.zip
|
||||
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip
|
||||
-O megatron_bert_345m_v0_1_uncased.zip
|
||||
|
||||
BERT-345M-cased:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O
|
||||
megatron_bert_345m_v0_1_cased.zip
|
||||
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O
|
||||
megatron_bert_345m_v0_1_cased.zip
|
||||
|
||||
Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will
|
||||
easily be loaded by Hugging Face Transformers and our port of the BERT code.
|
||||
@@ -71,11 +71,11 @@ The following commands allow you to do the conversion. We assume that the folder
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip
|
||||
python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip
|
||||
python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip
|
||||
|
||||
The original code can be found `here <https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU
|
||||
and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel
|
||||
|
||||
@@ -51,8 +51,8 @@ Alternatively, you can directly download the checkpoints using::
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O
|
||||
megatron_gpt2_345m_v0_0.zip
|
||||
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O
|
||||
megatron_gpt2_345m_v0_0.zip
|
||||
|
||||
Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to convert it to a format that will easily
|
||||
be loaded by Hugging Face Transformers GPT2 implementation.
|
||||
@@ -62,7 +62,7 @@ The following command allows you to do the conversion. We assume that the folder
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip
|
||||
python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip
|
||||
|
||||
The original code can be found `here <https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU
|
||||
and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel
|
||||
|
||||
@@ -31,23 +31,23 @@ Example of use:
|
||||
|
||||
.. code-block::
|
||||
|
||||
import torch
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
import torch
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
|
||||
phobert = AutoModel.from_pretrained("vinai/phobert-base")
|
||||
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
|
||||
phobert = AutoModel.from_pretrained("vinai/phobert-base")
|
||||
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
|
||||
|
||||
# INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
|
||||
line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
|
||||
# INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
|
||||
line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
|
||||
|
||||
input_ids = torch.tensor([tokenizer.encode(line)])
|
||||
input_ids = torch.tensor([tokenizer.encode(line)])
|
||||
|
||||
with torch.no_grad():
|
||||
features = phobert(input_ids) # Models outputs are now tuples
|
||||
with torch.no_grad():
|
||||
features = phobert(input_ids) # Models outputs are now tuples
|
||||
|
||||
## With TensorFlow 2.0+:
|
||||
# from transformers import TFAutoModel
|
||||
# phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
|
||||
## With TensorFlow 2.0+:
|
||||
# from transformers import TFAutoModel
|
||||
# phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
|
||||
|
||||
|
||||
The original code can be found `here <https://github.com/VinAIResearch/PhoBERT>`__.
|
||||
|
||||
@@ -145,8 +145,8 @@ For training, the :class:`~transformers.ReformerModelWithLMHead` should be used
|
||||
|
||||
.. code-block::
|
||||
|
||||
input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
|
||||
loss = model(input_ids, labels=input_ids)[0]
|
||||
input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
|
||||
loss = model(input_ids, labels=input_ids)[0]
|
||||
|
||||
|
||||
ReformerConfig
|
||||
|
||||
@@ -73,10 +73,10 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash
|
||||
|
||||
.. code-block::
|
||||
|
||||
input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
|
||||
labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
|
||||
# the forward function automatically creates the correct decoder_input_ids
|
||||
loss = model(input_ids=input_ids, labels=labels).loss
|
||||
input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
|
||||
labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
|
||||
# the forward function automatically creates the correct decoder_input_ids
|
||||
loss = model(input_ids=input_ids, labels=labels).loss
|
||||
|
||||
- Supervised training
|
||||
|
||||
@@ -86,10 +86,10 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash
|
||||
|
||||
.. code-block::
|
||||
|
||||
input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
|
||||
labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
|
||||
# the forward function automatically creates the correct decoder_input_ids
|
||||
loss = model(input_ids=input_ids, labels=labels).loss
|
||||
input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
|
||||
labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
|
||||
# the forward function automatically creates the correct decoder_input_ids
|
||||
loss = model(input_ids=input_ids, labels=labels).loss
|
||||
|
||||
|
||||
T5Config
|
||||
|
||||
Reference in New Issue
Block a user