@@ -56,6 +56,8 @@ PreTrainedTokenizer
|
|||||||
:special-members: __call__
|
:special-members: __call__
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
.. automethod:: encode
|
||||||
|
|
||||||
|
|
||||||
PreTrainedTokenizerFast
|
PreTrainedTokenizerFast
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
@@ -64,6 +66,8 @@ PreTrainedTokenizerFast
|
|||||||
:special-members: __call__
|
:special-members: __call__
|
||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
.. automethod:: encode
|
||||||
|
|
||||||
|
|
||||||
BatchEncoding
|
BatchEncoding
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|||||||
@@ -364,28 +364,35 @@ DPR_ENCODERS_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
Indices can be obtained using :class:`~transformers.DPRTokenizer`. See
|
Indices can be obtained using :class:`~transformers.DPRTokenizer`. See
|
||||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||||
details. attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`,
|
details.
|
||||||
`optional`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0,
|
|
||||||
1]``:
|
`What are input IDs? <../glossary.html#input-ids>`__
|
||||||
|
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||||
|
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||||
|
|
||||||
- 1 for tokens that are **not masked**,
|
- 1 for tokens that are **not masked**,
|
||||||
- 0 for tokens that are **masked**.
|
- 0 for tokens that are **masked**.
|
||||||
|
|
||||||
`What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`torch.LongTensor` of
|
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||||
shape :obj:`(batch_size, sequence_length)`, `optional`): Segment token indices to indicate first and second
|
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||||
portions of the inputs. Indices are selected in ``[0, 1]``:
|
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
|
||||||
|
1]``:
|
||||||
|
|
||||||
- 0 corresponds to a `sentence A` token,
|
- 0 corresponds to a `sentence A` token,
|
||||||
- 1 corresponds to a `sentence B` token.
|
- 1 corresponds to a `sentence B` token.
|
||||||
|
|
||||||
`What are token type IDs? <../glossary.html#token-type-ids>`_ inputs_embeds (:obj:`torch.FloatTensor` of
|
`What are token type IDs? <../glossary.html#token-type-ids>`__
|
||||||
shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): Optionally, instead of passing
|
inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
|
||||||
:obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want
|
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
|
||||||
more control over how to convert :obj:`input_ids` indices into associated vectors than the model's internal
|
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
|
||||||
embedding lookup matrix. output_attentions (:obj:`bool`, `optional`): Whether or not to return the
|
vectors than the model's internal embedding lookup matrix.
|
||||||
attentions tensors of all attention layers. See ``attentions`` under returned tensors for more detail.
|
output_attentions (:obj:`bool`, `optional`):
|
||||||
output_hidden_states (:obj:`bool`, `optional`): Whether or not to return the hidden states of all layers.
|
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
|
||||||
See ``hidden_states`` under returned tensors for more detail. return_dict (:obj:`bool`, `optional`):
|
tensors for more detail.
|
||||||
|
output_hidden_states (:obj:`bool`, `optional`):
|
||||||
|
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
|
||||||
|
more detail.
|
||||||
|
return_dict (:obj:`bool`, `optional`):
|
||||||
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
|
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -403,6 +410,8 @@ DPR_READER_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
Indices can be obtained using :class:`~transformers.DPRReaderTokenizer`. See this class documentation for
|
Indices can be obtained using :class:`~transformers.DPRReaderTokenizer`. See this class documentation for
|
||||||
more details.
|
more details.
|
||||||
|
|
||||||
|
`What are input IDs? <../glossary.html#input-ids>`__
|
||||||
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(n_passages, sequence_length)`, `optional`):
|
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(n_passages, sequence_length)`, `optional`):
|
||||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||||
|
|
||||||
|
|||||||
@@ -486,15 +486,17 @@ TF_DPR_ENCODERS_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
(a) For sequence pairs (for a pair title+text for example):
|
(a) For sequence pairs (for a pair title+text for example):
|
||||||
|
|
||||||
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
::
|
||||||
|
|
||||||
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
|
||||||
|
token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
|
||||||
|
|
||||||
(b) For single sequences (for a question for example):
|
(b) For single sequences (for a question for example):
|
||||||
|
|
||||||
``tokens: [CLS] the dog is hairy . [SEP]``
|
::
|
||||||
|
|
||||||
``token_type_ids: 0 0 0 0 0 0 0``
|
tokens: [CLS] the dog is hairy . [SEP]
|
||||||
|
token_type_ids: 0 0 0 0 0 0 0
|
||||||
|
|
||||||
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
|
DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
|
||||||
rather than the left.
|
rather than the left.
|
||||||
@@ -502,6 +504,8 @@ TF_DPR_ENCODERS_INPUTS_DOCSTRING = r"""
|
|||||||
Indices can be obtained using :class:`~transformers.DPRTokenizer`. See
|
Indices can be obtained using :class:`~transformers.DPRTokenizer`. See
|
||||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||||
details.
|
details.
|
||||||
|
|
||||||
|
`What are input IDs? <../glossary.html#input-ids>`__
|
||||||
attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||||
|
|
||||||
|
|||||||
@@ -412,6 +412,8 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
|
|||||||
Indices of input sequence tokens in the vocabulary. :class:`~transformers.RagConfig`, used to initialize
|
Indices of input sequence tokens in the vocabulary. :class:`~transformers.RagConfig`, used to initialize
|
||||||
the model, specifies which generator to use, it also specifies a compatible generator tokenizer. Use that
|
the model, specifies which generator to use, it also specifies a compatible generator tokenizer. Use that
|
||||||
tokenizer class to obtain the indices.
|
tokenizer class to obtain the indices.
|
||||||
|
|
||||||
|
`What are input IDs? <../glossary.html#input-ids>`__
|
||||||
attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||||
|
|
||||||
|
|||||||
@@ -1041,6 +1041,8 @@ T5_INPUTS_DOCSTRING = r"""
|
|||||||
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
:meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
|
||||||
detail.
|
detail.
|
||||||
|
|
||||||
|
`What are input IDs? <../glossary.html#input-ids>`__
|
||||||
|
|
||||||
To know more on how to prepare :obj:`input_ids` for pretraining take a look at `T5 Training
|
To know more on how to prepare :obj:`input_ids` for pretraining take a look at `T5 Training
|
||||||
<./t5.html#training>`__.
|
<./t5.html#training>`__.
|
||||||
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||||
|
|||||||
@@ -929,7 +929,7 @@ T5_START_DOCSTRING = r"""
|
|||||||
|
|
||||||
T5_INPUTS_DOCSTRING = r"""
|
T5_INPUTS_DOCSTRING = r"""
|
||||||
Args:
|
Args:
|
||||||
inputs (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
|
input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
|
||||||
Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
|
Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
|
||||||
should be able to pad the inputs on the right or the left.
|
should be able to pad the inputs on the right or the left.
|
||||||
|
|
||||||
@@ -937,6 +937,8 @@ T5_INPUTS_DOCSTRING = r"""
|
|||||||
:func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
|
:func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
|
||||||
details.
|
details.
|
||||||
|
|
||||||
|
`What are input IDs? <../glossary.html#input-ids>`__
|
||||||
|
|
||||||
To know more on how to prepare :obj:`inputs` for pretraining take a look at `T5 Training
|
To know more on how to prepare :obj:`input_ids` for pretraining take a look at `T5 Training
|
||||||
<./t5.html#training>`__.
|
<./t5.html#training>`__.
|
||||||
decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||||
|
|||||||
@@ -135,6 +135,14 @@ class CodeStyler:
|
|||||||
"""
|
"""
|
||||||
return SpecialBlock.NOT_SPECIAL
|
return SpecialBlock.NOT_SPECIAL
|
||||||
|
|
||||||
|
def end_of_special_style(self, line):
|
||||||
|
"""
|
||||||
|
Sets back the `in_block` attribute to `NOT_SPECIAL`.
|
||||||
|
|
||||||
|
Useful for some docstrings where we may have to go back to `ARG_LIST` instead.
|
||||||
|
"""
|
||||||
|
self.in_block = SpecialBlock.NOT_SPECIAL
|
||||||
|
|
||||||
def style_paragraph(self, paragraph, max_len, no_style=False, min_indent=None):
|
def style_paragraph(self, paragraph, max_len, no_style=False, min_indent=None):
|
||||||
"""
|
"""
|
||||||
Style `paragraph` (a list of lines) by making sure no line goes over `max_len`, except if the `no_style` flag
|
Style `paragraph` (a list of lines) by making sure no line goes over `max_len`, except if the `no_style` flag
|
||||||
@@ -220,6 +228,7 @@ class CodeStyler:
|
|||||||
new_lines = []
|
new_lines = []
|
||||||
paragraph = []
|
paragraph = []
|
||||||
self.current_indent = ""
|
self.current_indent = ""
|
||||||
|
self.previous_indent = None
|
||||||
# If one of those is True, the paragraph should not be touched (code samples, lists...)
|
# If one of those is True, the paragraph should not be touched (code samples, lists...)
|
||||||
no_style = False
|
no_style = False
|
||||||
no_style_next = False
|
no_style_next = False
|
||||||
@@ -251,7 +260,7 @@ class CodeStyler:
|
|||||||
self.current_indent = indent
|
self.current_indent = indent
|
||||||
elif not indent.startswith(self.current_indent):
|
elif not indent.startswith(self.current_indent):
|
||||||
# If not, we are leaving the block when we unindent.
|
# If not, we are leaving the block when we unindent.
|
||||||
self.in_block = SpecialBlock.NOT_SPECIAL
|
self.end_of_special_style(paragraph[0])
|
||||||
|
|
||||||
if self.is_special_block(paragraph[0]):
|
if self.is_special_block(paragraph[0]):
|
||||||
# Maybe we are starting a special block.
|
# Maybe we are starting a special block.
|
||||||
@@ -326,6 +335,8 @@ class DocstringStyler(CodeStyler):
|
|||||||
|
|
||||||
def is_special_block(self, line):
|
def is_special_block(self, line):
|
||||||
if self.is_no_style_block(line):
|
if self.is_no_style_block(line):
|
||||||
|
if self.previous_indent is None and self.in_block == SpecialBlock.ARG_LIST:
|
||||||
|
self.previous_indent = self.current_indent
|
||||||
self.in_block = SpecialBlock.NO_STYLE
|
self.in_block = SpecialBlock.NO_STYLE
|
||||||
return True
|
return True
|
||||||
if _re_arg_def.search(line) is not None:
|
if _re_arg_def.search(line) is not None:
|
||||||
@@ -333,6 +344,14 @@ class DocstringStyler(CodeStyler):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def end_of_special_style(self, line):
|
||||||
|
if self.previous_indent is not None and line.startswith(self.previous_indent):
|
||||||
|
self.in_block = SpecialBlock.ARG_LIST
|
||||||
|
self.current_indent = self.previous_indent
|
||||||
|
else:
|
||||||
|
self.in_block = SpecialBlock.NOT_SPECIAL
|
||||||
|
self.previous_indent = None
|
||||||
|
|
||||||
def init_in_block(self, text):
|
def init_in_block(self, text):
|
||||||
lines = text.split("\n")
|
lines = text.split("\n")
|
||||||
while len(lines) > 0 and len(lines[0]) == 0:
|
while len(lines) > 0 and len(lines[0]) == 0:
|
||||||
|
|||||||
Reference in New Issue
Block a user