Fix typos in comments (#37694)
Signed-off-by: co63oc <co63oc@users.noreply.github.com>
This commit is contained in:
@@ -90,7 +90,7 @@ def summarize(run_dir, metrics, expand_metrics=False):
|
||||
|
||||
model = benchmark.config.backend["model"]
|
||||
|
||||
# Ths looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
|
||||
# This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
|
||||
# (we rely on the usage of hydra's `${hydra.job.override_dirname}`.)
|
||||
benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
|
||||
benchmark_name = str(Path(benchmark_name).parts[-1])
|
||||
|
||||
@@ -293,7 +293,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
|
||||
max_cache_len=seq_length + 128,
|
||||
)
|
||||
|
||||
# 3nd call
|
||||
# 3rd call
|
||||
start = perf_counter()
|
||||
output = model.generate(**inputs, past_key_values=past_key_values)
|
||||
end = perf_counter()
|
||||
|
||||
@@ -37,15 +37,15 @@ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None)
|
||||
|
||||
Args:
|
||||
audio (`str` or `np.ndarray`):
|
||||
The audio to be laoded to the numpy array format.
|
||||
The audio to be loaded to the numpy array format.
|
||||
sampling_rate (`int`, *optional*, defaults to 16000):
|
||||
The samlping rate to be used when loading the audio. It should be same as the
|
||||
The sampling rate to be used when loading the audio. It should be same as the
|
||||
sampling rate the model you will be using further was trained with.
|
||||
timeout (`float`, *optional*):
|
||||
The timeout value in seconds for the URL request.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`: A numpy artay representing the audio.
|
||||
`np.ndarray`: A numpy array representing the audio.
|
||||
"""
|
||||
requires_backends(load_audio, ["librosa"])
|
||||
|
||||
|
||||
@@ -1919,7 +1919,7 @@ class HybridChunkedCache(Cache):
|
||||
full_key_states = torch.cat((k_out[:, :, 1:, :], key_states), dim=-2)
|
||||
full_value_states = torch.cat((v_out[:, :, 1:, :], value_states), dim=-2)
|
||||
# Fast decoding path -> here as the effective size is still sliding window, it is extremely important
|
||||
# to return `self.key_cache[layer_idx]` and `self.value_cache[layer_idx]`, as they have the fixed adress
|
||||
# to return `self.key_cache[layer_idx]` and `self.value_cache[layer_idx]`, as they have the fixed address
|
||||
# in memory (the values are the same as the full states, but not the address!!)
|
||||
if key_states.shape[-2] == 1:
|
||||
self.key_cache[layer_idx].copy_(full_key_states)
|
||||
@@ -2031,7 +2031,7 @@ class OffloadedHybridCache(HybridChunkedCache):
|
||||
self.active_device_layer = 0
|
||||
|
||||
def initialise_cache_layer(self, layer_idx, key_states):
|
||||
"""Overriden to use the correct device if offloaded layer (and pin memory)."""
|
||||
"""Overridden to use the correct device if offloaded layer (and pin memory)."""
|
||||
if len(self.key_cache) > layer_idx:
|
||||
return
|
||||
|
||||
@@ -2243,7 +2243,7 @@ class OffloadedStaticCache(StaticCache):
|
||||
The device to offload to. Defaults to CPU.
|
||||
layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*):
|
||||
Mapping between the layers and its device. This is required when you are manually initializing the cache
|
||||
and the model is splitted between differents gpus. You can know which layers mapped to which device by
|
||||
and the model is split between different gpus. You can know which layers mapped to which device by
|
||||
checking the associated device_map: `model.hf_device_map`.
|
||||
|
||||
Example:
|
||||
|
||||
@@ -80,7 +80,7 @@ class DebugUnderflowOverflow:
|
||||
You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value was
|
||||
around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
|
||||
renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
|
||||
64K, and we get an overlow.
|
||||
64K, and we get an overflow.
|
||||
|
||||
As you can see it's the previous frames that we need to look into when the numbers start going into very large for
|
||||
fp16 numbers.
|
||||
|
||||
@@ -848,7 +848,7 @@ def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatch
|
||||
f"Unable to load weights from TF checkpoint file for '{resolved_archive_file}' "
|
||||
f"at '{resolved_archive_file}'. "
|
||||
"If you tried to load a TF model from a sharded checkpoint, you should try converting the model "
|
||||
"by loading it in pytorch and saving it locally. A convertion script should be released soon."
|
||||
"by loading it in pytorch and saving it locally. A conversion script should be released soon."
|
||||
)
|
||||
|
||||
|
||||
@@ -980,10 +980,10 @@ def load_tf_weights_from_h5(model, resolved_archive_file, ignore_mismatched_size
|
||||
for symbolic_weight in symbolic_weights:
|
||||
# TF names always start with the model name so we ignore it
|
||||
if _prefix is not None:
|
||||
delimeter = len(_prefix.split("/"))
|
||||
delimiter = len(_prefix.split("/"))
|
||||
symbolic_weight_name = "/".join(
|
||||
symbolic_weight.name.split("/")[:delimeter]
|
||||
+ symbolic_weight.name.split("/")[delimeter + 1 :]
|
||||
symbolic_weight.name.split("/")[:delimiter]
|
||||
+ symbolic_weight.name.split("/")[delimiter + 1 :]
|
||||
)
|
||||
else:
|
||||
symbolic_weight_name = "/".join(symbolic_weight.name.split("/")[1:])
|
||||
@@ -2042,7 +2042,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
|
||||
return model_embeds
|
||||
|
||||
def _get_word_embedding_weight(model, embedding_layer):
|
||||
# TODO (joao): flagged for delection due to embeddings refactor
|
||||
# TODO (joao): flagged for deletion due to embeddings refactor
|
||||
|
||||
# If the variable holds the weights themselves, return them
|
||||
if isinstance(embedding_layer, tf.Tensor):
|
||||
@@ -3312,7 +3312,7 @@ class TFSharedEmbeddings(keras.layers.Layer):
|
||||
Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
|
||||
"""
|
||||
|
||||
# TODO (joao): flagged for delection due to embeddings refactor
|
||||
# TODO (joao): flagged for deletion due to embeddings refactor
|
||||
|
||||
def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optional[float] = None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -856,7 +856,7 @@ def _get_resolved_checkpoint_files(
|
||||
) -> Tuple[Optional[List[str]], Optional[Dict]]:
|
||||
"""Get all the checkpoint filenames based on `pretrained_model_name_or_path`, and optional metadata if the
|
||||
checkpoints are sharded.
|
||||
This function will download the data if necesary.
|
||||
This function will download the data if necessary.
|
||||
"""
|
||||
is_sharded = False
|
||||
|
||||
@@ -3296,7 +3296,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
|
||||
save_peft_format (`bool`, *optional*, defaults to `True`):
|
||||
For backward compatibility with PEFT library, in case adapter weights are attached to the model, all
|
||||
keys of the state dict of adapters needs to be pre-pended with `base_model.model`. Advanced users can
|
||||
keys of the state dict of adapters need to be prepended with `base_model.model`. Advanced users can
|
||||
disable this behaviour by setting `save_peft_format` to `False`.
|
||||
kwargs (`Dict[str, Any]`, *optional*):
|
||||
Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
|
||||
@@ -3400,7 +3400,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMi
|
||||
|
||||
if save_peft_format:
|
||||
logger.info(
|
||||
"To match the expected format of the PEFT library, all keys of the state dict of adapters will be pre-pended with `base_model.model`."
|
||||
"To match the expected format of the PEFT library, all keys of the state dict of adapters will be prepended with `base_model.model`."
|
||||
)
|
||||
peft_state_dict = {}
|
||||
for key, value in state_dict.items():
|
||||
@@ -5887,14 +5887,14 @@ def is_accelerator_device(device: Union[str, int, torch.device]) -> bool:
|
||||
def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict, hf_quantizer: Optional[HfQuantizer]):
|
||||
"""This function warms up the caching allocator based on the size of the model tensors that will reside on each
|
||||
device. It allows to have one large call to Malloc, instead of recursively calling it later when loading
|
||||
the model, which is actually the loading speed botteneck.
|
||||
the model, which is actually the loading speed bottleneck.
|
||||
Calling this function allows to cut the model loading time by a very large margin.
|
||||
|
||||
A few facts related to loading speed (taking into account the use of this function):
|
||||
- When loading a model the first time, it is usually slower than the subsequent times, because the OS is very likely
|
||||
to cache the different state dicts (if enough ressources/RAM are available)
|
||||
to cache the different state dicts (if enough resources/RAM are available)
|
||||
- Trying to force the OS to cache the files in advance (by e.g. accessing a small portion of them) is really hard,
|
||||
and not a good idea in general as this is low level OS optimizations that depend on ressource usage anyway
|
||||
and not a good idea in general as this is low level OS optimizations that depend on resource usage anyway
|
||||
- As of 18/03/2025, loading a Llama 70B model with TP takes ~1 min without file cache, and ~13s with full file cache.
|
||||
The baseline, i.e. only loading the tensor shards on device and adjusting dtype (i.e. copying them) is ~5s with full cache.
|
||||
These numbers are reported for TP on 4 H100 GPUs.
|
||||
@@ -5935,7 +5935,7 @@ def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict,
|
||||
index = device.index if device.index is not None else torch.cuda.current_device()
|
||||
device_memory = torch.cuda.mem_get_info(index)[0]
|
||||
# Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more
|
||||
# than that amount might sometimes lead to unecesary cuda OOM, if the last parameter to be loaded on the device is large,
|
||||
# than that amount might sometimes lead to unnecessary cuda OOM, if the last parameter to be loaded on the device is large,
|
||||
# and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all
|
||||
# the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead
|
||||
# to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details.
|
||||
|
||||
@@ -56,24 +56,24 @@ class ConvNextImageProcessor(BaseImageProcessor):
|
||||
|
||||
Args:
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overriden
|
||||
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden
|
||||
by `do_resize` in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 384}`):
|
||||
Resolution of the output image after `resize` is applied. If `size["shortest_edge"]` >= 384, the image is
|
||||
resized to `(size["shortest_edge"], size["shortest_edge"])`. Otherwise, the smaller edge of the image will
|
||||
be matched to `int(size["shortest_edge"]/crop_pct)`, after which the image is cropped to
|
||||
`(size["shortest_edge"], size["shortest_edge"])`. Only has an effect if `do_resize` is set to `True`. Can
|
||||
be overriden by `size` in the `preprocess` method.
|
||||
be overridden by `size` in the `preprocess` method.
|
||||
crop_pct (`float` *optional*, defaults to 224 / 256):
|
||||
Percentage of the image to crop. Only has an effect if `do_resize` is `True` and size < 384. Can be
|
||||
overriden by `crop_pct` in the `preprocess` method.
|
||||
overridden by `crop_pct` in the `preprocess` method.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
|
||||
Resampling filter to use if resizing the image. Can be overriden by `resample` in the `preprocess` method.
|
||||
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overriden by `do_rescale` in
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
|
||||
the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overriden by `rescale_factor` in the `preprocess`
|
||||
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
|
||||
method.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
|
||||
|
||||
@@ -91,7 +91,7 @@ class Data2VecAudioConfig(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
Length of vector span along the time axis.
|
||||
@@ -102,7 +102,7 @@ class Data2VecAudioConfig(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -79,7 +79,7 @@ class TvltImageProcessor(BaseImageProcessor):
|
||||
`do_resize` parameter in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
|
||||
Size of the output image after resizing. The shortest edge of the image will be resized to
|
||||
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overriden by
|
||||
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
|
||||
`size` in the `preprocess` method.
|
||||
patch_size (`List[int]` *optional*, defaults to [16,16]):
|
||||
The patch size of image patch embedding.
|
||||
|
||||
@@ -107,7 +107,7 @@ class HubertConfig(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
@@ -119,7 +119,7 @@ class HubertConfig(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -377,7 +377,7 @@ class IdeficsProcessor(ProcessorMixin):
|
||||
add_eos_token = output_kwargs["text_kwargs"].pop("add_eos_token", False)
|
||||
add_end_of_utterance_token = output_kwargs["text_kwargs"].pop("add_end_of_utterance_token", None)
|
||||
|
||||
# if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
|
||||
# if the value isn't overridden by the user, check if the tokenizer was trained with this token and then use it
|
||||
if add_end_of_utterance_token is None:
|
||||
add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
|
||||
# turn non-batched prompts into batched
|
||||
|
||||
@@ -66,7 +66,7 @@ class ImageGPTImageProcessor(BaseImageProcessor):
|
||||
|
||||
Args:
|
||||
clusters (`np.ndarray` or `List[List[int]]`, *optional*):
|
||||
The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overriden by `clusters`
|
||||
The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
|
||||
in `preprocess`.
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the image's dimensions to `(size["height"], size["width"])`. Can be overridden by
|
||||
|
||||
@@ -214,21 +214,21 @@ class Owlv2ImageProcessor(BaseImageProcessor):
|
||||
|
||||
Args:
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overriden by `do_rescale` in
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
|
||||
the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overriden by `rescale_factor` in the `preprocess`
|
||||
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
|
||||
method.
|
||||
do_pad (`bool`, *optional*, defaults to `True`):
|
||||
Whether to pad the image to a square with gray pixels on the bottom and the right. Can be overriden by
|
||||
Whether to pad the image to a square with gray pixels on the bottom and the right. Can be overridden by
|
||||
`do_pad` in the `preprocess` method.
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overriden
|
||||
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden
|
||||
by `do_resize` in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"height": 960, "width": 960}`):
|
||||
Size to resize the image to. Can be overriden by `size` in the `preprocess` method.
|
||||
Size to resize the image to. Can be overridden by `size` in the `preprocess` method.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
|
||||
Resampling method to use if resizing the image. Can be overriden by `resample` in the `preprocess` method.
|
||||
Resampling method to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
|
||||
method.
|
||||
|
||||
@@ -100,7 +100,7 @@ class SEWConfig(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
@@ -112,7 +112,7 @@ class SEWConfig(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -113,7 +113,7 @@ class SEWDConfig(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
@@ -125,7 +125,7 @@ class SEWDConfig(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -335,7 +335,7 @@ class SmolVLMProcessor(ProcessorMixin):
|
||||
Used within `apply_chat_template` when a model has special way to process conversation history. For example,
|
||||
video models might want to specify in the prompt the duration of video or which frame indices at which timestamps
|
||||
were sampled. This information cannot be accessed before the video is loaded.
|
||||
For most models it is a no-op, must be overriden by model processors which require special processing.
|
||||
For most models it is a no-op, must be overridden by model processors which require special processing.
|
||||
Args:
|
||||
conversation (`List[Dict, str, str]`):
|
||||
The conversation to process. Always comes in batched format.
|
||||
|
||||
@@ -109,7 +109,7 @@ class SpeechT5Config(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
@@ -121,7 +121,7 @@ class SpeechT5Config(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -139,21 +139,21 @@ class SuperGlueImageProcessor(BaseImageProcessor):
|
||||
|
||||
Args:
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overriden
|
||||
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden
|
||||
by `do_resize` in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"height": 480, "width": 640}`):
|
||||
Resolution of the output image after `resize` is applied. Only has an effect if `do_resize` is set to
|
||||
`True`. Can be overriden by `size` in the `preprocess` method.
|
||||
`True`. Can be overridden by `size` in the `preprocess` method.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
|
||||
Resampling filter to use if resizing the image. Can be overriden by `resample` in the `preprocess` method.
|
||||
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overriden by `do_rescale` in
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
|
||||
the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overriden by `rescale_factor` in the `preprocess`
|
||||
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
|
||||
method.
|
||||
do_grayscale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to convert the image to grayscale. Can be overriden by `do_grayscale` in the `preprocess` method.
|
||||
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
|
||||
@@ -102,19 +102,19 @@ class SuperPointImageProcessor(BaseImageProcessor):
|
||||
|
||||
Args:
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overriden
|
||||
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden
|
||||
by `do_resize` in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"height": 480, "width": 640}`):
|
||||
Resolution of the output image after `resize` is applied. Only has an effect if `do_resize` is set to
|
||||
`True`. Can be overriden by `size` in the `preprocess` method.
|
||||
`True`. Can be overridden by `size` in the `preprocess` method.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overriden by `do_rescale` in
|
||||
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
|
||||
the `preprocess` method.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overriden by `rescale_factor` in the `preprocess`
|
||||
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
|
||||
method.
|
||||
do_grayscale (`bool`, *optional*, defaults to `False`):
|
||||
Whether to convert the image to grayscale. Can be overriden by `do_grayscale` in the `preprocess` method.
|
||||
Whether to convert the image to grayscale. Can be overridden by `do_grayscale` in the `preprocess` method.
|
||||
"""
|
||||
|
||||
model_input_names = ["pixel_values"]
|
||||
|
||||
@@ -91,7 +91,7 @@ class TvpImageProcessor(BaseImageProcessor):
|
||||
`do_resize` parameter in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"longest_edge": 448}`):
|
||||
Size of the output image after resizing. The longest edge of the image will be resized to
|
||||
`size["longest_edge"]` while maintaining the aspect ratio of the original image. Can be overriden by
|
||||
`size["longest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
|
||||
`size` in the `preprocess` method.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
|
||||
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
|
||||
|
||||
@@ -106,7 +106,7 @@ class UniSpeechConfig(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
@@ -118,7 +118,7 @@ class UniSpeechConfig(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -107,7 +107,7 @@ class UniSpeechSatConfig(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
@@ -119,7 +119,7 @@ class UniSpeechSatConfig(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -72,7 +72,7 @@ class VideoMAEImageProcessor(BaseImageProcessor):
|
||||
`do_resize` parameter in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
|
||||
Size of the output image after resizing. The shortest edge of the image will be resized to
|
||||
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overriden by
|
||||
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
|
||||
`size` in the `preprocess` method.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
|
||||
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
|
||||
|
||||
@@ -334,11 +334,11 @@ class VitPoseImageProcessor(BaseImageProcessor):
|
||||
Whether to apply an affine transformation to the input images.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 192}`):
|
||||
Resolution of the image after `affine_transform` is applied. Only has an effect if `do_affine_transform` is set to `True`. Can
|
||||
be overriden by `size` in the `preprocess` method.
|
||||
be overridden by `size` in the `preprocess` method.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.).
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image. Can be overriden by `rescale_factor` in the `preprocess`
|
||||
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
|
||||
method.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to normalize the input with mean and standard deviation.
|
||||
|
||||
@@ -73,7 +73,7 @@ class VivitImageProcessor(BaseImageProcessor):
|
||||
`do_resize` parameter in the `preprocess` method.
|
||||
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 256}`):
|
||||
Size of the output image after resizing. The shortest edge of the image will be resized to
|
||||
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overriden by
|
||||
`size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
|
||||
`size` in the `preprocess` method.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
|
||||
Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
|
||||
@@ -91,7 +91,7 @@ class VivitImageProcessor(BaseImageProcessor):
|
||||
Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
|
||||
in the `preprocess` method.
|
||||
offset (`bool`, *optional*, defaults to `True`):
|
||||
Whether to scale the image in both negative and positive directions. Can be overriden by the `offset` in
|
||||
Whether to scale the image in both negative and positive directions. Can be overridden by the `offset` in
|
||||
the `preprocess` method.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
|
||||
|
||||
@@ -106,7 +106,7 @@ class Wav2Vec2Config(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
@@ -118,7 +118,7 @@ class Wav2Vec2Config(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -1630,7 +1630,7 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel):
|
||||
state_dict = {k: v.to(adapter_weights[k]) for k, v in state_dict.items()}
|
||||
self.load_state_dict(state_dict, strict=False)
|
||||
|
||||
# set target language corectly
|
||||
# set target language correctly
|
||||
self.target_lang = target_lang
|
||||
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ class Wav2Vec2CTCTokenizerOutput(ModelOutput):
|
||||
Decoded logits in text form. Usually the speech transcription.
|
||||
char_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
|
||||
Offsets of the decoded characters. In combination with sampling rate and model downsampling rate char
|
||||
offsets can be used to compute time stamps for each charater. Total logit score of the beam associated with
|
||||
offsets can be used to compute time stamps for each character. Total logit score of the beam associated with
|
||||
produced text.
|
||||
word_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
|
||||
Offsets of the decoded words. In combination with sampling rate and model downsampling rate word offsets
|
||||
|
||||
@@ -76,7 +76,7 @@ class Wav2Vec2BertConfig(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
@@ -88,7 +88,7 @@ class Wav2Vec2BertConfig(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -103,7 +103,7 @@ class Wav2Vec2ConformerConfig(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
@@ -115,7 +115,7 @@ class Wav2Vec2ConformerConfig(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -146,7 +146,7 @@ class WhisperConfig(PretrainedConfig):
|
||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
|
||||
reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
|
||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment == True`.
|
||||
mask_time_length (`int`, *optional*, defaults to 10):
|
||||
@@ -158,7 +158,7 @@ class WhisperConfig(PretrainedConfig):
|
||||
mask_feature_prob (`float`, *optional*, defaults to 0.0):
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
|
||||
the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
|
||||
may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
|
||||
True`.
|
||||
|
||||
@@ -555,7 +555,7 @@ class PipelineDataFormat:
|
||||
|
||||
if input_path is not None:
|
||||
if not exists(abspath(self.input_path)):
|
||||
raise OSError(f"{self.input_path} doesnt exist on disk")
|
||||
raise OSError(f"{self.input_path} doesn't exist on disk")
|
||||
|
||||
@abstractmethod
|
||||
def __iter__(self):
|
||||
|
||||
@@ -761,7 +761,7 @@ class ProcessorMixin(PushToHubMixin):
|
||||
resolved_additional_chat_template_files = {}
|
||||
if os.path.isfile(pretrained_model_name_or_path):
|
||||
resolved_processor_file = pretrained_model_name_or_path
|
||||
# cant't load chat-template when given a file as pretrained_model_name_or_path
|
||||
# can't load chat-template when given a file as pretrained_model_name_or_path
|
||||
resolved_chat_template_file = None
|
||||
resolved_raw_chat_template_file = None
|
||||
is_local = True
|
||||
|
||||
@@ -2664,7 +2664,7 @@ def hub_retry(max_attempts: int = 5, wait_before_retry: Optional[float] = 2):
|
||||
def run_first(test_case):
|
||||
"""
|
||||
Decorator marking a test with order(1). When pytest-order plugin is installed, tests marked with this decorator
|
||||
are garanteed to run first.
|
||||
are guaranteed to run first.
|
||||
|
||||
This is especially useful in some test settings like on a Gaudi instance where a Gaudi device can only be used by a
|
||||
single process at a time. So we make sure all tests that run in a subprocess are launched first, to avoid device
|
||||
|
||||
Reference in New Issue
Block a user