Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4e9f6fc67c | ||
|
|
4277b3dd46 | ||
|
|
5e3b19a805 | ||
|
|
62d9baa53c | ||
|
|
68287689f2 | ||
|
|
1e39734c4b |
2
setup.py
2
setup.py
@@ -418,7 +418,7 @@ install_requires = [
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="transformers",
|
name="transformers",
|
||||||
version="4.27.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
version="4.27.4", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||||
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
|
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
|
||||||
author_email="transformers@huggingface.co",
|
author_email="transformers@huggingface.co",
|
||||||
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
|
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
|
||||||
|
|||||||
@@ -18,7 +18,7 @@
|
|||||||
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
|
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
|
||||||
# in the namespace without actually importing anything (and especially none of the backends).
|
# in the namespace without actually importing anything (and especially none of the backends).
|
||||||
|
|
||||||
__version__ = "4.27.1"
|
__version__ = "4.27.4"
|
||||||
|
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
|||||||
@@ -2563,7 +2563,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
|||||||
elif device_map in ["balanced", "balanced_low_0"] and get_balanced_memory is None:
|
elif device_map in ["balanced", "balanced_low_0"] and get_balanced_memory is None:
|
||||||
raise ValueError(f"`device_map={device_map}` requires a source install of Accelerate.")
|
raise ValueError(f"`device_map={device_map}` requires a source install of Accelerate.")
|
||||||
|
|
||||||
kwargs = {"no_split_module_classes": no_split_modules, "max_memory": max_memory}
|
kwargs = {"no_split_module_classes": no_split_modules}
|
||||||
if "special_dtypes" in inspect.signature(infer_auto_device_map).parameters:
|
if "special_dtypes" in inspect.signature(infer_auto_device_map).parameters:
|
||||||
kwargs["special_dtypes"] = special_dtypes
|
kwargs["special_dtypes"] = special_dtypes
|
||||||
elif len(special_dtypes) > 0:
|
elif len(special_dtypes) > 0:
|
||||||
@@ -2576,8 +2576,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
|||||||
model,
|
model,
|
||||||
dtype=torch_dtype,
|
dtype=torch_dtype,
|
||||||
low_zero=(device_map == "balanced_low_0"),
|
low_zero=(device_map == "balanced_low_0"),
|
||||||
|
max_memory=max_memory,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
kwargs["max_memory"] = max_memory
|
||||||
# Make sure tied weights are tied before creating the device map.
|
# Make sure tied weights are tied before creating the device map.
|
||||||
model.tie_weights()
|
model.tie_weights()
|
||||||
device_map = infer_auto_device_map(model, dtype=torch_dtype if not load_in_8bit else torch.int8, **kwargs)
|
device_map = infer_auto_device_map(model, dtype=torch_dtype if not load_in_8bit else torch.int8, **kwargs)
|
||||||
|
|||||||
@@ -172,7 +172,8 @@ class MultiHeadAttention(nn.Module):
|
|||||||
k, v = cache[self.layer_id]
|
k, v = cache[self.layer_id]
|
||||||
cache[self.layer_id] = (k, v)
|
cache[self.layer_id] = (k, v)
|
||||||
|
|
||||||
scores = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(dim_per_head) # (bs, n_heads, qlen, klen)
|
q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head)
|
||||||
|
scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen)
|
||||||
mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen)
|
mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen)
|
||||||
scores.masked_fill_(mask, torch.finfo(scores.dtype).min) # (bs, n_heads, qlen, klen)
|
scores.masked_fill_(mask, torch.finfo(scores.dtype).min) # (bs, n_heads, qlen, klen)
|
||||||
|
|
||||||
|
|||||||
@@ -176,7 +176,8 @@ class MultiHeadAttention(nn.Module):
|
|||||||
k, v = cache[self.layer_id]
|
k, v = cache[self.layer_id]
|
||||||
cache[self.layer_id] = (k, v)
|
cache[self.layer_id] = (k, v)
|
||||||
|
|
||||||
scores = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(dim_per_head) # (bs, n_heads, qlen, klen)
|
q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head)
|
||||||
|
scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen)
|
||||||
mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen)
|
mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen)
|
||||||
scores.masked_fill_(mask, torch.finfo(scores.dtype).min) # (bs, n_heads, qlen, klen)
|
scores.masked_fill_(mask, torch.finfo(scores.dtype).min) # (bs, n_heads, qlen, klen)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user