Compare commits

...

2 Commits

Author SHA1 Message Date
Sylvain Gugger
4e9f6fc67c Patch release: v4.27.4
Some checks failed
Release - Conda / build_and_package (push) Has been cancelled
2023-03-29 11:42:29 -04:00
Sylvain Gugger
4277b3dd46 Revert "Error (also in original) model, scaling only q matrix not qk.T dot product (qk.T/sqrt(dim_per_head))" (#22444)
Revert "Error (also in original) model, scaling only q matrix not qk.T dot product (qk.T/sqrt(dim_per_head)) (#21627)"

This reverts commit bad8300837.
2023-03-29 11:42:09 -04:00
4 changed files with 6 additions and 4 deletions

View File

@@ -418,7 +418,7 @@ install_requires = [
setup(
name="transformers",
version="4.27.3", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.27.4", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
author_email="transformers@huggingface.co",
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",

View File

@@ -18,7 +18,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).
__version__ = "4.27.3"
__version__ = "4.27.4"
from typing import TYPE_CHECKING

View File

@@ -172,7 +172,8 @@ class MultiHeadAttention(nn.Module):
k, v = cache[self.layer_id]
cache[self.layer_id] = (k, v)
scores = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(dim_per_head) # (bs, n_heads, qlen, klen)
q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head)
scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen)
mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen)
scores.masked_fill_(mask, torch.finfo(scores.dtype).min) # (bs, n_heads, qlen, klen)

View File

@@ -176,7 +176,8 @@ class MultiHeadAttention(nn.Module):
k, v = cache[self.layer_id]
cache[self.layer_id] = (k, v)
scores = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(dim_per_head) # (bs, n_heads, qlen, klen)
q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head)
scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen)
mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen)
scores.masked_fill_(mask, torch.finfo(scores.dtype).min) # (bs, n_heads, qlen, klen)