Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ae54e3c3b1 | ||
|
|
865bb4e936 | ||
|
|
02c3f4145e | ||
|
|
1139260900 | ||
|
|
38620e1839 |
5
setup.py
5
setup.py
@@ -413,7 +413,7 @@ install_requires = [
|
||||
|
||||
setup(
|
||||
name="transformers",
|
||||
version="4.26.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
version="4.26.1", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
|
||||
author_email="transformers@huggingface.co",
|
||||
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
|
||||
@@ -424,7 +424,8 @@ setup(
|
||||
url="https://github.com/huggingface/transformers",
|
||||
package_dir={"": "src"},
|
||||
packages=find_packages("src"),
|
||||
package_data={"transformers": ["py.typed", "*.cu", "*.cpp", "*.cuh", "*.h"]},
|
||||
include_package_data=True,
|
||||
package_data={"transformers": ["*.cu", "*.cpp", "*.cuh", "*.h", "*.pyx"]},
|
||||
zip_safe=False,
|
||||
extras_require=extras,
|
||||
entry_points={"console_scripts": ["transformers-cli=transformers.commands.transformers_cli:main"]},
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
|
||||
# in the namespace without actually importing anything (and especially none of the backends).
|
||||
|
||||
__version__ = "4.26.0"
|
||||
__version__ = "4.26.1"
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
|
||||
@@ -276,6 +276,8 @@ class LongT5DenseActDense(nn.Module):
|
||||
hidden_states = self.wi(hidden_states)
|
||||
hidden_states = self.act(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
|
||||
hidden_states = hidden_states.to(self.wo.weight.dtype)
|
||||
hidden_states = self.wo(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
@@ -146,6 +146,8 @@ class MT5DenseActDense(nn.Module):
|
||||
hidden_states = self.wi(hidden_states)
|
||||
hidden_states = self.act(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
|
||||
hidden_states = hidden_states.to(self.wo.weight.dtype)
|
||||
hidden_states = self.wo(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
@@ -168,7 +170,8 @@ class MT5DenseGatedActDense(nn.Module):
|
||||
|
||||
# To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
|
||||
# See https://github.com/huggingface/transformers/issues/20287
|
||||
if hidden_states.dtype != self.wo.weight.dtype:
|
||||
# we also make sure the weights are not in `int8` in case users force `_keep_in_fp32_modules` to be `None`
|
||||
if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
|
||||
hidden_states = hidden_states.to(self.wo.weight.dtype)
|
||||
|
||||
hidden_states = self.wo(hidden_states)
|
||||
|
||||
@@ -273,6 +273,8 @@ class SwitchTransformersDenseActDense(nn.Module):
|
||||
hidden_states = self.wi(hidden_states)
|
||||
hidden_states = self.act(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
|
||||
hidden_states = hidden_states.to(self.wo.weight.dtype)
|
||||
hidden_states = self.wo(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
|
||||
@@ -289,6 +289,8 @@ class T5DenseActDense(nn.Module):
|
||||
hidden_states = self.wi(hidden_states)
|
||||
hidden_states = self.act(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
|
||||
hidden_states = hidden_states.to(self.wo.weight.dtype)
|
||||
hidden_states = self.wo(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
@@ -310,7 +312,8 @@ class T5DenseGatedActDense(nn.Module):
|
||||
|
||||
# To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
|
||||
# See https://github.com/huggingface/transformers/issues/20287
|
||||
if hidden_states.dtype != self.wo.weight.dtype:
|
||||
# we also make sure the weights are not in `int8` in case users force `_keep_in_fp32_modules` to be `None`
|
||||
if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
|
||||
hidden_states = hidden_states.to(self.wo.weight.dtype)
|
||||
|
||||
hidden_states = self.wo(hidden_states)
|
||||
|
||||
@@ -646,9 +646,9 @@ def find_executable_batch_size(
|
||||
|
||||
if auto_find_batch_size:
|
||||
requires_backends(find_executable_batch_size, "accelerate")
|
||||
import accelerate.memory_utils as mem_utils
|
||||
from accelerate.utils import find_executable_batch_size as accelerate_find_executable_batch_size
|
||||
|
||||
return mem_utils.find_executable_batch_size(function=function, starting_batch_size=starting_batch_size)
|
||||
return accelerate_find_executable_batch_size(function=function, starting_batch_size=starting_batch_size)
|
||||
|
||||
return functools.partial(function, batch_size=starting_batch_size)
|
||||
|
||||
|
||||
@@ -163,6 +163,70 @@ class MixedInt8Test(BaseMixedInt8Test):
|
||||
self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)
|
||||
|
||||
|
||||
@require_bitsandbytes
@require_accelerate
@require_torch
@require_torch_gpu
@slow
class MixedInt8T5Test(unittest.TestCase):
    r"""
    Smoke tests checking that T5 checkpoints loaded with `load_in_8bit=True` can run `generate`, both with the
    default `_keep_in_fp32_modules` of `T5ForConditionalGeneration` and with that attribute forced to `None`.
    """

    @classmethod
    def setUpClass(cls):
        cls.model_name = "t5-small"
        cls.dense_act_model_name = "google/flan-t5-small"  # flan-t5 uses dense-act instead of dense-relu-dense
        # The `t5-small` tokenizer is shared by both checkpoints exercised below.
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.input_text = "Translate in German: Hello, my dog is cute"

    def tearDown(self):
        r"""
        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
        """
        gc.collect()
        torch.cuda.empty_cache()

    def _check_8bit_generation(self, checkpoint):
        r"""
        Load `checkpoint` in 8-bit with `device_map="auto"` and run a single `generate` call on `self.input_text`.
        The test passes as long as no exception is raised; the generated ids are not inspected.
        """
        from transformers import T5ForConditionalGeneration

        model = T5ForConditionalGeneration.from_pretrained(checkpoint, load_in_8bit=True, device_map="auto")
        # Inputs are moved to device 0, as in the other tests of this class.
        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
        _ = model.generate(**encoded_input)

    def test_inference_without_keep_in_fp32(self):
        r"""
        Test whether inference still works in `int8` when `_keep_in_fp32_modules` is forced to `None`, i.e. when no
        module is explicitly requested to be kept in `fp32`.
        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseReluDense`. We need to test
        both cases.
        """
        from transformers import T5ForConditionalGeneration

        # `_keep_in_fp32_modules` is a class attribute: remember its value and restore it afterwards so the
        # override does not leak into the tests that run later in the same process.
        original_keep_in_fp32_modules = T5ForConditionalGeneration._keep_in_fp32_modules
        T5ForConditionalGeneration._keep_in_fp32_modules = None
        try:
            # test with `t5-small`
            self._check_8bit_generation(self.model_name)

            # test with `flan-t5-small`
            self._check_8bit_generation(self.dense_act_model_name)
        finally:
            T5ForConditionalGeneration._keep_in_fp32_modules = original_keep_in_fp32_modules

    def test_inference_with_keep_in_fp32(self):
        r"""
        Test whether it is possible to mix both `int8` and `fp32` weights when using `keep_in_fp32_modules` correctly.
        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseReluDense`. We need to test
        both cases.
        """
        # test with `t5-small`
        self._check_8bit_generation(self.model_name)

        # test with `flan-t5-small`
        self._check_8bit_generation(self.dense_act_model_name)
|
||||
|
||||
|
||||
class MixedInt8ModelClassesTest(BaseMixedInt8Test):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
|
||||
Reference in New Issue
Block a user