Fix inclusion of non py files in package (#21546)

* Fix inclusion of non py files in package
* No need for the `**`
Fix import in Accelerate for find_exec_bs (#21501)
2023-02-09 14:17:16 -05:00 · 2023-02-09 12:47:06 -05:00 · 2023-02-09 11:48:52 -05:00 · 2023-02-09 11:48:27 -05:00 · 2023-02-09 11:48:11 -05:00
8 changed files with 82 additions and 7 deletions
--- a/setup.py
+++ b/setup.py
@@ -413,7 +413,7 @@ install_requires = [

 setup(
    name="transformers",
-    version="4.26.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="4.26.1",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
    author_email="transformers@huggingface.co",
    description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",
@@ -424,7 +424,8 @@ setup(
    url="https://github.com/huggingface/transformers",
    package_dir={"": "src"},
    packages=find_packages("src"),
-    package_data={"transformers": ["py.typed", "*.cu", "*.cpp", "*.cuh", "*.h"]},
+    include_package_data=True,
+    package_data={"transformers": ["*.cu", "*.cpp", "*.cuh", "*.h", "*.pyx"]},
    zip_safe=False,
    extras_require=extras,
    entry_points={"console_scripts": ["transformers-cli=transformers.commands.transformers_cli:main"]},
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -22,7 +22,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).

-__version__ = "4.26.0"
+__version__ = "4.26.1"

 from typing import TYPE_CHECKING

--- a/src/transformers/models/longt5/modeling_longt5.py
+++ b/src/transformers/models/longt5/modeling_longt5.py
@@ -276,6 +276,8 @@ class LongT5DenseActDense(nn.Module):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)
+        if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
+            hidden_states = hidden_states.to(self.wo.weight.dtype)
        hidden_states = self.wo(hidden_states)
        return hidden_states

--- a/src/transformers/models/mt5/modeling_mt5.py
+++ b/src/transformers/models/mt5/modeling_mt5.py
@@ -146,6 +146,8 @@ class MT5DenseActDense(nn.Module):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)
+        if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
+            hidden_states = hidden_states.to(self.wo.weight.dtype)
        hidden_states = self.wo(hidden_states)
        return hidden_states

@@ -168,7 +170,8 @@ class MT5DenseGatedActDense(nn.Module):

        # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
        # See https://github.com/huggingface/transformers/issues/20287
-        if hidden_states.dtype != self.wo.weight.dtype:
+        # we also make sure the weights are not in `int8` in case users force `_keep_in_fp32_modules` to be `None`
+        if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
--- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py
+++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
@@ -273,6 +273,8 @@ class SwitchTransformersDenseActDense(nn.Module):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)
+        if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
+            hidden_states = hidden_states.to(self.wo.weight.dtype)
        hidden_states = self.wo(hidden_states)
        return hidden_states

--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -289,6 +289,8 @@ class T5DenseActDense(nn.Module):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)
+        if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
+            hidden_states = hidden_states.to(self.wo.weight.dtype)
        hidden_states = self.wo(hidden_states)
        return hidden_states

@@ -310,7 +312,8 @@ class T5DenseGatedActDense(nn.Module):

        # To make 8bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
        # See https://github.com/huggingface/transformers/issues/20287
-        if hidden_states.dtype != self.wo.weight.dtype:
+        # we also make sure the weights are not in `int8` in case users force `_keep_in_fp32_modules` to be `None`
+        if hidden_states.dtype != self.wo.weight.dtype and self.wo.weight.dtype != torch.int8:
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -646,9 +646,9 @@ def find_executable_batch_size(

    if auto_find_batch_size:
        requires_backends(find_executable_batch_size, "accelerate")
-        import accelerate.memory_utils as mem_utils
+        from accelerate.utils import find_executable_batch_size as accelerate_find_executable_batch_size

-        return mem_utils.find_executable_batch_size(function=function, starting_batch_size=starting_batch_size)
+        return accelerate_find_executable_batch_size(function=function, starting_batch_size=starting_batch_size)

    return functools.partial(function, batch_size=starting_batch_size)

--- a/tests/mixed_int8/test_mixed_int8.py
+++ b/tests/mixed_int8/test_mixed_int8.py
@@ -163,6 +163,70 @@ class MixedInt8Test(BaseMixedInt8Test):
        self.assertTrue(model.decoder.block[0].layer[2].DenseReluDense.wo.weight.dtype == torch.float32)


+@require_bitsandbytes
+@require_accelerate
+@require_torch
+@require_torch_gpu
+@slow
+class MixedInt8T5Test(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model_name = "t5-small"
+        cls.dense_act_model_name = "google/flan-t5-small"  # flan-t5 uses dense-act instead of dense-relu-dense
+        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
+        cls.input_text = "Translate in German: Hello, my dog is cute"
+
+    def tearDown(self):
+        r"""
+        TearDown function needs to be called at the end of each test to free the GPU memory and cache, also to
+        avoid unexpected behaviors. Please see: https://discuss.pytorch.org/t/how-can-we-release-gpu-memory-cache/14530/27
+        """
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_inference_without_keep_in_fp32(self):
+        r"""
+        Test that inference still works with `int8` weights when `_keep_in_fp32_modules` is forced to `None`.
+        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseActDense`. We need to test
+        both cases.
+        """
+        from transformers import T5ForConditionalGeneration
+
+        T5ForConditionalGeneration._keep_in_fp32_modules = None
+
+        # test with `t5-small`
+        model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto")
+        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
+        _ = model.generate(**encoded_input)
+
+        # test with `flan-t5-small`
+        model = T5ForConditionalGeneration.from_pretrained(
+            self.dense_act_model_name, load_in_8bit=True, device_map="auto"
+        )
+        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
+        _ = model.generate(**encoded_input)
+
+    def test_inference_with_keep_in_fp32(self):
+        r"""
+        Test whether it is possible to mix both `int8` and `fp32` weights when using `keep_in_fp32_modules` correctly.
+        `flan-t5-small` uses `T5DenseGatedActDense` whereas `t5-small` uses `T5DenseActDense`. We need to test
+        both cases.
+        """
+        from transformers import T5ForConditionalGeneration
+
+        # test with `t5-small`
+        model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto")
+        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
+        _ = model.generate(**encoded_input)
+
+        # test with `flan-t5-small`
+        model = T5ForConditionalGeneration.from_pretrained(
+            self.dense_act_model_name, load_in_8bit=True, device_map="auto"
+        )
+        encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(0)
+        _ = model.generate(**encoded_input)
+
+
 class MixedInt8ModelClassesTest(BaseMixedInt8Test):
    def setUp(self):
        super().setUp()
Author	SHA1	Message	Date
Sylvain Gugger	ae54e3c3b1	Fix inclusion of non py files in package (#21546 ) Some checks failed Release - Conda / build_and_package (push) Has been cancelled Details * Fix inclusion of non py files in package * No need for the **	2023-02-09 14:17:16 -05:00
Sylvain Gugger	865bb4e936	Fix import in Accelerate for find_exec_bs (#21501 )	2023-02-09 12:47:06 -05:00
Sylvain Gugger	02c3f4145e	Release: v4.26.1	2023-02-09 11:48:52 -05:00
Younes Belkada	1139260900	[`t5`] Fix T5 inference in `float16` + `bnb` error (#21281 ) * attempts to fix: - upcast input for `T5DenseActDense` - add the condition `self.wo.weight.dtype != torch.int8` - added tests on `test/mixed_int8` - `make fixup` * fix ci test	2023-02-09 11:48:27 -05:00
Sylvain Gugger	38620e1839	Add cPython files in build (#21372 )	2023-02-09 11:48:11 -05:00