From 75c666b4a8463b0b91a181b27f995fbd4bf9f933 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Fri, 18 Mar 2022 18:51:24 +0100
Subject: [PATCH] Aggressive PT/TF equivalence test on PT side (#16250)

* Aggressive PT/TF equivalence test on PT side

* Ugly fix for `TFTapasForQuestionAnswering`

* apply review suggestions

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 tests/test_modeling_common.py | 297 +++++++++++++++++++++++++---------
 1 file changed, 220 insertions(+), 77 deletions(-)

diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 32331df59c..d3e00d7fbf 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -1463,6 +1463,193 @@ class ModelTesterMixin:
 
         import transformers
 
+        def prepare_tf_inputs_from_pt_inputs(pt_inputs_dict):
+
+            tf_inputs_dict = {}
+            for key, tensor in pt_inputs_dict.items():
+                # skip key that does not exist in tf
+                if type(tensor) == bool:
+                    tf_inputs_dict[key] = tensor
+                elif key == "input_values":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
+                elif key == "pixel_values":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
+                elif key == "input_features":
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
+                # To deal with the edge cases from `TFTapasForQuestionAnswering`.
+                # PyTorch can deal with type casting automatically, but TensorFlow is more strict!
+                # TODO: find a clean/better way to deal with these extra keys that are not common.
+                elif key in ["float_answer", "numeric_values", "numeric_values_scale"]:
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
+                else:
+                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32)
+
+            return tf_inputs_dict
+
+        def check_outputs(tf_outputs, pt_outputs, model_class, names):
+            """
+            Args:
+                model_class: The class of the model that is currently testing. For example, `TFBertModel`,
+                    TFBertForMaskedLM`, `TFBertForSequenceClassification`, etc. Currently unused, but it could make
+                    debugging easier and faster.
+
+                names: A string, or a tuple of strings. These specify what tf_outputs/pt_outputs represent in the model outputs.
+                    Currently unused, but in the future, we could use this information to make the error message clearer
+                    by giving the name(s) of the output tensor(s) with large difference(s) between PT and TF.
+            """
+
+            # Some issue (`about past_key_values`) to solve (e.g. `TFPegasusForConditionalGeneration`) in a separate PR.
+            if names == "past_key_values":
+                return
+
+            # Allow `list` because `(TF)TransfoXLModelOutput.mems` is a list of tensors.
+            if type(tf_outputs) in [tuple, list]:
+                self.assertEqual(type(tf_outputs), type(pt_outputs))
+                self.assertEqual(len(tf_outputs), len(pt_outputs))
+                if type(names) == tuple:
+                    for tf_output, pt_output, name in zip(tf_outputs, pt_outputs, names):
+                        check_outputs(tf_output, pt_output, model_class, names=name)
+                elif type(names) == str:
+                    for idx, (tf_output, pt_output) in enumerate(zip(tf_outputs, pt_outputs)):
+                        check_outputs(tf_output, pt_output, model_class, names=f"{names}_{idx}")
+                else:
+                    raise ValueError(f"`names` should be a `tuple` or a string. Got {type(names)} instead.")
+            elif isinstance(tf_outputs, tf.Tensor):
+                self.assertTrue(isinstance(pt_outputs, torch.Tensor))
+
+                tf_outputs = tf_outputs.numpy()
+                pt_outputs = pt_outputs.detach().to("cpu").numpy()
+
+                tf_nans = np.isnan(tf_outputs)
+                pt_nans = np.isnan(pt_outputs)
+
+                pt_outputs[tf_nans] = 0
+                tf_outputs[tf_nans] = 0
+                pt_outputs[pt_nans] = 0
+                tf_outputs[pt_nans] = 0
+
+                max_diff = np.amax(np.abs(tf_outputs - pt_outputs))
+                self.assertLessEqual(max_diff, 1e-5)
+            else:
+                raise ValueError(
+                    f"`tf_outputs` should be a `tuple` or an instance of `tf.Tensor`. Got {type(tf_outputs)} instead."
+                )
+
+        def check_pt_tf_models(tf_model, pt_model, pt_inputs_dict, pt_inputs_dict_maybe_with_labels):
+
+            # send pytorch model to the correct device
+            pt_model.to(torch_device)
+
+            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
+            pt_model.eval()
+
+            tf_inputs_dict = prepare_tf_inputs_from_pt_inputs(pt_inputs_dict)
+            tf_inputs_dict_maybe_with_labels = prepare_tf_inputs_from_pt_inputs(pt_inputs_dict_maybe_with_labels)
+
+            # send pytorch inputs to the correct device
+            pt_inputs_dict = {
+                k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs_dict.items()
+            }
+            pt_inputs_dict_maybe_with_labels = {
+                k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v
+                for k, v in pt_inputs_dict_maybe_with_labels.items()
+            }
+
+            # Original test: check without `labels`
+            with torch.no_grad():
+                pt_outputs = pt_model(**pt_inputs_dict)
+            tf_outputs = tf_model(tf_inputs_dict)
+
+            tf_keys = tuple([k for k, v in tf_outputs.items() if v is not None])
+            pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])
+
+            self.assertEqual(tf_keys, pt_keys)
+            check_outputs(tf_outputs.to_tuple(), pt_outputs.to_tuple(), model_class, names=tf_keys)
+
+            # check the case where `labels` is passed
+            has_labels = any(
+                x in tf_inputs_dict_maybe_with_labels for x in ["labels", "next_sentence_label", "start_positions"]
+            )
+            if has_labels:
+
+                with torch.no_grad():
+                    pt_outputs = pt_model(**pt_inputs_dict_maybe_with_labels)
+                tf_outputs = tf_model(tf_inputs_dict_maybe_with_labels)
+
+                # Some models' output class don't have `loss` attribute despite `labels` is used.
+                # TODO: identify which models
+                tf_loss = getattr(tf_outputs, "loss", None)
+                pt_loss = getattr(pt_outputs, "loss", None)
+
+                # Some PT models return loss while the corresponding TF models don't (i.e. `None` for `loss`).
+                #   - FlaubertWithLMHeadModel
+                #   - FunnelForPreTraining
+                #   - ElectraForPreTraining
+                #   - XLMWithLMHeadModel
+                # TODO: Fix PT/TF diff -> remove this condition to fail the test if a diff occurs
+                if not ((tf_loss is None and pt_loss is None) or (tf_loss is not None and pt_loss is not None)):
+                    if model_class.__name__ not in [
+                        "FlaubertWithLMHeadModel",
+                        "FunnelForPreTraining",
+                        "ElectraForPreTraining",
+                        "XLMWithLMHeadModel",
+                        "TransfoXLLMHeadModel",
+                    ]:
+                        self.assertEqual(tf_loss is None, pt_loss is None)
+
+                tf_keys = tuple([k for k, v in tf_outputs.items() if v is not None])
+                pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None])
+
+                # TODO: remove these 2 conditions once the above TODOs (above loss) are implemented
+                # (Also, `TFTransfoXLLMHeadModel` has no `loss` while `TransfoXLLMHeadModel` return `losses`)
+                if tf_keys != pt_keys:
+                    if model_class.__name__ not in [
+                        "FlaubertWithLMHeadModel",
+                        "FunnelForPreTraining",
+                        "ElectraForPreTraining",
+                        "XLMWithLMHeadModel",
+                        "TransfoXLLMHeadModel",
+                    ]:
+                        self.assertEqual(tf_keys, pt_keys)
+
+                # Since we deliberately make some tests pass above (regarding the `loss`), let's still try to test
+                # some remaining attributes in the outputs.
+                # TODO: remove this block of `index` computing once the above TODOs (above loss) are implemented
+                # compute the 1st `index` where `tf_keys` and `pt_keys` is different
+                index = 0
+                for _ in range(min(len(tf_keys), len(pt_keys))):
+                    if tf_keys[index] == pt_keys[index]:
+                        index += 1
+                    else:
+                        break
+                if tf_keys[:index] != pt_keys[:index]:
+                    self.assertEqual(tf_keys, pt_keys)
+
+                # Some models require extra condition to return loss. For example, `(TF)BertForPreTraining` requires
+                # both`labels` and `next_sentence_label`.
+                if tf_loss is not None and pt_loss is not None:
+
+                    # check anything else than `loss`
+                    keys = tuple([k for k in tf_keys])
+                    check_outputs(tf_outputs[1:index], pt_outputs[1:index], model_class, names=keys[1:index])
+
+                    # check `loss`
+
+                    # tf models returned loss is usually a tensor rather than a scalar.
+                    # (see `hf_compute_loss`: it uses `tf.keras.losses.Reduction.NONE`)
+                    # Change it here to a scalar to match PyTorch models' loss
+                    tf_loss = tf.math.reduce_mean(tf_loss).numpy()
+                    pt_loss = pt_loss.detach().to("cpu").numpy()
+
+                    tf_nans = np.isnan(tf_loss)
+                    pt_nans = np.isnan(pt_loss)
+                    # the 2 losses need to be both nan or both not nan
+                    self.assertEqual(tf_nans, pt_nans)
+
+                    if not tf_nans:
+                        max_diff = np.amax(np.abs(tf_loss - pt_loss))
+                        self.assertLessEqual(max_diff, 1e-5)
+
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
@@ -1472,9 +1659,30 @@ class ModelTesterMixin:
                 # transformers does not have TF version yet
                 return
 
-            tf_model_class = getattr(transformers, tf_model_class_name)
+            if self.has_attentions:
+                config.output_attentions = True
 
-            config.output_hidden_states = True
+            for k in ["attention_mask", "encoder_attention_mask", "decoder_attention_mask"]:
+                if k in inputs_dict:
+                    attention_mask = inputs_dict[k]
+                    # make sure no all 0s attention masks - to avoid failure at this moment.
+                    # TODO: remove this line once the TODO below is implemented.
+                    attention_mask = torch.ones_like(attention_mask, dtype=torch.int32)
+                    # Here we make the first sequence with all 0s as attention mask.
+                    # Currently, this will fail for `TFWav2Vec2Model`. This is caused by the different large negative
+                    # values, like `1e-4`, `1e-9`, `1e-30` and `-inf` for attention mask across models/frameworks.
+                    # TODO: enable this block once the large negative values thing is cleaned up.
+                    # (see https://github.com/huggingface/transformers/issues/14859)
+                    # attention_mask = torch.cat(
+                    #     [
+                    #         torch.zeros_like(attention_mask[:1], dtype=torch.int32),
+                    #         attention_mask[1:].type(dtype=torch.int32)
+                    #     ],
+                    #     dim=0
+                    # )
+                    inputs_dict[k] = attention_mask
+
+            tf_model_class = getattr(transformers, tf_model_class_name)
 
             tf_model = tf_model_class(config)
             pt_model = model_class(config)
@@ -1487,49 +1695,20 @@ class ModelTesterMixin:
             tf_input_keys.discard("cross_attn_head_mask")
             tf_input_keys.discard("decoder_head_mask")
 
-            pt_inputs = self._prepare_for_class(inputs_dict, model_class)
-            pt_inputs = {k: v for k, v in pt_inputs.items() if k in tf_input_keys}
+            pt_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+            pt_inputs_dict_maybe_with_labels = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
 
-            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-            pt_model.eval()
-            tf_inputs_dict = {}
-            for key, tensor in pt_inputs.items():
-                # skip key that does not exist in tf
-                if type(tensor) == bool:
-                    tf_inputs_dict[key] = tensor
-                elif key == "input_values":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                elif key == "pixel_values":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                elif key == "input_features":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                else:
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32)
+            pt_inputs_dict = {k: v for k, v in pt_inputs_dict.items() if k in tf_input_keys}
+            pt_inputs_dict_maybe_with_labels = {
+                k: v for k, v in pt_inputs_dict_maybe_with_labels.items() if k in tf_input_keys
+            }
 
             # Check we can load pt model in tf and vice-versa with model => model functions
+            tf_inputs_dict = prepare_tf_inputs_from_pt_inputs(pt_inputs_dict)
             tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict)
-            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model).to(torch_device)
+            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
 
-            # Make sure PyTorch tensors are on same device as model
-            pt_inputs = {k: v.to(torch_device) if torch.is_tensor(v) else v for k, v in pt_inputs.items()}
-
-            with torch.no_grad():
-                pto = pt_model(**pt_inputs)
-            tfo = tf_model(tf_inputs_dict, training=False)
-
-            tf_hidden_states = tfo[0].numpy()
-            pt_hidden_states = pto[0].cpu().numpy()
-
-            tf_nans = np.copy(np.isnan(tf_hidden_states))
-            pt_nans = np.copy(np.isnan(pt_hidden_states))
-
-            pt_hidden_states[tf_nans] = 0
-            tf_hidden_states[tf_nans] = 0
-            pt_hidden_states[pt_nans] = 0
-            tf_hidden_states[pt_nans] = 0
-
-            max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
-            self.assertLessEqual(max_diff, 4e-2)
+            check_pt_tf_models(tf_model, pt_model, pt_inputs_dict, pt_inputs_dict_maybe_with_labels)
 
             # Check we can load pt model in tf and vice-versa with checkpoint => model functions
             with tempfile.TemporaryDirectory() as tmpdirname:
@@ -1542,43 +1721,7 @@ class ModelTesterMixin:
                 pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
                 pt_model = pt_model.to(torch_device)
 
-            # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences
-            pt_model.eval()
-            tf_inputs_dict = {}
-            for key, tensor in pt_inputs.items():
-                # skip key that does not exist in tf
-                if type(tensor) == bool:
-                    tensor = np.array(tensor, dtype=bool)
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32)
-                elif key == "input_values":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                elif key == "pixel_values":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                elif key == "input_features":
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32)
-                else:
-                    tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32)
-
-            # need to rename encoder-decoder "inputs" for PyTorch
-            #            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
-            #                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")
-
-            with torch.no_grad():
-                pto = pt_model(**pt_inputs)
-
-            tfo = tf_model(tf_inputs_dict)
-            tfo = tfo[0].numpy()
-            pto = pto[0].cpu().numpy()
-            tf_nans = np.copy(np.isnan(tfo))
-            pt_nans = np.copy(np.isnan(pto))
-
-            pto[tf_nans] = 0
-            tfo[tf_nans] = 0
-            pto[pt_nans] = 0
-            tfo[pt_nans] = 0
-
-            max_diff = np.amax(np.abs(tfo - pto))
-            self.assertLessEqual(max_diff, 4e-2)
+            check_pt_tf_models(tf_model, pt_model, pt_inputs_dict, pt_inputs_dict_maybe_with_labels)
 
     def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float):
         diff = np.abs((a - b)).max()