Release: v4.4.2

Fix distributed evaluation (#10795)
* Fix distributed evaluation * Use logger
2021-03-18 15:09:04 -04:00 · 2021-03-18 15:07:21 -04:00 · 2021-03-18 15:05:26 -04:00 · 2021-03-18 15:05:17 -04:00
9 changed files with 70 additions and 11 deletions
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,8 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'4.4.1'
+release = u'4.4.2'
+


 # Prefix link to point to master, comment this during version release and uncomment below line
--- a/setup.py
+++ b/setup.py
@@ -278,7 +278,7 @@ install_requires = [

 setup(
    name="transformers",
-    version="4.4.1",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="4.4.2",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
    author_email="thomas@huggingface.co",
    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -22,7 +22,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).

-__version__ = "4.4.1"
+__version__ = "4.4.2"

 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -102,8 +102,12 @@ if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VA
                            try:
                                _tf_version = importlib_metadata.version("tf-nightly-gpu")
                            except importlib_metadata.PackageNotFoundError:
-                                _tf_version = None
-                                _tf_available = False
+                                # Support for intel-tensorflow version
+                                try:
+                                    _tf_version = importlib_metadata.version("intel-tensorflow")
+                                except importlib_metadata.PackageNotFoundError:
+                                    _tf_version = None
+                                    _tf_available = False
    if _tf_available:
        if version.parse(_tf_version) < version.parse("2"):
            logger.info(f"TensorFlow found but with version {_tf_version}. Transformers requires version 2 minimum.")
--- a/src/transformers/sagemaker/trainer_sm.py
+++ b/src/transformers/sagemaker/trainer_sm.py
@@ -112,7 +112,12 @@ class SageMakerTrainer(Trainer):

    def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.sampler.Sampler]:
        if self.is_model_parallel_enabled:
-            return SequentialDistributedSampler(eval_dataset, num_replicas=smp.dp_size(), rank=smp.dp_rank())
+            return SequentialDistributedSampler(
+                eval_dataset,
+                num_replicas=smp.dp_size(),
+                rank=smp.dp_rank(),
+                batch_size=self.args.per_device_eval_batch_size,
+            )
        else:
            return super()._get_eval_sampler(eval_dataset)

--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -670,7 +670,7 @@ class Trainer:
        """
        Helper to get number of samples in a :class:`~torch.utils.data.DataLoader` by accessing its dataset.

-        Will raise an exception if the underlying dataset dese not implement method :obj:`__len__`
+        Will raise an exception if the underlying dataset does not implement method :obj:`__len__`
        """
        return len(dataloader.dataset)

@@ -1783,8 +1783,13 @@ class Trainer:

        eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size)
        if not prediction_loss_only:
-            preds_gatherer = DistributedTensorGatherer(world_size, num_examples)
-            labels_gatherer = DistributedTensorGatherer(world_size, num_examples)
+            # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass
+            # a batch size to the sampler)
+            make_multiple_of = None
+            if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler):
+                make_multiple_of = dataloader.sampler.batch_size
+            preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)
+            labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of)

        model.eval()

--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -220,7 +220,7 @@ class SequentialDistributedSampler(Sampler):
    or `reduce` resulting tensors at the end of the loop.
    """

-    def __init__(self, dataset, num_replicas=None, rank=None):
+    def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None):
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
@@ -232,8 +232,14 @@ class SequentialDistributedSampler(Sampler):
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
-        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
+        num_samples = len(self.dataset)
+        # Add extra samples to make num_samples a multiple of batch_size if passed
+        if batch_size is not None:
+            self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size
+        else:
+            self.num_samples = int(math.ceil(num_samples / num_replicas))
        self.total_size = self.num_samples * self.num_replicas
+        self.batch_size = batch_size

    def __iter__(self):
        indices = list(range(len(self.dataset)))
--- a/tests/test_trainer_distributed.py
+++ b/tests/test_trainer_distributed.py
@@ -97,6 +97,11 @@ if __name__ == "__main__":
        def compute_metrics(p: EvalPrediction) -> Dict:
            sequential = list(range(len(dataset)))
            success = p.predictions.tolist() == sequential and p.label_ids.tolist() == sequential
+            if not success and training_args.local_rank == 0:
+                logger.warning(
+                    "Predictions and/or labels do not match expected results:\n  - predictions: "
+                    f"{p.predictions.tolist()}\n  - labels: {p.label_ids.tolist()}\n  - expected: {sequential}"
+                )
            return {"success": success}

        trainer = Trainer(
--- a/tests/test_trainer_utils.py
+++ b/tests/test_trainer_utils.py
@@ -31,6 +31,7 @@ if is_torch_available():
        DistributedTensorGatherer,
        LabelSmoother,
        LengthGroupedSampler,
+        SequentialDistributedSampler,
        get_parameter_names,
    )

@@ -167,3 +168,35 @@ class TrainerUtilsTest(unittest.TestCase):

            self.assertEqual(set(total[:length]), set(dataset))
            self.assertEqual(set(total[length:]), set(total[: (len(total) - length)]))
+
+    def test_sequential_distributed_sampler(self):
+        batch_size = 16
+        for length in [23, 64, 123]:
+            dataset = list(range(length))
+            shard1 = SequentialDistributedSampler(dataset, num_replicas=2, rank=0)
+            shard2 = SequentialDistributedSampler(dataset, num_replicas=2, rank=1)
+
+            # Sample
+            samples1 = list(shard1)
+            samples2 = list(shard2)
+
+            total = samples1 + samples2
+
+            self.assertListEqual(total[:length], dataset)
+            self.assertListEqual(total[length:], dataset[: (len(total) - length)])
+
+            # With a batch_size passed
+            shard1 = SequentialDistributedSampler(dataset, num_replicas=2, rank=0, batch_size=batch_size)
+            shard2 = SequentialDistributedSampler(dataset, num_replicas=2, rank=1, batch_size=batch_size)
+
+            # Sample
+            samples1 = list(shard1)
+            samples2 = list(shard2)
+
+            self.assertTrue(len(samples1) % batch_size == 0)
+            self.assertTrue(len(samples2) % batch_size == 0)
+
+            total = samples1 + samples2
+
+            self.assertListEqual(total[:length], dataset)
+            self.assertListEqual(total[length:], dataset[: (len(total) - length)])
Author	SHA1	Message	Date
Sylvain Gugger	9f43a425fe	Release: v4.4.2 Some checks failed Release - Conda / build_and_package (push) Has been cancelled Details	2021-03-18 15:09:04 -04:00
Sylvain Gugger	45dae78e61	Fix distributed evaluation (#10795) * Fix distributed evaluation * Use logger	2021-03-18 15:07:21 -04:00
Mansi Mane	12b04b5003	Smmp batch not divisible by microbatches fix (#10778) * Added debug prints * Added config * Added prints * Added prints * Added extra samples to SequentialDistributedSampler * Added extra samples to SequentialDistributedSampler Updated SequentialDistributedSampler call * Added deubg prints * Removed extra prints * Making predicitons and labels multiple of batchsize * updated number of microbatches * Removed extra prints * Made start_remainder similar to DistributedSamplerWithLoop * Minor spacing update * Added debug prints Added config Added prints Added prints * Added extra samples to SequentialDistributedSampler Updated SequentialDistributedSampler call Added extra samples to SequentialDistributedSampler Added deubg prints Removed extra prints Making predicitons and labels multiple of batchsize updated number of microbatches Removed extra prints Squashing redundant commits * Made start_remainder similar to DistributedSamplerWithLoop Minor spacing update Made start_remainder similar to DistributedSamplerWithLoop * Test and styling * Rename test Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>	2021-03-18 15:05:26 -04:00
Funtowicz Morgan	6460e9a0f3	Add support for detecting intel-tensorflow version (#10781) Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>	2021-03-18 15:05:17 -04:00