Doc styling (#8067)

* Important files * Styling them all * Revert "Styling them all" This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e. * Styling them for realsies * Fix syntax error * Fix benchmark_utils * More fixes * Fix modeling auto and script * Remove new line * Fixes * More fixes * Fix more files * Style * Add FSMT * More fixes * More fixes * More fixes * More fixes * Fixes * More fixes * More fixes * Last fixes * Make sphinx happy
2020-10-26 18:26:02 -04:00
parent 04a17f8550
commit 08f534d2da
271 changed files with 9726 additions and 8991 deletions
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -135,14 +135,12 @@ def torch_distributed_zero_first(local_rank: int):

 class SequentialDistributedSampler(Sampler):
    """
-    Distributed Sampler that subsamples indicies sequentially,
-    making it easier to collate all results at the end.
+    Distributed Sampler that subsamples indices sequentially, making it easier to collate all results at the end.

-    Even though we only use this sampler for eval and predict (no training),
-    which means that the model params won't have to be synced (i.e. will not hang
-    for synchronization even if varied number of forward passes), we still add extra
-    samples to the sampler to make it evenly divisible (like in `DistributedSampler`)
-    to make it easy to `gather` or `reduce` resulting tensors at the end of the loop.
+    Even though we only use this sampler for eval and predict (no training), which means that the model params won't
+    have to be synced (i.e. will not hang for synchronization even if the number of forward passes varies), we still
+    add extra samples to the sampler to make it evenly divisible (like in `DistributedSampler`) so that it is easy to
+    `gather` or `reduce` the resulting tensors at the end of the loop.
    """

    def __init__(self, dataset, num_replicas=None, rank=None):
@@ -203,16 +201,15 @@ def nested_truncate(tensors, limit):

 class DistributedTensorGatherer:
    """
-    A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU
-    by chunks.
+    A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks.

-    If our dataset has 16 samples with a batch size of 2 on 3 processes and we gather then transfer on
-    CPU at every step, our sampler will generate the following indices:
+    If our dataset has 16 samples with a batch size of 2 on 3 processes and we gather then transfer on CPU at every
+    step, our sampler will generate the following indices:

        :obj:`[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1]`

-    to get something of size a multiple of 3 (so that each process gets the same dataset length). Then
-    process 0, 1 and 2 will be responsible of making predictions for the following samples:
+    to get something of size a multiple of 3 (so that each process gets the same dataset length). Then process 0, 1 and
+    2 will be responsible for making predictions for the following samples:

        - P0: :obj:`[0, 1, 2, 3, 4, 5]`
        - P1: :obj:`[6, 7, 8, 9, 10, 11]`
@@ -224,13 +221,13 @@ class DistributedTensorGatherer:
        - P1: :obj:`[6, 7]`
        - P2: :obj:`[12, 13]`

-    So if we gather at the end of the first batch, we will get a tensor (nested list/tuple of tensor)
-    corresponding to the following indices:
+    So if we gather at the end of the first batch, we will get a tensor (nested list/tuple of tensors) corresponding to
+    the following indices:

        :obj:`[0, 1, 6, 7, 12, 13]`

-    If we directly concatenate our results without taking any precautions, the user will then get
-    the predictions for the indices in this order at the end of the prediction loop:
+    If we directly concatenate our results without taking any precautions, the user will then get the predictions for
+    the indices in this order at the end of the prediction loop:

        :obj:`[0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1]`