[XLNet] Fix mems behavior (#8567)

* fix mems in xlnet * fix use_mems * fix use_mem_len * fix use mems * clean docs * fix tf typo * make xlnet tf for generation work * fix tf test * refactor use cache * add use cache for missing models * correct use_cache in generate * correct use cache in tf generate * fix tf * correct getattr typo * make sylvain happy * change in docs as well * do not apply to cookie cutter statements * fix tf test * make pytorch model fully backward compatible
2020-11-25 22:54:59 +01:00
parent 369f1d77b4
commit 2a6fbe6a40
47 changed files with 259 additions and 134 deletions
--- a/src/transformers/models/ctrl/configuration_ctrl.py
+++ b/src/transformers/models/ctrl/configuration_ctrl.py
@@ -61,6 +61,9 @@ class CTRLConfig(PretrainedConfig):
            The epsilon to use in the layer normalization layers
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+

    Examples::

@@ -98,6 +101,7 @@ class CTRLConfig(PretrainedConfig):
        summary_activation=None,
        summary_proj_to_labels=True,
        summary_first_dropout=0.1,
+        use_cache=True,
        **kwargs
    ):
        super().__init__(**kwargs)
@@ -119,6 +123,7 @@ class CTRLConfig(PretrainedConfig):
        self.summary_activation = summary_activation
        self.summary_first_dropout = summary_first_dropout
        self.summary_proj_to_labels = summary_proj_to_labels
+        self.use_cache = use_cache

    @property
    def max_position_embeddings(self):