[XLNet] Fix mems behavior (#8567)

* fix mems in xlnet
* fix use_mems
* fix use_mem_len
* fix use mems
* clean docs
* fix tf typo
* make xlnet tf for generation work
* fix tf test
* refactor use cache
* add use cache for missing models
* correct use_cache in generate
* correct use cache in tf generate
* fix tf
* correct getattr typo
* make sylvain happy
* change in docs as well
* do not apply to cookie cutter statements
* fix tf test
* make pytorch model fully backward compatible
2020-11-25 22:54:59 +01:00
parent 369f1d77b4
commit 2a6fbe6a40
47 changed files with 259 additions and 134 deletions
--- a/src/transformers/models/bert/modeling_bert.py
+++ b/src/transformers/models/bert/modeling_bert.py
@@ -888,7 +888,7 @@ class BertModel(BertPreTrainedModel):

@add_start_docstrings(
    """
-    Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and a `next
+    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    """,
    BERT_START_DOCSTRING,
--- a/src/transformers/models/bert/modeling_tf_bert.py
+++ b/src/transformers/models/bert/modeling_tf_bert.py
@@ -90,7 +90,7 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [

 class TFBertPreTrainingLoss:
    """
-    Loss function suitable for BERT-like pre-training, that is, the task of pretraining a language model by combining
+    Loss function suitable for BERT-like pretraining, that is, the task of pretraining a language model by combining
    NSP + MLM. .. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss
    computation.
    """
@@ -878,7 +878,7 @@ class TFBertModel(TFBertPreTrainedModel):

@add_start_docstrings(
    """
-Bert Model with two heads on top as done during the pre-training:
+Bert Model with two heads on top as done during the pretraining:
    a `masked language modeling` head and a `next sentence prediction (classification)` head.
    """,
    BERT_START_DOCSTRING,