diff --git a/README.md b/README.md
index e866506c6d..8265c5d246 100644
--- a/README.md
+++ b/README.md
@@ -603,25 +603,25 @@ Transformer XL use a relative positioning with sinusiodal patterns and adaptive
 
 This model takes as *inputs*:
 [`modeling_transfo_xl.py`](./pytorch_pretrained_bert/modeling_transfo_xl.py)
-- `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size] with the token indices selected in the range [0, self.config.n_token[
-- `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
+- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the token indices selected in the range [0, self.config.n_token[
+- `mems`: an optional memory of hidden states from previous forward passes as a list (num layers) of hidden states at the entry of each layer. Each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
 This model *outputs* a tuple of (last_hidden_state, new_mems)
-- `last_hidden_state`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [sequence_length, batch_size, self.config.d_model]
-- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+- `last_hidden_state`: the encoded-hidden-states at the top of the model as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]
+- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
 #### 13. `TransfoXLLMHeadModel`
 
 `TransfoXLLMHeadModel` includes the `TransfoXLModel` Transformer followed by an (adaptive) softmax head with weights tied to the input embeddings.
 
 *Inputs* are the same as the inputs of the [`TransfoXLModel`](#-12.-`TransfoXLModel`) class plus optional labels:
-- `target`: an optional torch.LongTensor of shape [sequence_length, batch_size] with the target token indices selected in the range [0, self.config.n_token[
+- `target`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the target token indices selected in the range [0, self.config.n_token[
 
 *Outputs* a tuple of (last_hidden_state, new_mems)
 - `softmax_output`: output of the (adaptive) softmax:
-  - if target is None: Negative log likelihood of shape :: [len, bsz]
-  - else: log probabilities of tokens, shape :: [len, bsz, n_tokens]
-- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+  - if target is None: Negative log likelihood of shape [batch_size, sequence_length]
+  - else: log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]
+- `new_mems`: list (num layers) of updated mem states at the entry of each layer each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]. Note that the first two dimensions are transposed in `mems` with regards to `input_ids`.
 
 
 ### Tokenizers:
diff --git a/pytorch_pretrained_bert/modeling_transfo_xl.py b/pytorch_pretrained_bert/modeling_transfo_xl.py
index f3498944f5..714a9d9846 100644
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -986,17 +986,19 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         config: a TransfoXLConfig class instance with the configuration to build a new model
 
     Inputs:
-        `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size]
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the token indices selected in the range [0, self.config.n_token[
         `mems`: optional memomry of hidden states from previous forward passes
             as a list (num layers) of hidden states at the entry of each layer
             each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
     Outputs:
         A tuple of (last_hidden_state, new_mems)
         `last_hidden_state`: the encoded-hidden-states at the top of the model
-            as a torch.FloatTensor of size [sequence_length, batch_size, self.config.d_model]
+            as a torch.FloatTensor of size [batch_size, sequence_length, self.config.d_model]
         `new_mems`: list (num layers) of updated mem states at the entry of each layer
             each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
 
     Example usage:
     ```python
@@ -1225,20 +1227,28 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
     def forward(self, input_ids, mems=None):
         """ Params:
-                input_ids :: [len, bsz]
+                input_ids :: [bsz, len]
                 mems :: optional mems from previous forwar passes (or init_mems)
                     list (num layers) of mem states at the entry of each layer
                         shape :: [self.config.mem_len, bsz, self.config.d_model]
+                    Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
             Returns:
                 tuple (last_hidden, new_mems) where:
                     new_mems: list (num layers) of mem states at the entry of each layer
                         shape :: [self.config.mem_len, bsz, self.config.d_model]
                     last_hidden: output of the last layer:
-                        shape :: [len, bsz, self.config.d_model]
+                        shape :: [bsz, len, self.config.d_model]
         """
+        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
+        # so we transpose here from shape [bsz, len] to shape [len, bsz]
+        input_ids = input_ids.transpose(0, 1).contiguous()
+
         if mems is None:
             mems = self.init_mems(input_ids)
         last_hidden, new_mems = self._forward(input_ids, mems=mems)
+
+        # We transpose back here to shape [bsz, len, hidden_dim]
+        last_hidden = last_hidden.transpose(0, 1).contiguous()
         return (last_hidden, new_mems)
 
 
@@ -1257,23 +1267,25 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         config: a TransfoXLConfig class instance with the configuration to build a new model
 
     Inputs:
-        `input_ids`: a torch.LongTensor of shape [sequence_length, batch_size]
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
             with the token indices selected in the range [0, self.config.n_token[
-        `target`: an optional torch.LongTensor of shape [sequence_length, batch_size]
+        `target`: an optional torch.LongTensor of shape [batch_size, sequence_length]
             with the target token indices selected in the range [0, self.config.n_token[
         `mems`: an optional memory of hidden states from previous forward passes
             as a list (num layers) of hidden states at the entry of each layer
             each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
 
     Outputs:
         A tuple of (last_hidden_state, new_mems)
         `softmax_output`: output of the (adaptive) softmax:
             if target is None:
-                Negative log likelihood of shape :: [len, bsz] 
+                Negative log likelihood of shape [batch_size, sequence_length] 
             else:
-                log probabilities of tokens, shape :: [len, bsz, n_tokens]
+                log probabilities of tokens, shape [batch_size, sequence_length, n_tokens]
         `new_mems`: list (num layers) of updated mem states at the entry of each layer
             each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
 
     Example usage:
     ```python
@@ -1287,7 +1299,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
     last_hidden_state, new_mems = model(input_ids)
 
     # Another time on input_ids_next using the memory:
-    last_hidden_state, new_mems = model(input_ids_next, new_mems)
+    last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
     ```
     """
     def __init__(self, config):
@@ -1331,33 +1343,34 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
 
     def forward(self, input_ids, target=None, mems=None):
         """ Params:
-                input_ids :: [len, bsz]
-                target :: [len, bsz]
+                input_ids :: [bsz, len]
+                target :: [bsz, len]
             Returns:
                 tuple(softmax_output, new_mems) where:
                     new_mems: list (num layers) of hidden states at the entry of each layer
-                        shape :: [mem_len, bsz, self.config.d_model]
+                        shape :: [mem_len, bsz, self.config.d_model] :: Warning: shapes are transposed here w. regards to input_ids
                     softmax_output: output of the (adaptive) softmax:
                         if target is None:
-                            Negative log likelihood of shape :: [len, bsz] 
+                            Negative log likelihood of shape :: [bsz, len] 
                         else:
-                            log probabilities of tokens, shape :: [len, bsz, n_tokens]
+                            log probabilities of tokens, shape :: [bsz, len, n_tokens]
         """
-        bsz = input_ids.size(1)
-        tgt_len = input_ids.size(0)
+        bsz = input_ids.size(0)
+        tgt_len = input_ids.size(1)
 
         last_hidden, new_mems = self.transformer(input_ids, mems)
 
-        pred_hid = last_hidden[-tgt_len:]
+        pred_hid = last_hidden[:, -tgt_len:]
         if self.sample_softmax > 0 and self.training:
             assert self.config.tie_weight
             logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, target, pred_hid, self.sampler)
-            loss = -F.log_softmax(logit, -1)[:, :, 0]
+            softmax_output = -F.log_softmax(logit, -1)[:, :, 0]
         else:
             softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target)
             if target is None:
-                softmax_output = softmax_output.view(tgt_len, bsz, -1)
+                softmax_output = softmax_output.view(bsz, tgt_len, -1)
             else:
-                softmax_output = softmax_output.view(tgt_len, bsz)
+                softmax_output = softmax_output.view(bsz, tgt_len)
 
+        # We transpose back
         return (softmax_output, new_mems)
diff --git a/pytorch_pretrained_bert/tokenization_transfo_xl.py b/pytorch_pretrained_bert/tokenization_transfo_xl.py
index 698deae21c..585a815923 100644
--- a/pytorch_pretrained_bert/tokenization_transfo_xl.py
+++ b/pytorch_pretrained_bert/tokenization_transfo_xl.py
@@ -507,7 +507,7 @@ class TransfoXLCorpus(object):
             resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir)
         except EnvironmentError:
             logger.error(
-                "Model name '{}' was not found in model name list ({}). "
+                "Corpus '{}' was not found in corpus list ({}). "
                 "We assumed '{}' was a path or url but couldn't find files {} "
                 "at this path or url.".format(
                     pretrained_model_name_or_path,
diff --git a/tests/modeling_transfo_xl_test.py b/tests/modeling_transfo_xl_test.py
index 0bc16daf4c..291d5d9d2a 100644
--- a/tests/modeling_transfo_xl_test.py
+++ b/tests/modeling_transfo_xl_test.py
@@ -67,12 +67,12 @@ class TransfoXLModelTest(unittest.TestCase):
             self.seed = seed
 
         def prepare_config_and_inputs(self):
-            input_ids_1 = TransfoXLModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
-            input_ids_2 = TransfoXLModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+            input_ids_1 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_ids_2 = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             lm_labels = None
             if self.use_labels:
-                lm_labels = TransfoXLModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
+                lm_labels = TransfoXLModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = TransfoXLConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -110,13 +110,13 @@ class TransfoXLModelTest(unittest.TestCase):
         def check_transfo_xl_model_output(self, result):
             self.parent.assertListEqual(
                 list(result["hidden_states_1"].size()),
-                [self.seq_length, self.batch_size, self.d_model])
+                [self.batch_size, self.seq_length, self.d_model])
+            self.parent.assertListEqual(
+                list(result["hidden_states_2"].size()),
+                [self.batch_size, self.seq_length, self.d_model])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(result["hidden_states_2"].size()),
-                [self.seq_length, self.batch_size, self.d_model])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
@@ -147,13 +147,13 @@ class TransfoXLModelTest(unittest.TestCase):
         def check_transfo_xl_lm_head_output(self, result):
             self.parent.assertListEqual(
                 list(result["loss_1"].size()),
-                [self.seq_length, self.batch_size])
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["lm_logits_1"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1a"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(result["lm_logits_1"].size()),
-                [self.seq_length, self.batch_size, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_1b"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
@@ -163,13 +163,13 @@ class TransfoXLModelTest(unittest.TestCase):
 
             self.parent.assertListEqual(
                 list(result["loss_2"].size()),
-                [self.seq_length, self.batch_size])
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["lm_logits_2"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2a"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
-            self.parent.assertListEqual(
-                list(result["lm_logits_2"].size()),
-                [self.seq_length, self.batch_size, self.vocab_size])
             self.parent.assertListEqual(
                 list(list(mem.size()) for mem in result["mems_2b"]),
                 [[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)