diff --git a/docs/source/model_doc/overview.rst b/docs/source/model_doc/overview.rst
index 8c77efd3f9..4cca4eb846 100644
--- a/docs/source/model_doc/overview.rst
+++ b/docs/source/model_doc/overview.rst
@@ -96,7 +96,7 @@ where
   ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
 
 * ``from_tf``\ : should we load the weights from a locally saved TensorFlow checkpoint
-* ``state_dict``\ : an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+* ``state_dict``\ : an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
 * ``*inputs``\ , `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
 
 ``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
index fb947ffb51..be5197135d 100644
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -40,7 +40,7 @@ where
 
 - `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
 - `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint
-- `state_dict`: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+- `state_dict`: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
 - `*inputs`, `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
 
 `Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository.
diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py
index 0ee0df6697..a0221ff9e1 100644
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
@@ -37,7 +37,7 @@ bert_docstring = """
                  checkpoint
         cache_dir: an optional path to a folder in which the pre-trained models
                    will be cached.
-        state_dict: an optional state dictionnary
+        state_dict: an optional state dictionary
                     (collections.OrderedDict object) to use instead of Google
                     pre-trained models
         *inputs, **kwargs: additional input for the specific Bert class
diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py
index 1683c881fa..c58c1fa708 100644
--- a/hubconfs/gpt_hubconf.py
+++ b/hubconfs/gpt_hubconf.py
@@ -40,7 +40,7 @@ gpt_docstring = """
 				. a series of NumPy files containing OpenAI TensorFlow trained weights
 		from_tf: should we load the weights from a locally saved TensorFlow checkpoint
 		cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-		state_dict: an optional state dictionnary (collections.OrderedDict object)
+		state_dict: an optional state dictionary (collections.OrderedDict object)
 		        	to use instead of pre-trained models
 		*inputs, **kwargs: additional input for the specific OpenAI-GPT class
 """
diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py
index d89db894ad..cfcc6aef5a 100644
--- a/hubconfs/transformer_xl_hubconf.py
+++ b/hubconfs/transformer_xl_hubconf.py
@@ -23,7 +23,7 @@ transformer_xl_docstring = """
                 . `model.chkpt` a TensorFlow checkpoint
         from_tf: should we load the weights from a locally saved TensorFlow checkpoint
         cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
+        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
         *inputs, **kwargs: additional input for the specific TransformerXL class
 """
 
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 66bfe99d85..4fabd49baf 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -358,7 +358,7 @@ class PreTrainedModel(nn.Module):
                 Dictionary of key, values to update the configuration object after loading.
                 Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
 
-               - If a configuration is provided with `config`, **kwargs will be directly passed
+               - If a configuration is provided with `config`, **kwargs will be directly passed
                  to the underlying model's __init__ method.
                - If a configuration is not provided, **kwargs will be first passed to the pretrained
                  model configuration class loading function (`PretrainedConfig.from_pretrained`).
@@ -367,7 +367,7 @@ class PreTrainedModel(nn.Module):
                  Remaining keys that do not correspond to any configuration attribute will
                  be passed to the underlying model's __init__ function.
 
-        Examples::
+        Examples::
 
             >>> model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
             >>> model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 2b3219c4cc..eaef2fed1e 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -37,7 +37,7 @@ class PreTrainedTokenizer(object):
             additional_special_tokens = []
 
         We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the
-            specific vocabulary augmentation methods of the various underlying dictionnary structures (BPE, sentencepiece...).
+            specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
     """
     vocab_files_names = {}
     pretrained_vocab_files_map = {}
@@ -324,7 +324,7 @@ class PreTrainedTokenizer(object):
 
 
     def add_special_tokens(self, special_tokens_dict):
-        """ Add a dictionnary of special tokens (eos, pad, cls...) to the encoder and link them
+        """ Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
             to class attributes. If the special tokens are not in the vocabulary, they are added
             to it and indexed starting from the last index of the current vocabulary.