From 35401fe50fa3e460b2a4422630b017f106c79e03 Mon Sep 17 00:00:00 2001
From: Aymeric Augustin <aymeric.augustin@fractalideas.com>
Date: Fri, 6 Dec 2019 19:57:38 +0100
Subject: [PATCH] Remove dependency on pytest for running tests (#2055)

* Switch to plain unittest for skipping slow tests.

Add a RUN_SLOW environment variable; set it to "yes" to run them.

* Switch to plain unittest for PyTorch dependency.

* Switch to plain unittest for TensorFlow dependency.

* Avoid leaking open files in the test suite.

This prevents spurious warnings when running tests.

* Fix unicode warning on Python 2 when running tests.

The warning was:

    UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal

* Support running PyTorch tests on a GPU.

Reverts 27e015bd.

* Tests no longer require pytest.

* Make tests pass on CUDA.
---
 README.md                                     | 11 +++-
 docs/source/installation.md                   | 11 +++-
 setup.py                                      |  1 -
 .../tests/modeling_tf_xxx_test.py             |  7 +-
 .../tests/modeling_xxx_test.py                | 12 ++--
 transformers/modeling_openai.py               |  6 +-
 transformers/tests/conftest.py                | 31 ---------
 transformers/tests/modeling_albert_test.py    | 11 ++--
 transformers/tests/modeling_auto_test.py      | 14 ++--
 transformers/tests/modeling_bert_test.py      | 38 +++++------
 transformers/tests/modeling_common_test.py    | 43 ++++++++++---
 transformers/tests/modeling_ctrl_test.py      |  9 +--
 .../tests/modeling_distilbert_test.py         | 12 ++--
 .../tests/modeling_encoder_decoder_test.py    |  7 +-
 transformers/tests/modeling_gpt2_test.py      | 10 +--
 transformers/tests/modeling_openai_test.py    | 10 +--
 transformers/tests/modeling_roberta_test.py   | 22 ++++---
 transformers/tests/modeling_tf_albert_test.py |  7 +-
 transformers/tests/modeling_tf_auto_test.py   | 14 ++--
 transformers/tests/modeling_tf_bert_test.py   |  7 +-
 transformers/tests/modeling_tf_common_test.py |  8 +--
 transformers/tests/modeling_tf_ctrl_test.py   |  7 +-
 .../tests/modeling_tf_distilbert_test.py      |  7 +-
 transformers/tests/modeling_tf_gpt2_test.py   |  7 +-
 .../tests/modeling_tf_openai_gpt_test.py      |  7 +-
 .../tests/modeling_tf_roberta_test.py         | 19 +++---
 .../tests/modeling_tf_transfo_xl_test.py      |  7 +-
 transformers/tests/modeling_tf_xlm_test.py    |  7 +-
 transformers/tests/modeling_tf_xlnet_test.py  | 10 +--
 .../tests/modeling_transfo_xl_test.py         | 10 +--
 transformers/tests/modeling_xlm_test.py       | 12 ++--
 transformers/tests/modeling_xlnet_test.py     | 21 ++++--
 transformers/tests/optimization_test.py       |  6 +-
 transformers/tests/tokenization_auto_test.py  |  5 +-
 transformers/tests/tokenization_bert_test.py  |  4 +-
 .../tests/tokenization_distilbert_test.py     |  4 +-
 .../tests/tokenization_roberta_test.py        |  4 +-
 .../tests/tokenization_tests_commons.py       |  6 +-
 .../tests/tokenization_transfo_xl_test.py     |  6 +-
 transformers/tests/tokenization_utils_test.py |  6 +-
 transformers/tests/tokenization_xlm_test.py   |  4 +-
 transformers/tests/tokenization_xlnet_test.py |  4 +-
 transformers/tests/utils.py                   | 64 +++++++++++++++++++
 transformers/tokenization_albert.py           |  8 +--
 transformers/tokenization_ctrl.py             |  6 +-
 transformers/tokenization_gpt2.py             | 12 ++--
 transformers/tokenization_openai.py           |  6 +-
 transformers/tokenization_utils.py            | 13 ++--
 transformers/tokenization_xlm.py              |  8 ++-
 transformers/tokenization_xlnet.py            |  4 +-
 50 files changed, 344 insertions(+), 231 deletions(-)
 delete mode 100644 transformers/tests/conftest.py
 create mode 100644 transformers/tests/utils.py

diff --git a/README.md b/README.md
index ddeabe08d6..64ec631651 100644
--- a/README.md
+++ b/README.md
@@ -101,17 +101,26 @@ pip install [--editable] .
 
 A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
 
-These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+These tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
 
 Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
 
 You can run the tests from the root of the cloned repository with the commands:
 
+```bash
+python -m unittest discover -s transformers/tests -p "*test.py" -t .
+python -m unittest discover -s examples -p "*test.py" -t examples
+```
+
+or
+
 ```bash
 python -m pytest -sv ./transformers/tests/
 python -m pytest -sv ./examples/
 ```
 
+By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
+
 ### Do you want to run a Transformer model on a mobile device?
 
 You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 11beb1ab3a..6263f7604d 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -24,15 +24,24 @@ pip install [--editable] .
 
 An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
 
-Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+Tests can be run using `unittest` or `pytest` (install pytest if needed with `pip install pytest`).
 
 Run all the tests from the root of the cloned repository with the commands:
 
+```bash
+python -m unittest discover -s transformers/tests -p "*test.py" -t .
+python -m unittest discover -s examples -p "*test.py" -t examples
+```
+
+or
+
 ``` bash
 python -m pytest -sv ./transformers/tests/
 python -m pytest -sv ./examples/
 ```
 
+By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `yes` to run them.
+
 ## OpenAI GPT original tokenization workflow
 
 If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
diff --git a/setup.py b/setup.py
index 25f503f8d0..c4af32df83 100644
--- a/setup.py
+++ b/setup.py
@@ -72,7 +72,6 @@ setup(
         'transformers-cli'
     ],
     # python_requires='>=3.5.0',
-    tests_require=['pytest'],
     classifiers=[
           'Intended Audience :: Science/Research',
           'License :: OSI Approved :: Apache Software License',
diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
index 90837ca1ea..d7e576bf8b 100644
--- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import XxxConfig, is_tf_available
 
@@ -33,10 +33,9 @@ if is_tf_available():
                                                TFXxxForTokenClassification,
                                                TFXxxForQuestionAnswering,
                                                TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
@@ -244,7 +243,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in ['xxx-base-uncased']:
diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py
index 8c0cc3cf32..bfc70921cd 100644
--- a/templates/adding_a_new_model/tests/modeling_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py
@@ -18,12 +18,12 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
@@ -31,10 +31,9 @@ if is_torch_available():
                                         XxxForQuestionAnswering, XxxForSequenceClassification,
                                         XxxForTokenClassification, XxxForMultipleChoice)
     from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class XxxModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
@@ -131,6 +130,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -148,6 +148,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -162,6 +163,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = XxxForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
@@ -182,6 +184,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = XxxForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
@@ -197,6 +200,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = XxxForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
@@ -243,7 +247,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py
index e88f55c3ea..4fe7ffee8b 100644
--- a/transformers/modeling_openai.py
+++ b/transformers/modeling_openai.py
@@ -50,8 +50,10 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
 
     logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
 
-    names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
-    shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
+    with open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8') as names_handle:
+        names = json.load(names_handle)
+    with open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8') as shapes_handle:
+        shapes = json.load(shapes_handle)
     offsets = np.cumsum([np.prod(shape) for shape in shapes])
     init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
     init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
diff --git a/transformers/tests/conftest.py b/transformers/tests/conftest.py
deleted file mode 100644
index f809234cd5..0000000000
--- a/transformers/tests/conftest.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# content of conftest.py
-
-import pytest
-
-
-def pytest_addoption(parser):
-    parser.addoption(
-        "--runslow", action="store_true", default=False, help="run slow tests"
-    )
-    parser.addoption(
-        "--use_cuda", action="store_true", default=False, help="run tests on gpu"
-    )
-
-
-def pytest_configure(config):
-    config.addinivalue_line("markers", "slow: mark test as slow to run")
-
-
-def pytest_collection_modifyitems(config, items):
-    if config.getoption("--runslow"):
-        # --runslow given in cli: do not skip slow tests
-        return
-    skip_slow = pytest.mark.skip(reason="need --runslow option to run")
-    for item in items:
-        if "slow" in item.keywords:
-            item.add_marker(skip_slow)
-
-@pytest.fixture
-def use_cuda(request):
-    """ Run test on gpu """
-    return request.config.getoption("--use_cuda")
diff --git a/transformers/tests/modeling_albert_test.py b/transformers/tests/modeling_albert_test.py
index 976feff9db..a14d66ae8f 100644
--- a/transformers/tests/modeling_albert_test.py
+++ b/transformers/tests/modeling_albert_test.py
@@ -18,22 +18,21 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
                               AlbertForSequenceClassification, AlbertForQuestionAnswering,
                               )
     from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class AlbertModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
@@ -133,6 +132,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = AlbertModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -150,6 +150,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = AlbertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -163,6 +164,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = AlbertForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
@@ -183,6 +185,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = AlbertForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
@@ -225,7 +228,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py
index 6d2c7ec979..9b7d920bc8 100644
--- a/transformers/tests/modeling_auto_test.py
+++ b/transformers/tests/modeling_auto_test.py
@@ -18,11 +18,12 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import logging
 
 from transformers import is_torch_available
 
+from .utils import require_torch, slow
+
 if is_torch_available():
     from transformers import (AutoConfig, BertConfig,
                                     AutoModel, BertModel,
@@ -33,12 +34,11 @@ if is_torch_available():
 
     from .modeling_common_test import (CommonTestCases, ids_tensor)
     from .configuration_common_test import ConfigTester
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class AutoModelTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -53,7 +53,7 @@ class AutoModelTest(unittest.TestCase):
             for value in loading_info.values():
                 self.assertEqual(len(value), 0)
 
-    @pytest.mark.slow
+    @slow
     def test_lmhead_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -66,7 +66,7 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForMaskedLM)
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_classification_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -79,7 +79,7 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForSequenceClassification)
 
-    @pytest.mark.slow
+    @slow
     def test_question_answering_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index 6c93c9a187..539f66cd3f 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -18,12 +18,12 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (BertConfig, BertModel, BertForMaskedLM,
@@ -31,11 +31,9 @@ if is_torch_available():
                               BertForQuestionAnswering, BertForSequenceClassification,
                               BertForTokenClassification, BertForMultipleChoice)
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
-@pytest.mark.usefixtures("use_cuda")
+@require_torch
 class BertModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
@@ -67,7 +65,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                      num_labels=3,
                      num_choices=4,
                      scope=None,
-                     device='cpu',
                      ):
             self.parent = parent
             self.batch_size = batch_size
@@ -91,26 +88,25 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             self.num_labels = num_labels
             self.num_choices = num_choices
             self.scope = scope
-            self.device = device
 
         def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).to(self.device)
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             input_mask = None
             if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2).to(self.device)
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
 
             token_type_ids = None
             if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size).to(self.device)
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
 
             sequence_labels = None
             token_labels = None
             choice_labels = None
             if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size).to(self.device)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels).to(self.device)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices).to(self.device)
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = BertConfig(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -144,7 +140,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertModel(config=config)
-            model.to(input_ids.device)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -161,6 +157,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
             model = BertModel(config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
@@ -177,6 +174,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -190,6 +188,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
             model = BertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
@@ -204,6 +203,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForNextSentencePrediction(config=config)
+            model.to(torch_device)
             model.eval()
             loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
             result = {
@@ -217,6 +217,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForPreTraining(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                                     masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
@@ -235,6 +236,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
@@ -254,6 +256,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
@@ -268,6 +271,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
@@ -282,6 +286,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_choices = self.num_choices
             model = BertForMultipleChoice(config=config)
+            model.to(torch_device)
             model.eval()
             multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
@@ -313,10 +318,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
     def test_config(self):
         self.config_tester.run_common_tests()
 
-    def test_bert_model(self, use_cuda=False):
-        # ^^ This could be a real fixture
-        if use_cuda:
-            self.model_tester.device = "cuda"
+    def test_bert_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_model(*config_and_inputs)
 
@@ -356,7 +358,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index baf1531403..80d5d95455 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -27,10 +27,11 @@ import uuid
 
 import unittest
 import logging
-import pytest
 
 from transformers import is_torch_available
 
+from .utils import require_torch, slow, torch_device
+
 if is_torch_available():
     import torch
     import numpy as np
@@ -38,8 +39,6 @@ if is_torch_available():
     from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
                                     BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -65,6 +64,7 @@ def _config_zero_init(config):
 
 class CommonTestCases:
 
+    @require_torch
     class CommonModelTester(unittest.TestCase):
 
         model_tester = None
@@ -79,6 +79,7 @@ class CommonTestCases:
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 with torch.no_grad():
                     outputs = model(**inputs_dict)
@@ -86,12 +87,13 @@ class CommonTestCases:
                 with TemporaryDirectory() as tmpdirname:
                     model.save_pretrained(tmpdirname)
                     model = model_class.from_pretrained(tmpdirname)
+                    model.to(torch_device)
                     with torch.no_grad():
                         after_outputs = model(**inputs_dict)
 
                     # Make sure we don't have nans
-                    out_1 = after_outputs[0].numpy()
-                    out_2 = outputs[0].numpy()
+                    out_1 = after_outputs[0].cpu().numpy()
+                    out_2 = outputs[0].cpu().numpy()
                     out_1 = out_1[~np.isnan(out_1)]
                     out_2 = out_2[~np.isnan(out_2)]
                     max_diff = np.amax(np.abs(out_1 - out_2))
@@ -113,6 +115,7 @@ class CommonTestCases:
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
                 self.assertEqual(first.ne(second).sum().item(), 0)
@@ -125,6 +128,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 attentions = outputs[-1]
@@ -142,6 +146,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 self.assertEqual(out_len+1, len(outputs))
@@ -181,6 +186,7 @@ class CommonTestCases:
             configs_no_init.torchscript = True
             for model_class in self.all_model_classes:
                 model = model_class(config=configs_no_init)
+                model.to(torch_device)
                 model.eval()
                 inputs = inputs_dict['input_ids']  # Let's keep only input_ids
 
@@ -201,7 +207,10 @@ class CommonTestCases:
                 except ValueError:
                     self.fail("Couldn't load module.")
 
+                model.to(torch_device)
                 model.eval()
+
+                loaded_model.to(torch_device)
                 loaded_model.eval()
 
                 model_params = model.parameters()
@@ -228,11 +237,12 @@ class CommonTestCases:
             configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
             for model_class in self.all_model_classes:
                 model = model_class(config=configs_no_init)
+                model.to(torch_device)
                 model.eval()
 
                 # Prepare head_mask
                 # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
+                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
                 head_mask[0, 0] = 0
                 head_mask[-1, :-1] = 0
                 head_mask.requires_grad_(requires_grad=True)
@@ -282,6 +292,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                 -1: [0]}
@@ -310,6 +321,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                 -1: [0]}
@@ -319,6 +331,7 @@ class CommonTestCases:
                     os.makedirs(directory)
                 model.save_pretrained(directory)
                 model = model_class.from_pretrained(directory)
+                model.to(torch_device)
 
                 outputs = model(**inputs_dict)
                 attentions = outputs[-1]
@@ -346,6 +359,7 @@ class CommonTestCases:
                 config.pruned_heads = heads_to_prune
 
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
 
                 outputs = model(**inputs_dict)
@@ -372,6 +386,7 @@ class CommonTestCases:
                 config.pruned_heads = heads_to_prune
 
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
 
                 outputs = model(**inputs_dict)
@@ -388,6 +403,7 @@ class CommonTestCases:
                     os.makedirs(directory)
                 model.save_pretrained(directory)
                 model = model_class.from_pretrained(directory)
+                model.to(torch_device)
                 shutil.rmtree(directory)
 
                 outputs = model(**inputs_dict)
@@ -419,6 +435,7 @@ class CommonTestCases:
                 config.output_hidden_states = True
                 config.output_attentions = False
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 hidden_states = outputs[-1]
@@ -538,6 +555,7 @@ class CommonTestCases:
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
 
                 wte = model.get_input_embeddings()
@@ -628,6 +646,7 @@ class CommonTestCases:
         def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
                                 mc_labels, lm_labels, mc_token_ids):
             model = self.base_model_class(config)
+            model.to(torch_device)
             model.eval()
 
             outputs = model(input_ids, position_ids, token_type_ids)
@@ -643,6 +662,7 @@ class CommonTestCases:
         def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
                                         mc_labels, lm_labels, mc_token_ids):
             model = self.lm_head_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
             loss, lm_logits = outputs[:2]
@@ -659,6 +679,7 @@ class CommonTestCases:
                                         mc_labels, lm_labels, mc_token_ids):
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(input_ids)
                 presents = outputs[-1]
@@ -671,6 +692,7 @@ class CommonTestCases:
         def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
                                         mc_labels, lm_labels, mc_token_ids):
             model = self.double_head_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                             token_type_ids=token_type_ids, position_ids=position_ids)
@@ -716,7 +738,7 @@ class CommonTestCases:
                 config_and_inputs = self.prepare_config_and_inputs()
                 self.create_and_check_presents(*config_and_inputs)
 
-        @pytest.mark.slow
+        @slow
         def run_slow_tests(self):
             self.create_and_check_model_from_pretrained()
 
@@ -770,7 +792,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
     for _ in range(total_dims):
         values.append(rng.randint(0, vocab_size - 1))
 
-    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
 
 
 def floats_tensor(shape, scale=1.0, rng=None, name=None):
@@ -786,11 +808,12 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
     for _ in range(total_dims):
         values.append(rng.random() * scale)
 
-    return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
+    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
 
 
+@require_torch
 class ModelUtilsTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py
index 47ff8d8d51..8c14578a5c 100644
--- a/transformers/tests/modeling_ctrl_test.py
+++ b/transformers/tests/modeling_ctrl_test.py
@@ -16,7 +16,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 import shutil
 import pdb
 
@@ -25,13 +24,13 @@ from transformers import is_torch_available
 if is_torch_available():
     from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     CTRLLMHeadModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class CTRLModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
@@ -140,6 +139,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = CTRLModel(config=config)
+            model.to(torch_device)
             model.eval()
 
             model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
@@ -157,6 +157,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = CTRLLMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
@@ -202,7 +203,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py
index 8099c03586..4b8f64327d 100644
--- a/transformers/tests/modeling_distilbert_test.py
+++ b/transformers/tests/modeling_distilbert_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 
 from transformers import is_torch_available
 
@@ -25,13 +24,13 @@ if is_torch_available():
     from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
                                     DistilBertForTokenClassification,
                                     DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
@@ -126,6 +125,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = DistilBertModel(config=config)
+            model.to(torch_device)
             model.eval()
             (sequence_output,) = model(input_ids, input_mask)
             (sequence_output,) = model(input_ids)
@@ -139,6 +139,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = DistilBertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
             result = {
@@ -152,6 +153,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = DistilBertForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
             result = {
@@ -170,6 +172,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = DistilBertForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
             result = {
@@ -184,6 +187,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = DistilBertForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
 
             loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
@@ -229,7 +233,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
 
-    # @pytest.mark.slow
+    # @slow
     # def test_model_from_pretrained(self):
     #     cache_dir = "/tmp/transformers_test/"
     #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_encoder_decoder_test.py b/transformers/tests/modeling_encoder_decoder_test.py
index a6c88ed9a9..64e86df8f5 100644
--- a/transformers/tests/modeling_encoder_decoder_test.py
+++ b/transformers/tests/modeling_encoder_decoder_test.py
@@ -15,19 +15,18 @@
 
 import logging
 import unittest
-import pytest
 
 from transformers import is_torch_available
+from .utils import require_torch, slow
 
 if is_torch_available():
     from transformers import BertModel, BertForMaskedLM, Model2Model
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class EncoderDecoderModelTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model2model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py
index 4263e51bc9..ecaa2a4bd0 100644
--- a/transformers/tests/modeling_gpt2_test.py
+++ b/transformers/tests/modeling_gpt2_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 import shutil
 
 from transformers import is_torch_available
@@ -25,13 +24,13 @@ from transformers import is_torch_available
 if is_torch_available():
     from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     GPT2LMHeadModel, GPT2DoubleHeadsModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
@@ -136,6 +135,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = GPT2Model(config=config)
+            model.to(torch_device)
             model.eval()
 
             model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
@@ -153,6 +153,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
             model = GPT2LMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
@@ -171,6 +172,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
             model = GPT2DoubleHeadsModel(config)
+            model.to(torch_device)
             model.eval()
 
 
@@ -235,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py
index 33218288a0..8e4d13438d 100644
--- a/transformers/tests/modeling_openai_test.py
+++ b/transformers/tests/modeling_openai_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 import shutil
 
 from transformers import is_torch_available
@@ -25,13 +24,13 @@ from transformers import is_torch_available
 if is_torch_available():
     from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                     OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
@@ -124,6 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
             model = OpenAIGPTModel(config=config)
+            model.to(torch_device)
             model.eval()
 
             model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
@@ -139,6 +139,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
             model = OpenAIGPTLMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
@@ -157,6 +158,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
             model = OpenAIGPTDoubleHeadsModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
@@ -203,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py
index 0620ddf630..7a3553b164 100644
--- a/transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
@@ -27,13 +26,13 @@ if is_torch_available():
     from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
                               RobertaForSequenceClassification, RobertaForTokenClassification)
     from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class RobertaModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
@@ -129,6 +128,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
                                            token_labels, choice_labels):
             model = RobertaModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -146,6 +146,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
                                                    token_labels, choice_labels):
             model = RobertaForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -161,6 +162,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                                                               sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = RobertaForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                  labels=token_labels)
@@ -195,7 +197,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -207,10 +209,11 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
 
+@require_torch
 class RobertaModelIntegrationTest(unittest.TestCase):
 
-    @pytest.mark.slow
+    @slow
     def test_inference_masked_lm(self):
         model = RobertaForMaskedLM.from_pretrained('roberta-base')
-        
+
         input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         expected_shape = torch.Size((1, 11, 50265))
@@ -228,10 +231,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
             torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
         )
 
-    @pytest.mark.slow
+    @slow
     def test_inference_no_head(self):
         model = RobertaModel.from_pretrained('roberta-base')
-        
+
         input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         # compare the actual values for a slice.
@@ -244,10 +247,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
             torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
         )
 
-    @pytest.mark.slow
+    @slow
     def test_inference_classification_head(self):
         model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
-        
+
         input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         expected_shape = torch.Size((1, 3))
diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py
index fbd519b8f6..7d3325b70b 100644
--- a/transformers/tests/modeling_tf_albert_test.py
+++ b/transformers/tests/modeling_tf_albert_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import AlbertConfig, is_tf_available
 
@@ -31,10 +31,9 @@ if is_tf_available():
     from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
                                                  TFAlbertForSequenceClassification,
                                                  TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (
@@ -216,7 +215,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
         self.model_tester.create_and_check_albert_for_sequence_classification(
             *config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         # for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py
index fa90906e86..7ea48015d9 100644
--- a/transformers/tests/modeling_tf_auto_test.py
+++ b/transformers/tests/modeling_tf_auto_test.py
@@ -18,11 +18,12 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import logging
 
 from transformers import is_tf_available
 
+from .utils import require_tf, slow
+
 if is_tf_available():
     from transformers import (AutoConfig, BertConfig,
                                       TFAutoModel, TFBertModel,
@@ -33,12 +34,11 @@ if is_tf_available():
 
     from .modeling_common_test import (CommonTestCases, ids_tensor)
     from .configuration_common_test import ConfigTester
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFAutoModelTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         import h5py
         self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
@@ -54,7 +54,7 @@ class TFAutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertModel)
 
-    @pytest.mark.slow
+    @slow
     def test_lmhead_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -67,7 +67,7 @@ class TFAutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForMaskedLM)
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_classification_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -80,7 +80,7 @@ class TFAutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForSequenceClassification)
 
-    @pytest.mark.slow
+    @slow
     def test_question_answering_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py
index bcee97435e..d7a86fecb9 100644
--- a/transformers/tests/modeling_tf_bert_test.py
+++ b/transformers/tests/modeling_tf_bert_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import BertConfig, is_tf_available
 
@@ -36,10 +36,9 @@ if is_tf_available():
                                                        TFBertForTokenClassification,
                                                        TFBertForQuestionAnswering,
                                                        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction,
@@ -309,7 +308,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index 7445ce826a..439360ba35 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -25,18 +25,17 @@ import unittest
 import uuid
 import tempfile
 
-import pytest
 import sys
 
 from transformers import is_tf_available, is_torch_available
 
+from .utils import require_tf, slow
+
 if is_tf_available():
     import tensorflow as tf
     import numpy as np
     from transformers import TFPreTrainedModel
     # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -62,6 +61,7 @@ def _config_zero_init(config):
 
 class TFCommonTestCases:
 
+    @require_tf
     class TFCommonModelTester(unittest.TestCase):
 
         model_tester = None
@@ -164,7 +164,7 @@ class TFCommonTestCases:
             for model_class in self.all_model_classes:
                 # Prepare our model
                 model = model_class(config)
-                
+
                 # Let's load it from the disk to be sure we can use pretrained weights
                 with TemporaryDirectory() as tmpdirname:
                     outputs = model(inputs_dict)  # build the model
diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py
index a57c882169..0b421c20c9 100644
--- a/transformers/tests/modeling_tf_ctrl_test.py
+++ b/transformers/tests/modeling_tf_ctrl_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import CTRLConfig, is_tf_available
 
@@ -30,10 +30,9 @@ if is_tf_available():
     import tensorflow as tf
     from transformers.modeling_tf_ctrl import (TFCTRLModel, TFCTRLLMHeadModel,
                                                 TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel) if is_tf_available() else ()
@@ -188,7 +187,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_ctrl_lm_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py
index e6d3795914..0ec45150ca 100644
--- a/transformers/tests/modeling_tf_distilbert_test.py
+++ b/transformers/tests/modeling_tf_distilbert_test.py
@@ -17,10 +17,10 @@ from __future__ import division
 from __future__ import print_function
 
 import unittest
-import pytest
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import DistilBertConfig, is_tf_available
 
@@ -30,10 +30,9 @@ if is_tf_available():
                                                              TFDistilBertForMaskedLM,
                                                              TFDistilBertForQuestionAnswering,
                                                              TFDistilBertForSequenceClassification)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFDistilBertModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering,
@@ -210,7 +209,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
 
-    # @pytest.mark.slow
+    # @slow
     # def test_model_from_pretrained(self):
     #     cache_dir = "/tmp/transformers_test/"
     #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py
index 76e9ee2298..e070b72e65 100644
--- a/transformers/tests/modeling_tf_gpt2_test.py
+++ b/transformers/tests/modeling_tf_gpt2_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import GPT2Config, is_tf_available
 
@@ -31,10 +31,9 @@ if is_tf_available():
     from transformers.modeling_tf_gpt2 import (TFGPT2Model, TFGPT2LMHeadModel,
                                                        TFGPT2DoubleHeadsModel,
                                                        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFGPT2Model, TFGPT2LMHeadModel,
@@ -219,7 +218,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_gpt2_double_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py
index d470c8862d..675e806c12 100644
--- a/transformers/tests/modeling_tf_openai_gpt_test.py
+++ b/transformers/tests/modeling_tf_openai_gpt_test.py
@@ -18,11 +18,11 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import OpenAIGPTConfig, is_tf_available
 
@@ -31,10 +31,9 @@ if is_tf_available():
     from transformers.modeling_tf_openai import (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
                                                          TFOpenAIGPTDoubleHeadsModel,
                                                          TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel,
@@ -218,7 +217,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_openai_gpt_double_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py
index edbfa4e205..42440bf1b7 100644
--- a/transformers/tests/modeling_tf_roberta_test.py
+++ b/transformers/tests/modeling_tf_roberta_test.py
@@ -18,10 +18,10 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import RobertaConfig, is_tf_available
 
@@ -32,10 +32,9 @@ if is_tf_available():
                                                           TFRobertaForSequenceClassification,
                                                           TFRobertaForTokenClassification,
                                                           TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFRobertaModel,TFRobertaForMaskedLM,
@@ -191,7 +190,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -203,10 +202,11 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
 
+@require_tf
 class TFRobertaModelIntegrationTest(unittest.TestCase):
 
-    @pytest.mark.slow
+    @slow
     def test_inference_masked_lm(self):
         model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
-        
+
         input_ids = tf.constant([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         expected_shape = [1, 11, 50265]
@@ -224,10 +224,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
             numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
         )
 
-    @pytest.mark.slow
+    @slow
     def test_inference_no_head(self):
         model = TFRobertaModel.from_pretrained('roberta-base')
-        
+
         input_ids = tf.constant([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         # compare the actual values for a slice.
@@ -240,10 +240,10 @@ class TFRobertaModelIntegrationTest(unittest.TestCase):
             numpy.allclose(output[:, :3, :3].numpy(), expected_slice.numpy(), atol=1e-3)
         )
 
-    @pytest.mark.slow
+    @slow
     def test_inference_classification_head(self):
         model = TFRobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
-        
+
         input_ids = tf.constant([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
         output = model(input_ids)[0]
         expected_shape = [1, 3]
diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py
index 534fe39646..03e332bdc1 100644
--- a/transformers/tests/modeling_tf_transfo_xl_test.py
+++ b/transformers/tests/modeling_tf_transfo_xl_test.py
@@ -19,10 +19,10 @@ from __future__ import print_function
 import unittest
 import random
 import shutil
-import pytest
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import TransfoXLConfig, is_tf_available
 
@@ -31,10 +31,9 @@ if is_tf_available():
     from transformers.modeling_tf_transfo_xl import (TFTransfoXLModel,
                                                              TFTransfoXLLMHeadModel,
                                                              TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 
+@require_tf
 class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFTransfoXLModel, TFTransfoXLLMHeadModel) if is_tf_available() else ()
@@ -204,7 +203,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py
index 1bd661bebf..a680b70367 100644
--- a/transformers/tests/modeling_tf_xlm_test.py
+++ b/transformers/tests/modeling_tf_xlm_test.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_tf_available
 
@@ -29,13 +28,13 @@ if is_tf_available():
                                       TFXLMForSequenceClassification,
                                       TFXLMForQuestionAnsweringSimple,
                                       TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 
+@require_tf
 class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel,
@@ -251,7 +250,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py
index a00a965570..94864b86f2 100644
--- a/transformers/tests/modeling_tf_xlnet_test.py
+++ b/transformers/tests/modeling_tf_xlnet_test.py
@@ -21,7 +21,6 @@ import unittest
 import json
 import random
 import shutil
-import pytest
 
 from transformers import XLNetConfig, is_tf_available
 
@@ -33,12 +32,13 @@ if is_tf_available():
                                                         TFXLNetForTokenClassification,
                                                         TFXLNetForQuestionAnsweringSimple,
                                                         TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
+
+@require_tf
 class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
 
     all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel,
@@ -304,7 +304,7 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
     def test_xlnet_lm_head(self):
         self.model_tester.set_seed()
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) 
+        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
 
     def test_xlnet_sequence_classif(self):
         self.model_tester.set_seed()
@@ -320,7 +320,7 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py
index f7b913da5b..647dd3724d 100644
--- a/transformers/tests/modeling_transfo_xl_test.py
+++ b/transformers/tests/modeling_transfo_xl_test.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 import unittest
 import random
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
@@ -27,12 +26,13 @@ if is_torch_available():
     import torch
     from transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
     from transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
+
+@require_torch
 class TransfoXLModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel) if is_torch_available() else ()
@@ -111,6 +111,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
 
         def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
             model = TransfoXLModel(config)
+            model.to(torch_device)
             model.eval()
 
             hidden_states_1, mems_1 = model(input_ids_1)
@@ -140,6 +141,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
 
         def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
             model = TransfoXLLMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             lm_logits_1, mems_1 = model(input_ids_1)
@@ -204,7 +206,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
         output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
         self.model_tester.check_transfo_xl_lm_head_output(output_result)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py
index 0133febb58..f6b980767c 100644
--- a/transformers/tests/modeling_xlm_test.py
+++ b/transformers/tests/modeling_xlm_test.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
@@ -26,13 +25,13 @@ if is_torch_available():
     from transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
                                       XLMForSequenceClassification, XLMForQuestionAnsweringSimple)
     from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 
+@require_torch
 class XLMModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering,
@@ -148,6 +147,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMModel(config=config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
             outputs = model(input_ids, langs=token_type_ids)
@@ -163,6 +163,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMWithLMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss, logits = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
@@ -182,6 +183,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xlm_simple_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMForQuestionAnsweringSimple(config)
+            model.to(torch_device)
             model.eval()
 
             outputs = model(input_ids)
@@ -206,6 +208,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMForQuestionAnswering(config)
+            model.to(torch_device)
             model.eval()
 
             outputs = model(input_ids)
@@ -260,6 +263,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
             model = XLMForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
 
             (logits,) = model(input_ids)
@@ -312,7 +316,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py
index 38888d4488..56b6bb3f4d 100644
--- a/transformers/tests/modeling_xlnet_test.py
+++ b/transformers/tests/modeling_xlnet_test.py
@@ -21,7 +21,6 @@ import unittest
 import json
 import random
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
@@ -31,12 +30,13 @@ if is_torch_available():
     from transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification,
                               XLNetForTokenClassification, XLNetForQuestionAnswering)
     from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
+
+@require_torch
 class XLNetModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes=(XLNetModel, XLNetLMHeadModel, XLNetForTokenClassification,
@@ -100,9 +100,9 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             input_mask = ids_tensor([self.batch_size, self.seq_length], 2).float()
 
             input_ids_q = ids_tensor([self.batch_size, self.seq_length + 1], self.vocab_size)
-            perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float)
+            perm_mask = torch.zeros(self.batch_size, self.seq_length + 1, self.seq_length + 1, dtype=torch.float, device=torch_device)
             perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-            target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float)
+            target_mapping = torch.zeros(self.batch_size, 1, self.seq_length + 1, dtype=torch.float, device=torch_device)
             target_mapping[:, 0, -1] = 1.0  # predict last token
 
             sequence_labels = None
@@ -141,6 +141,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_base_model(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetModel(config)
+            model.to(torch_device)
             model.eval()
 
             _, _ = model(input_ids_1, input_mask=input_mask)
@@ -155,6 +156,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
 
             config.mem_len = 0
             model = XLNetModel(config)
+            model.to(torch_device)
             model.eval()
             no_mems_outputs = model(input_ids_1)
             self.parent.assertEqual(len(no_mems_outputs), 1)
@@ -169,6 +171,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_base_model_with_att_output(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                     target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetModel(config)
+            model.to(torch_device)
             model.eval()
 
             _, _, attentions = model(input_ids_1, target_mapping=target_mapping)
@@ -181,6 +184,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetLMHeadModel(config)
+            model.to(torch_device)
             model.eval()
 
             loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels)
@@ -221,6 +225,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetForQuestionAnswering(config)
+            model.to(torch_device)
             model.eval()
 
             outputs = model(input_ids_1)
@@ -279,6 +284,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_token_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetForTokenClassification(config)
+            model.to(torch_device)
             model.eval()
 
             logits, mems_1 = model(input_ids_1)
@@ -311,6 +317,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
                 target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels, token_labels):
             model = XLNetForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
 
             logits, mems_1 = model(input_ids_1)
@@ -362,7 +369,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
     def test_xlnet_lm_head(self):
         self.model_tester.set_seed()
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs) 
+        self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
 
     def test_xlnet_sequence_classif(self):
         self.model_tester.set_seed()
@@ -379,7 +386,7 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/optimization_test.py b/transformers/tests/optimization_test.py
index ab9afbfcf7..cc10ad5908 100644
--- a/transformers/tests/optimization_test.py
+++ b/transformers/tests/optimization_test.py
@@ -18,7 +18,6 @@ from __future__ import print_function
 
 import unittest
 import os
-import pytest
 
 from transformers import is_torch_available
 
@@ -31,10 +30,9 @@ if is_torch_available():
                               get_cosine_schedule_with_warmup,
                               get_cosine_with_hard_restarts_schedule_with_warmup,
                               get_linear_schedule_with_warmup)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 from .tokenization_tests_commons import TemporaryDirectory
+from .utils import require_torch
 
 
 def unwrap_schedule(scheduler, num_steps=10):
@@ -58,6 +56,7 @@ def unwrap_and_save_reload_schedule(scheduler, num_steps=10):
                 scheduler.load_state_dict(state_dict)
     return lrs
 
+@require_torch
 class OptimizationTest(unittest.TestCase):
 
     def assertListAlmostEqual(self, list1, list2, tol):
@@ -80,6 +79,7 @@ class OptimizationTest(unittest.TestCase):
         self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
 
 
+@require_torch
 class ScheduleInitTest(unittest.TestCase):
     m = torch.nn.Linear(50, 50) if is_torch_available() else None
     optimizer = AdamW(m.parameters(), lr=10.) if is_torch_available() else None
diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py
index 79370811e8..18346d2768 100644
--- a/transformers/tests/tokenization_auto_test.py
+++ b/transformers/tests/tokenization_auto_test.py
@@ -18,15 +18,16 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import logging
 
 from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
 from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
+from .utils import slow
+
 
 class AutoTokenizerTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_tokenizer_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index 73ea38e20a..f390248956 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 from io import open
 
 from transformers.tokenization_bert import (BasicTokenizer,
@@ -26,6 +25,7 @@ from transformers.tokenization_bert import (BasicTokenizer,
                                                     _is_whitespace, VOCAB_FILES_NAMES)
 
 from .tokenization_tests_commons import CommonTestCases
+from .utils import slow
 
 class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
@@ -126,7 +126,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertFalse(_is_punctuation(u"A"))
         self.assertFalse(_is_punctuation(u" "))
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
 
diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py
index 77a487651d..e815eca672 100644
--- a/transformers/tests/tokenization_distilbert_test.py
+++ b/transformers/tests/tokenization_distilbert_test.py
@@ -16,13 +16,13 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 from io import open
 
 from transformers.tokenization_distilbert import (DistilBertTokenizer)
 
 from .tokenization_tests_commons import CommonTestCases
 from .tokenization_bert_test import BertTokenizationTest
+from .utils import slow
 
 class DistilBertTokenizationTest(BertTokenizationTest):
 
@@ -31,7 +31,7 @@ class DistilBertTokenizationTest(BertTokenizationTest):
     def get_tokenizer(self, **kwargs):
         return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
 
diff --git a/transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py
index a27bf7d654..8ad0b59511 100644
--- a/transformers/tests/tokenization_roberta_test.py
+++ b/transformers/tests/tokenization_roberta_test.py
@@ -17,11 +17,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import json
 import unittest
-import pytest
 from io import open
 
 from transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
 from .tokenization_tests_commons import CommonTestCases
+from .utils import slow
 
 
 class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
@@ -79,7 +79,7 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
             [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
         )
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
 
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index 97cd555df3..faff003f4b 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -102,9 +102,11 @@ class CommonTestCases:
             with TemporaryDirectory() as tmpdirname:
 
                 filename = os.path.join(tmpdirname, u"tokenizer.bin")
-                pickle.dump(tokenizer, open(filename, "wb"))
+                with open(filename, "wb") as handle:
+                    pickle.dump(tokenizer, handle)
 
-                tokenizer_new = pickle.load(open(filename, "rb"))
+                with open(filename, "rb") as handle:
+                    tokenizer_new = pickle.load(handle)
 
             subwords_loaded = tokenizer_new.tokenize(text)
 
diff --git a/transformers/tests/tokenization_transfo_xl_test.py b/transformers/tests/tokenization_transfo_xl_test.py
index 4e99484b0c..5495ebd3a6 100644
--- a/transformers/tests/tokenization_transfo_xl_test.py
+++ b/transformers/tests/tokenization_transfo_xl_test.py
@@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 from io import open
 
 from transformers import is_torch_available
@@ -24,11 +23,12 @@ from transformers import is_torch_available
 if is_torch_available():
     import torch
     from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
-else:
-    pytestmark = pytest.mark.skip("Require Torch")  # TODO: untangle Transfo-XL tokenizer from torch.load and torch.save
 
 from .tokenization_tests_commons import CommonTestCases
+from .utils import require_torch
 
+
+@require_torch
 class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
     tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
diff --git a/transformers/tests/tokenization_utils_test.py b/transformers/tests/tokenization_utils_test.py
index 8630191c69..ff3f80ff7d 100644
--- a/transformers/tests/tokenization_utils_test.py
+++ b/transformers/tests/tokenization_utils_test.py
@@ -18,13 +18,14 @@ from __future__ import print_function
 
 import unittest
 import six
-import pytest
 
 from transformers import PreTrainedTokenizer
 from transformers.tokenization_gpt2 import GPT2Tokenizer
 
+from .utils import slow
+
 class TokenizerUtilsTest(unittest.TestCase):
-    @pytest.mark.slow
+
     def check_tokenizer_from_pretrained(self, tokenizer_class):
         s3_models = list(tokenizer_class.max_model_input_sizes.keys())
         for model_name in s3_models[:1]:
@@ -41,6 +42,7 @@ class TokenizerUtilsTest(unittest.TestCase):
                 special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
                 self.assertIsInstance(special_tok_id, int)
 
+    @slow
     def test_pretrained_tokenizers(self):
         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
 
diff --git a/transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py
index 3ff6564e34..7582a46662 100644
--- a/transformers/tests/tokenization_xlm_test.py
+++ b/transformers/tests/tokenization_xlm_test.py
@@ -17,11 +17,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import os
 import unittest
 import json
-import pytest
 
 from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
 
 from .tokenization_tests_commons import CommonTestCases
+from .utils import slow
 
 class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
@@ -67,7 +67,7 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
 
diff --git a/transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py
index 2e14ffeb82..b68495a796 100644
--- a/transformers/tests/tokenization_xlnet_test.py
+++ b/transformers/tests/tokenization_xlnet_test.py
@@ -16,11 +16,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 
 from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
 
 from .tokenization_tests_commons import CommonTestCases
+from .utils import slow
 
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'fixtures/test_sentencepiece.model')
@@ -90,7 +90,7 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
                                       u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                       SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
 
diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py
new file mode 100644
index 0000000000..7a51ab612b
--- /dev/null
+++ b/transformers/tests/utils.py
@@ -0,0 +1,64 @@
+import os
+import unittest
+
+from distutils.util import strtobool
+
+from transformers.file_utils import _tf_available, _torch_available
+
+
+if "RUN_SLOW" in os.environ:
+    # RUN_SLOW is set: let strtobool turn the yes/no style value into 1 or 0.
+    run_slow = os.environ["RUN_SLOW"]
+    try:
+        _run_slow_tests = strtobool(run_slow)
+    except ValueError:
+        # strtobool rejected the value. It accepts more spellings than yes
+        # and no, but let's keep the message simple.
+        raise ValueError("If set, RUN_SLOW must be yes or no.")
+else:
+    # RUN_SLOW isn't set, default to skipping slow tests.
+    _run_slow_tests = False
+
+
+def slow(test_case):
+    """
+    Mark a test as slow.
+
+    A slow test only runs when the RUN_SLOW environment variable is set to
+    a truthy value; by default it is wrapped with ``unittest.skip`` and
+    reported as skipped.
+    """
+    if _run_slow_tests:
+        return test_case
+    return unittest.skip("test is slow")(test_case)
+
+
+def require_torch(test_case):
+    """
+    Mark a test as depending on PyTorch.
+
+    The test is returned unchanged when PyTorch is available; otherwise it
+    is wrapped with ``unittest.skip`` and reported as skipped.
+    """
+    if _torch_available:
+        return test_case
+    return unittest.skip("test requires PyTorch")(test_case)
+
+
+def require_tf(test_case):
+    """
+    Mark a test as depending on TensorFlow.
+
+    The test is returned unchanged when TensorFlow is available; otherwise
+    it is wrapped with ``unittest.skip`` and reported as skipped.
+    """
+    if _tf_available:
+        return test_case
+    return unittest.skip("test requires TensorFlow")(test_case)
+
+
+torch_device = None
+if _torch_available:
+    # USE_CUDA selects a GPU unless unset, empty, or a falsy spelling such as 0 or no.
+    _cuda_flag = os.environ.get("USE_CUDA", "").strip().lower()
+    torch_device = "cpu" if _cuda_flag in ("", "0", "n", "no", "f", "false", "off") else "cuda"
diff --git a/transformers/tokenization_albert.py b/transformers/tokenization_albert.py
index 40a4b29206..6b92d07218 100644
--- a/transformers/tokenization_albert.py
+++ b/transformers/tokenization_albert.py
@@ -141,7 +141,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
         new_pieces = []
         for piece in pieces:
-            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
+            if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
                 cur_pieces = self.sp_model.EncodeAsPieces(
                     piece[:-1].replace(SPIECE_UNDERLINE, ''))
                 if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
@@ -225,9 +225,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An ALBERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 
-        | first sequence    | second sequence     
-        
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+
         if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]
diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py
index 9454cbbaf3..219f17c404 100644
--- a/transformers/tokenization_ctrl.py
+++ b/transformers/tokenization_ctrl.py
@@ -133,9 +133,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
         self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
         self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
 
-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
         self.decoder = {v:k for k,v in self.encoder.items()}
-        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+        with open(merges_file, encoding='utf-8') as merges_handle:
+            merges = merges_handle.read().split('\n')[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py
index 5fda709448..68c6101860 100644
--- a/transformers/tokenization_gpt2.py
+++ b/transformers/tokenization_gpt2.py
@@ -72,7 +72,7 @@ def bytes_to_unicode():
     """
     Returns list of utf-8 byte and a mapping to unicode strings.
     We specifically avoids mapping to whitespace/control characters the bpe code barfs on.
-    
+
     The reversible bpe codes work on unicode strings.
     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
@@ -122,13 +122,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
         self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
 
-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
         self.byte_encoder = bytes_to_unicode()
         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
+        with open(merges_file, encoding='utf-8') as merges_handle:
+            bpe_merges = merges_handle.read().split('\n')[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
         self.cache = {}
 
@@ -234,4 +236,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                 writer.write(' '.join(bpe_tokens) + u'\n')
                 index += 1
 
-        return vocab_file, merge_file
\ No newline at end of file
+        return vocab_file, merge_file
diff --git a/transformers/tokenization_openai.py b/transformers/tokenization_openai.py
index 0efbdb37c0..a4c64b7020 100644
--- a/transformers/tokenization_openai.py
+++ b/transformers/tokenization_openai.py
@@ -101,9 +101,11 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
             self.nlp = BasicTokenizer(do_lower_case=True)
             self.fix_text = None
 
-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
         self.decoder = {v:k for k,v in self.encoder.items()}
-        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+        with open(merges_file, encoding='utf-8') as merges_handle:
+            merges = merges_handle.read().split('\n')[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 5d683629f0..4c6cbd8986 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -347,7 +347,7 @@ class PreTrainedTokenizer(object):
                     "We assumed '{}' was a path or url to a directory containing vocabulary files "
                     "named {} but couldn't find such vocabulary files at this path or url.".format(
                         pretrained_model_name_or_path, ', '.join(s3_models),
-                        pretrained_model_name_or_path, 
+                        pretrained_model_name_or_path,
                         list(cls.vocab_files_names.values())))
 
         # Get files from url, cache, or disk depending on the case
@@ -382,7 +382,8 @@ class PreTrainedTokenizer(object):
         # Did we saved some inputs and kwargs to reload ?
         tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
         if tokenizer_config_file is not None:
-            init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8"))
+            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
+                init_kwargs = json.load(tokenizer_config_handle)
             saved_init_inputs = init_kwargs.pop('init_inputs', ())
             if not init_inputs:
                 init_inputs = saved_init_inputs
@@ -407,7 +408,8 @@ class PreTrainedTokenizer(object):
             if args_name not in init_kwargs:
                 init_kwargs[args_name] = file_path
         if special_tokens_map_file is not None:
-            special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
+            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
+                special_tokens_map = json.load(special_tokens_map_handle)
             for key, value in special_tokens_map.items():
                 if key not in init_kwargs:
                     init_kwargs[key] = value
@@ -421,7 +423,8 @@ class PreTrainedTokenizer(object):
 
         # Add supplementary tokens.
         if added_tokens_file is not None:
-            added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8"))
+            with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
+                added_tok_encoder = json.load(added_tokens_handle)
             added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
             tokenizer.added_tokens_encoder.update(added_tok_encoder)
             tokenizer.added_tokens_decoder.update(added_tok_decoder)
@@ -937,7 +940,7 @@ class PreTrainedTokenizer(object):
             logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
                            "for this model ({} > {}). Running this sequence through the model will result in "
                            "indexing errors".format(len(ids), self.max_len))
-                           
+
         return encoded_inputs
 
     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
index ba994dc356..6c9f8e5e5c 100644
--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -524,7 +524,7 @@ class XLMTokenizer(PreTrainedTokenizer):
 
         - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
         (ex: "__classify__") to a vocabulary
-        
+
         - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
 
         - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
@@ -564,9 +564,11 @@ class XLMTokenizer(PreTrainedTokenizer):
         self.ja_word_tokenizer = None
         self.zh_word_tokenizer = None
 
-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
         self.decoder = {v:k for k,v in self.encoder.items()}
-        merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
+        with open(merges_file, encoding='utf-8') as merges_handle:
+            merges = merges_handle.read().split('\n')[:-1]
         merges = [tuple(merge.split()[:2]) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py
index c01fbbbeeb..8c86a5bd60 100644
--- a/transformers/tokenization_xlnet.py
+++ b/transformers/tokenization_xlnet.py
@@ -141,7 +141,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
             pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
         new_pieces = []
         for piece in pieces:
-            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
+            if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
                 cur_pieces = self.sp_model.EncodeAsPieces(
                     piece[:-1].replace(SPIECE_UNDERLINE, ''))
                 if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
@@ -227,7 +227,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         An XLNet sequence pair mask has the following format:
         0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
         | first sequence    | second sequence     | CLS segment ID
-        
+
         if token_ids_1 is None, only returns the first portion of the mask (0's).
         """
         sep = [self.sep_token_id]