Remove dependency on pytest for running tests (#2055)

* Switch to plain unittest for skipping slow tests. Add a RUN_SLOW environment variable for running them. * Switch to plain unittest for PyTorch dependency. * Switch to plain unittest for TensorFlow dependency. * Avoid leaking open files in the test suite. This prevents spurious warnings when running tests. * Fix unicode warning on Python 2 when running tests. The warning was: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal * Support running PyTorch tests on a GPU. Reverts 27e015bd. * Tests no longer require pytest. * Make tests pass on cuda
2019-12-06 19:57:38 +01:00
parent e4679cddce
commit 35401fe50f
50 changed files with 344 additions and 231 deletions
--- a/transformers/tokenization_gpt2.py
+++ b/transformers/tokenization_gpt2.py
@@ -72,7 +72,7 @@ def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a mapping to unicode strings.
    We specifically avoids mapping to whitespace/control characters the bpe code barfs on.
-    
+
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
@@ -122,13 +122,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
        self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens

-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
+        with open(merges_file, encoding='utf-8') as merges_handle:
+            bpe_merges = merges_handle.read().split('\n')[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

@@ -234,4 +236,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
                writer.write(' '.join(bpe_tokens) + u'\n')
                index += 1

-        return vocab_file, merge_file
+        return vocab_file, merge_file