Remove dependency on pytest for running tests (#2055)
* Switch to plain unittest for skipping slow tests.
Add a RUN_SLOW environment variable for running them.
* Switch to plain unittest for PyTorch dependency.
* Switch to plain unittest for TensorFlow dependency.
* Avoid leaking open files in the test suite.
This prevents spurious warnings when running tests.
* Fix unicode warning on Python 2 when running tests.
The warning was:
UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
* Support running PyTorch tests on a GPU.
Reverts 27e015bd.
* Tests no longer require pytest.
* Make tests pass on cuda
This commit is contained in:
committed by
Julien Chaumond
parent
e4679cddce
commit
35401fe50f
@@ -72,7 +72,7 @@ def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a mapping to unicode strings.
|
||||
We specifically avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
|
||||
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
@@ -122,13 +122,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
|
||||
|
||||
self.encoder = json.load(open(vocab_file, encoding="utf-8"))
|
||||
with open(vocab_file, encoding="utf-8") as vocab_handle:
|
||||
self.encoder = json.load(vocab_handle)
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
self.errors = errors # how to handle errors in decoding
|
||||
self.byte_encoder = bytes_to_unicode()
|
||||
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
|
||||
bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
|
||||
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
|
||||
with open(merges_file, encoding='utf-8') as merges_handle:
|
||||
bpe_merges = merges_handle.read().split('\n')[1:-1]
|
||||
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
|
||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||
self.cache = {}
|
||||
|
||||
@@ -234,4 +236,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
writer.write(' '.join(bpe_tokens) + u'\n')
|
||||
index += 1
|
||||
|
||||
return vocab_file, merge_file
|
||||
return vocab_file, merge_file
|
||||
|
||||
Reference in New Issue
Block a user