clean up all byte-level bpe tests
This commit is contained in:
@@ -57,12 +57,12 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
def test_full_tokenizer(self):
|
def test_full_tokenizer(self):
|
||||||
tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
||||||
text = "lower newer"
|
text = "lower newer"
|
||||||
bpe_tokens = ["\u0120low", "er", "\u0120newer"]
|
bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
|
||||||
tokens = tokenizer.tokenize(text)
|
tokens = tokenizer.tokenize(text)
|
||||||
self.assertListEqual(tokens, bpe_tokens)
|
self.assertListEqual(tokens, bpe_tokens)
|
||||||
|
|
||||||
input_tokens = tokens + [tokenizer.unk_token]
|
input_tokens = tokens + [tokenizer.unk_token]
|
||||||
input_bpe_tokens = [14, 15, 19]
|
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||||
|
|
||||||
|
|||||||
@@ -55,13 +55,13 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
|
|||||||
|
|
||||||
def test_full_tokenizer(self):
|
def test_full_tokenizer(self):
|
||||||
tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
|
||||||
text = "lower"
|
text = "lower newer"
|
||||||
bpe_tokens = ["\u0120low", "er"]
|
bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
|
||||||
tokens = tokenizer.tokenize(text)
|
tokens = tokenizer.tokenize(text)
|
||||||
self.assertListEqual(tokens, bpe_tokens)
|
self.assertListEqual(tokens, bpe_tokens)
|
||||||
|
|
||||||
input_tokens = tokens + [tokenizer.unk_token]
|
input_tokens = tokens + [tokenizer.unk_token]
|
||||||
input_bpe_tokens = [14, 15, 19]
|
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
|
||||||
|
|
||||||
|
|||||||
@@ -64,13 +64,14 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
|
|||||||
@lru_cache()
|
@lru_cache()
|
||||||
def bytes_to_unicode():
|
def bytes_to_unicode():
|
||||||
"""
|
"""
|
||||||
Returns list of utf-8 bytes and a corresponding list of unicode strings.
|
Returns list of utf-8 bytes and a mapping to unicode strings.
|
||||||
|
We specifically avoid mapping to whitespace/control characters the bpe code barfs on.
|
||||||
|
|
||||||
The reversible bpe codes work on unicode strings.
|
The reversible bpe codes work on unicode strings.
|
||||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
|
||||||
"""
|
"""
|
||||||
_chr = unichr if sys.version_info[0] == 2 else chr
|
_chr = unichr if sys.version_info[0] == 2 else chr
|
||||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||||
@@ -176,9 +177,9 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
|||||||
bpe_tokens = []
|
bpe_tokens = []
|
||||||
for token in re.findall(self.pat, text):
|
for token in re.findall(self.pat, text):
|
||||||
if sys.version_info[0] == 2:
|
if sys.version_info[0] == 2:
|
||||||
token = ''.join(self.byte_encoder[ord(b)] for b in token)
|
token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
|
||||||
else:
|
else:
|
||||||
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
|
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
|
||||||
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
|
||||||
return bpe_tokens
|
return bpe_tokens
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user