Reformat source code with black.
This is the result of:
$ black --line-length 119 examples templates transformers utils hubconf.py setup.py
There's a lot of fairly long lines in the project. As a consequence, I'm
picking the longest widely accepted line length, 119 characters.
This is also Thomas' preference, because it allows for explicit variable
names, to make the code easier to understand.
This commit is contained in:
@@ -21,6 +21,7 @@ from torch.utils.data import Dataset
|
||||
import numpy as np
|
||||
from utils import logger
|
||||
|
||||
|
||||
class LmSeqsDataset(Dataset):
|
||||
"""Custom Dataset wrapping language modeling sequences.
|
||||
|
||||
@@ -32,9 +33,7 @@ class LmSeqsDataset(Dataset):
|
||||
data: `List[np.array[int]]
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
params,
|
||||
data):
|
||||
def __init__(self, params, data):
|
||||
self.params = params
|
||||
|
||||
self.token_ids = np.array(data)
|
||||
@@ -57,7 +56,7 @@ class LmSeqsDataset(Dataset):
|
||||
Some sanity checks
|
||||
"""
|
||||
assert len(self.token_ids) == len(self.lengths)
|
||||
assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))
|
||||
assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths)))
|
||||
|
||||
def remove_long_sequences(self):
|
||||
"""
|
||||
@@ -65,17 +64,17 @@ class LmSeqsDataset(Dataset):
|
||||
"""
|
||||
max_len = self.params.max_model_input_size
|
||||
indices = self.lengths > max_len
|
||||
logger.info(f'Splitting {sum(indices)} too long sequences.')
|
||||
logger.info(f"Splitting {sum(indices)} too long sequences.")
|
||||
|
||||
def divide_chunks(l, n):
|
||||
return [l[i:i + n] for i in range(0, len(l), n)]
|
||||
return [l[i : i + n] for i in range(0, len(l), n)]
|
||||
|
||||
new_tok_ids = []
|
||||
new_lengths = []
|
||||
if self.params.mlm:
|
||||
cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
|
||||
cls_id, sep_id = self.params.special_tok_ids["cls_token"], self.params.special_tok_ids["sep_token"]
|
||||
else:
|
||||
cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token']
|
||||
cls_id, sep_id = self.params.special_tok_ids["bos_token"], self.params.special_tok_ids["eos_token"]
|
||||
|
||||
for seq_, len_ in zip(self.token_ids, self.lengths):
|
||||
assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
|
||||
@@ -84,7 +83,7 @@ class LmSeqsDataset(Dataset):
|
||||
new_lengths.append(len_)
|
||||
else:
|
||||
sub_seqs = []
|
||||
for sub_s in divide_chunks(seq_, max_len-2):
|
||||
for sub_s in divide_chunks(seq_, max_len - 2):
|
||||
if sub_s[0] != cls_id:
|
||||
sub_s = np.insert(sub_s, 0, cls_id)
|
||||
if sub_s[-1] != sep_id:
|
||||
@@ -108,7 +107,7 @@ class LmSeqsDataset(Dataset):
|
||||
self.token_ids = self.token_ids[indices]
|
||||
self.lengths = self.lengths[indices]
|
||||
new_size = len(self)
|
||||
logger.info(f'Remove {init_size - new_size} too short (<=11 tokens) sequences.')
|
||||
logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
|
||||
|
||||
def print_statistics(self):
|
||||
"""
|
||||
@@ -116,7 +115,7 @@ class LmSeqsDataset(Dataset):
|
||||
"""
|
||||
if not self.params.is_master:
|
||||
return
|
||||
logger.info(f'{len(self)} sequences')
|
||||
logger.info(f"{len(self)} sequences")
|
||||
# data_len = sum(self.lengths)
|
||||
# nb_unique_tokens = len(Counter(list(chain(*self.token_ids))))
|
||||
# logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')
|
||||
@@ -125,8 +124,7 @@ class LmSeqsDataset(Dataset):
|
||||
# nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
|
||||
# logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
|
||||
|
||||
def batch_sequences(self,
|
||||
batch):
|
||||
def batch_sequences(self, batch):
|
||||
"""
|
||||
Do the padding and transform into torch.tensor.
|
||||
"""
|
||||
@@ -139,13 +137,13 @@ class LmSeqsDataset(Dataset):
|
||||
|
||||
# Pad token ids
|
||||
if self.params.mlm:
|
||||
pad_idx = self.params.special_tok_ids['pad_token']
|
||||
pad_idx = self.params.special_tok_ids["pad_token"]
|
||||
else:
|
||||
pad_idx = self.params.special_tok_ids['unk_token']
|
||||
tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
|
||||
pad_idx = self.params.special_tok_ids["unk_token"]
|
||||
tk_ = [list(t.astype(int)) + [pad_idx] * (max_seq_len_ - len(t)) for t in token_ids]
|
||||
assert len(tk_) == len(token_ids)
|
||||
assert all(len(t) == max_seq_len_ for t in tk_)
|
||||
|
||||
tk_t = torch.tensor(tk_) # (bs, max_seq_len_)
|
||||
tk_t = torch.tensor(tk_) # (bs, max_seq_len_)
|
||||
lg_t = torch.tensor(lengths) # (bs)
|
||||
return tk_t, lg_t
|
||||
|
||||
Reference in New Issue
Block a user