Reformat source code with black.

This is the result of:

    $ black --line-length 119 examples templates transformers utils hubconf.py setup.py

There are a lot of fairly long lines in the project. As a consequence, I'm
picking the longest widely accepted line length, 119 characters.

This is also Thomas' preference, because it allows for explicit variable
names, which make the code easier to understand.
This commit is contained in:
Aymeric Augustin
2019-12-21 15:46:46 +01:00
parent 63e3827c6b
commit fa84ae26d6
200 changed files with 17452 additions and 12594 deletions

View File

@@ -23,68 +23,65 @@ import numpy as np
from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
import logging
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO)
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
)
logger = logging.getLogger(__name__)
def main():
parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
parser.add_argument('--file_path', type=str, default='data/dump.txt',
help='The path to the data.')
parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2'])
parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
help="The tokenizer to use.")
parser.add_argument('--dump_file', type=str, default='data/dump',
help='The dump file prefix.')
parser = argparse.ArgumentParser(
description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
)
parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.")
parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"])
parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.")
parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.")
args = parser.parse_args()
logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
if args.tokenizer_type == 'bert':
logger.info(f"Loading Tokenizer ({args.tokenizer_name})")
if args.tokenizer_type == "bert":
tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]`
sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]`
elif args.tokenizer_type == 'roberta':
bos = tokenizer.special_tokens_map["cls_token"] # `[CLS]`
sep = tokenizer.special_tokens_map["sep_token"] # `[SEP]`
elif args.tokenizer_type == "roberta":
tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
bos = tokenizer.special_tokens_map['cls_token'] # `<s>`
sep = tokenizer.special_tokens_map['sep_token'] # `</s>`
elif args.tokenizer_type == 'gpt2':
bos = tokenizer.special_tokens_map["cls_token"] # `<s>`
sep = tokenizer.special_tokens_map["sep_token"] # `</s>`
elif args.tokenizer_type == "gpt2":
tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>`
sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>`
bos = tokenizer.special_tokens_map["bos_token"] # `<|endoftext|>`
sep = tokenizer.special_tokens_map["eos_token"] # `<|endoftext|>`
logger.info(f'Loading text from {args.file_path}')
with open(args.file_path, 'r', encoding='utf8') as fp:
logger.info(f"Loading text from {args.file_path}")
with open(args.file_path, "r", encoding="utf8") as fp:
data = fp.readlines()
logger.info(f'Start encoding')
logger.info(f'{len(data)} examples to process.')
logger.info(f"Start encoding")
logger.info(f"{len(data)} examples to process.")
rslt = []
iter = 0
interval = 10000
start = time.time()
for text in data:
text = f'{bos} {text.strip()} {sep}'
text = f"{bos} {text.strip()} {sep}"
token_ids = tokenizer.encode(text, add_special_tokens=False)
rslt.append(token_ids)
iter += 1
if iter % interval == 0:
end = time.time()
logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl')
logger.info(f"{iter} examples processed. - {(end-start)/interval:.2f}s/expl")
start = time.time()
logger.info('Finished binarization')
logger.info(f'{len(data)} examples processed.')
logger.info("Finished binarization")
logger.info(f"{len(data)} examples processed.")
dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
rslt_ = [np.uint16(d) for d in rslt]
random.shuffle(rslt_)
logger.info(f'Dump to {dp_file}')
with open(dp_file, 'wb') as handle:
logger.info(f"Dump to {dp_file}")
with open(dp_file, "wb") as handle:
pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)