Add CodeParrot 🦜 codebase (#14536)
* add readme skeleton
* update readme
* add initialization script
* add deduplication script
* add codeparrot training script
* add code generation evaluation
* add validation loss script
* add requirements
* update readme
* tweak readme
* make style
* add highlights to readme
* add CLIs to scripts
* add tokenizer training script
* add docstring to constant length dataset
* fix defaults in arguments
* update readme with cli
* move image to hub
* tweaks of readme
* fix cli commands
* add author
* explain env variables
* fix formatting
* Update examples/research_projects/codeparrot/README.md
  Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>
* Apply suggestions from code review
  Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>
* replace generic with gpt2 tokenizer
  Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>
This commit is contained in:
committed by
GitHub
parent
e4c67d60ec
commit
43f953cc2e
@@ -0,0 +1,32 @@
|
||||
from itertools import islice

from datasets import load_dataset
from tqdm import tqdm

from arguments import TokenizerTrainingArguments
from transformers import GPT2Tokenizer, GPT2TokenizerFast, HfArgumentParser
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
|
||||
|
||||
|
||||
# Iterator for Training
|
||||
def batch_iterator(batch_size=10):
    """Yield lists of text samples pulled from the module-level ``iter_dataset``.

    One batch is produced per step of ``range(0, args.n_examples, batch_size)``,
    so when ``args.n_examples`` is not a multiple of ``batch_size`` the final
    batch still requests ``batch_size`` items. The text of each example is
    read from the column named by ``args.text_column``.

    Args:
        batch_size: Maximum number of examples per yielded list.

    Yields:
        A non-empty list of the ``args.text_column`` values of the next
        examples in the stream. The last list may be shorter than
        ``batch_size`` if the stream runs out.
    """
    for _ in tqdm(range(0, args.n_examples, batch_size)):
        # islice simply stops when the stream is exhausted. Calling next()
        # directly would raise StopIteration inside this generator, which
        # Python turns into a RuntimeError (PEP 479).
        batch = [example[args.text_column] for example in islice(iter_dataset, batch_size)]
        if not batch:
            # Nothing left to read: end the generator instead of yielding
            # empty batches for the remaining steps.
            return
        yield batch
|
||||
|
||||
|
||||
# Configuration: read the tokenizer-training options from the command line
parser = HfArgumentParser(TokenizerTrainingArguments)
args = parser.parse_args()

# Base tokenizer
# train_new_from_iterator is only provided by the fast (Rust-backed) tokenizer
# classes; the slow GPT2Tokenizer does not define it, so load the fast variant.
tokenizer = GPT2TokenizerFast.from_pretrained(args.base_tokenizer)
# Seed the new alphabet with every symbol of GPT-2's bytes-to-unicode table
base_vocab = list(bytes_to_unicode().values())

# Load dataset: open the train split in streaming mode and keep a single
# shared iterator that batch_iterator() consumes
dataset = load_dataset(args.dataset_name, split="train", streaming=True)
iter_dataset = iter(dataset)

# Training and saving
new_tokenizer = tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=args.vocab_size,
    initial_alphabet=base_vocab,
)
new_tokenizer.save_pretrained(args.tokenizer_name, push_to_hub=args.push_to_hub)
|
||||
Reference in New Issue
Block a user