Add CodeParrot 🦜 codebase (#14536)
* add readme skeleton * update readme * add initialization script * add deduplication script * add codeparrot training script * add code generation evaluation * add validation loss script * add requirements * update readme * tweak readme * make style * add highlights to readme * add CLIs to scripts * add tokenizer training script * add docstring to constant length dataset * fix defaults in arguments * update readme with cli * move image to hub * tweaks of readme * fix cli commands * add author * explain env variables * fix formatting * Update examples/research_projects/codeparrot/README.md Co-authored-by: lewtun <lewis.c.tunstall@gmail.com> * Apply suggestions from code review Co-authored-by: lewtun <lewis.c.tunstall@gmail.com> * replace generic with gpt2 tokenizer Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>
This commit is contained in:
committed by
GitHub
parent
e4c67d60ec
commit
43f953cc2e
175
examples/research_projects/codeparrot/scripts/arguments.py
Normal file
175
examples/research_projects/codeparrot/scripts/arguments.py
Normal file
@@ -0,0 +1,175 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrainingArguments:
|
||||
"""
|
||||
Configuration for training model.
|
||||
"""
|
||||
|
||||
model_ckpt: Optional[str] = field(
|
||||
default="lvwerra/codeparrot",
|
||||
metadata={"help": "Model name or path of model to be trained."},
|
||||
)
|
||||
save_dir: Optional[str] = field(
|
||||
default="./",
|
||||
metadata={"help": "Save dir where model repo is cloned and models updates are saved to."},
|
||||
)
|
||||
dataset_name_train: Optional[str] = field(
|
||||
default="lvwerra/codeparrot-clean-train", metadata={"help": "Name or path of training dataset."}
|
||||
)
|
||||
dataset_name_valid: Optional[str] = field(
|
||||
default="lvwerra/codeparrot-clean-valid", metadata={"help": "Name or path of validation dataset."}
|
||||
)
|
||||
train_batch_size: Optional[int] = field(default=2, metadata={"help": "Batch size for training."})
|
||||
valid_batch_size: Optional[int] = field(default=2, metadata={"help": "Batch size for evaluation."})
|
||||
weight_decay: Optional[float] = field(default=0.1, metadata={"help": "Value of weight decay."})
|
||||
shuffle_buffer: Optional[int] = field(
|
||||
default=1000, metadata={"help": "Size of buffer used to shuffle streaming dataset."}
|
||||
)
|
||||
learning_rate: Optional[float] = field(default=2e-4, metadata={"help": "Learning rate fo training."})
|
||||
lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "Learning rate."})
|
||||
num_warmup_steps: Optional[int] = field(
|
||||
default=750, metadata={"help": "Number of warmup steps in the learning rate schedule."}
|
||||
)
|
||||
gradient_accumulation_steps: Optional[int] = field(
|
||||
default=16, metadata={"help": "Number of gradient accumulation steps."}
|
||||
)
|
||||
gradient_checkpointing: Optional[bool] = field(
|
||||
default=True, metadata={"help": "Use gradient checkpointing to reduce memory footprint."}
|
||||
)
|
||||
max_train_steps: Optional[int] = field(default=50_000, metadata={"help": "Maximum number of training steps."})
|
||||
max_eval_steps: Optional[int] = field(
|
||||
default=-1, metadata={"help": "Maximum number of evaluation steps. If -1 the full dataset is evaluated."}
|
||||
)
|
||||
seq_length: Optional[int] = field(default=1024, metadata={"help": "Sequence lengths used for training."})
|
||||
seed: Optional[int] = field(default=1, metadata={"help": "Training seed."})
|
||||
save_checkpoint_steps: Optional[int] = field(
|
||||
default=1024,
|
||||
metadata={"help": "Interval to save checkpoints. Measured as number of forward passes not training steps."},
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class EvaluationArguments:
|
||||
"""
|
||||
Configuration for evaluating model.
|
||||
"""
|
||||
|
||||
model_ckpt: Optional[str] = field(
|
||||
default="lvwerra/codeparrot",
|
||||
metadata={"help": "Model name or path of model to be evaluated."},
|
||||
)
|
||||
dataset_name: Optional[str] = field(
|
||||
default="lvwerra/codeparrot-clean-valid", metadata={"help": "Name or path of validation dataset."}
|
||||
)
|
||||
batch_size: Optional[int] = field(default=2, metadata={"help": "Batch size used for evaluation."})
|
||||
max_eval_steps: Optional[int] = field(
|
||||
default=-1, metadata={"help": "Maximum number of evaluation steps. If -1 the full dataset is evaluated."}
|
||||
)
|
||||
seq_length: Optional[int] = field(default=1024, metadata={"help": "Length of sequences to be evaluated."})
|
||||
seed: Optional[int] = field(default=1, metadata={"help": "Random seed used for evaluation."})
|
||||
|
||||
|
||||
@dataclass
|
||||
class HumanEvalArguments:
|
||||
"""
|
||||
Configuration for running evaluation on HumanEval dataset.
|
||||
"""
|
||||
|
||||
model_ckpt: Optional[str] = field(
|
||||
default="lvwerra/codeparrot",
|
||||
metadata={"help": "Model name or path of model to be evaluated."},
|
||||
)
|
||||
num_workers: Optional[int] = field(default=None, metadata={"help": "Number of workers used for code evaluation."})
|
||||
do_sample: Optional[bool] = field(
|
||||
default=True, metadata={"help": "Sample from the language model's output distribution."}
|
||||
)
|
||||
temperature: Optional[float] = field(default=0.2, metadata={"help": "Sampling temperature used for generation."})
|
||||
max_new_tokens: Optional[int] = field(default=256, metadata={"help": "Maximum number of newly generated tokens."})
|
||||
top_k: Optional[int] = field(default=0, metadata={"help": "Top-k parameter used for generation."})
|
||||
top_p: Optional[float] = field(default=0.95, metadata={"help": "Top-p parameter used for nucleus sampling."})
|
||||
batch_size: Optional[int] = field(default=10, metadata={"help": "Number of generations to run in parallel."})
|
||||
n_samples: Optional[int] = field(
|
||||
default=200, metadata={"help": "Number of completions to generate for each sample."}
|
||||
)
|
||||
seed: Optional[int] = field(default=1, metadata={"help": "Random seed used for evaluation."})
|
||||
output_file: Optional[str] = field(
|
||||
default="eval_results.json", metadata={"help": "Random seed used for evaluation."}
|
||||
)
|
||||
HF_ALLOW_CODE_EVAL: Optional[str] = field(
|
||||
default="0", metadata={"help": "Allow `code_eval` to execute Python code on machine"}
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class PreprocessingArguments:
|
||||
"""
|
||||
Configuration for preprocessing data.
|
||||
"""
|
||||
|
||||
num_workers: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "The number of CPU cores to use for parallel preprocessing. Default uses the maximum available."
|
||||
},
|
||||
)
|
||||
dataset_name: Optional[str] = field(
|
||||
default="codeparrot", metadata={"help": "Folder or name of dataset to process."}
|
||||
)
|
||||
output_dir: Optional[str] = field(
|
||||
default="codeparrot-clean", metadata={"help": "Folder to save processed processed dataset."}
|
||||
)
|
||||
samples_per_file: Optional[int] = field(
|
||||
default=100_000, metadata={"help": "Number of files to save per JSON output file."}
|
||||
)
|
||||
text_column: Optional[str] = field(default="content", metadata={"help": "Column containing text data to process."})
|
||||
line_max: Optional[float] = field(
|
||||
default=1000, metadata={"help": "Maximum line length in file, otherwise file is filtered."}
|
||||
)
|
||||
line_mean: Optional[float] = field(
|
||||
default=100, metadata={"help": "Maximum mean line length in file, otherwise file is filtered."}
|
||||
)
|
||||
alpha_frac: Optional[float] = field(
|
||||
default=0.25, metadata={"help": "Maximum fraction of non-alphanumeric characters, otherwise file is filtered."}
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TokenizerTrainingArguments:
|
||||
"""
|
||||
Configuration for tokenizer training.
|
||||
"""
|
||||
|
||||
base_tokenizer: Optional[str] = field(
|
||||
default="gpt2",
|
||||
metadata={"help": "Base tokenizer to build new tokenizer from."},
|
||||
)
|
||||
dataset_name: Optional[str] = field(
|
||||
default="transformersbook/codeparrot-train", metadata={"help": "Dataset to train tokenizer on."}
|
||||
)
|
||||
text_column: Optional[str] = field(default="content", metadata={"help": "Column containing text data to process."})
|
||||
vocab_size: Optional[int] = field(default=200000, metadata={"help": "Number of examples to train tokenizer on."})
|
||||
n_examples: Optional[int] = field(
|
||||
default=32768, metadata={"help": "Number of examples to train the tokenizer on."}
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(default="codeparrot", metadata={"help": "Name of new tokenizer."})
|
||||
push_to_hub: Optional[bool] = field(default=True, metadata={"help": "Push saved tokenizer to the hub."})
|
||||
|
||||
|
||||
@dataclass
|
||||
class InitializationArguments:
|
||||
"""
|
||||
Configuration for initializing new model.
|
||||
"""
|
||||
|
||||
config_name: Optional[str] = field(
|
||||
default="gpt2-large",
|
||||
metadata={"help": "Configuration to use for model initialization."},
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default="lvwerra/codeparrot", metadata={"help": "Tokenizer attached to model."}
|
||||
)
|
||||
model_name: Optional[str] = field(default="codeparrot", metadata={"help": "Name of the created model."})
|
||||
push_to_hub: Optional[bool] = field(default=True, metadata={"help": "Push saved tokenizer to the hub."})
|
||||
@@ -0,0 +1,32 @@
|
||||
from datasets import load_dataset
|
||||
from tqdm import tqdm
|
||||
|
||||
from arguments import TokenizerTrainingArguments
|
||||
from transformers import GPT2Tokenizer, HfArgumentParser
|
||||
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
|
||||
|
||||
|
||||
# Iterator for Training
|
||||
def batch_iterator(batch_size=10):
|
||||
for _ in tqdm(range(0, args.n_examples, batch_size)):
|
||||
yield [next(iter_dataset)[args.text_column] for _ in range(batch_size)]
|
||||
|
||||
|
||||
# Configuration
|
||||
parser = HfArgumentParser(TokenizerTrainingArguments)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Base tokenizer
|
||||
tokenizer = GPT2Tokenizer.from_pretrained(args.base_tokenizer)
|
||||
base_vocab = list(bytes_to_unicode().values())
|
||||
|
||||
# Load dataset
|
||||
dataset = load_dataset(args.dataset_name, split="train", streaming=True)
|
||||
iter_dataset = iter(dataset)
|
||||
|
||||
|
||||
# Training and saving
|
||||
new_tokenizer = tokenizer.train_new_from_iterator(
|
||||
batch_iterator(), vocab_size=args.vocab_size, initial_alphabet=base_vocab
|
||||
)
|
||||
new_tokenizer.save_pretrained(args.tokenizer_name, push_to_hub=args.push_to_hub)
|
||||
@@ -0,0 +1,240 @@
|
||||
import logging
|
||||
from argparse import Namespace
|
||||
from pathlib import Path
|
||||
|
||||
import datasets
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from torch.utils.data import IterableDataset
|
||||
from torch.utils.data.dataloader import DataLoader
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
import transformers
|
||||
import wandb
|
||||
from accelerate import Accelerator
|
||||
from arguments import TrainingArguments
|
||||
from huggingface_hub import Repository
|
||||
from transformers import AdamW, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, get_scheduler, set_seed
|
||||
|
||||
|
||||
class ConstantLengthDataset(IterableDataset):
|
||||
"""
|
||||
Iterable dataset that returns constant length chunks of tokens from stream of text files.
|
||||
Args:
|
||||
tokenizer (Tokenizer): The processor used for proccessing the data.
|
||||
dataset (dataset.Dataset): Dataset with text files.
|
||||
infinite (bool): If True the iterator is reset after dataset reaches end else stops.
|
||||
seq_length (int): Length of token sequences to return.
|
||||
num_of_sequences: Number of token sequences to keep in buffer.
|
||||
chars_per_token: Number of characters per token used to estimate number of tokens in text buffer.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, tokenizer, dataset, infinite=False, seq_length=1024, num_of_sequences=1024, chars_per_token=3.6
|
||||
):
|
||||
self.tokenizer = tokenizer
|
||||
self.concat_token_id = tokenizer.bos_token_id
|
||||
self.dataset = dataset
|
||||
self.seq_length = seq_length
|
||||
self.input_characters = seq_length * chars_per_token * num_of_sequences
|
||||
self.epoch = 0
|
||||
self.infinite = infinite
|
||||
|
||||
def __iter__(self):
|
||||
iterator = iter(self.dataset)
|
||||
more_examples = True
|
||||
while more_examples:
|
||||
buffer, buffer_len = [], 0
|
||||
while True:
|
||||
if buffer_len >= self.input_characters:
|
||||
break
|
||||
try:
|
||||
buffer.append(next(iterator)["content"])
|
||||
buffer_len += len(buffer[-1])
|
||||
except StopIteration:
|
||||
if self.infinite:
|
||||
iterator = iter(self.dataset)
|
||||
self.epoch += 1
|
||||
logger.info(f"Dataset epoch: {self.epoch}")
|
||||
else:
|
||||
more_examples = False
|
||||
break
|
||||
tokenized_inputs = tokenizer(buffer, truncation=False)["input_ids"]
|
||||
all_token_ids = []
|
||||
for tokenized_input in tokenized_inputs:
|
||||
all_token_ids.extend(tokenized_input + [self.concat_token_id])
|
||||
for i in range(0, len(all_token_ids), self.seq_length):
|
||||
input_ids = all_token_ids[i : i + self.seq_length]
|
||||
if len(input_ids) == self.seq_length:
|
||||
yield torch.tensor(input_ids)
|
||||
|
||||
|
||||
def setup_logging(args):
|
||||
project_name = args.model_ckpt.split("/")[-1]
|
||||
logger = logging.getLogger(__name__)
|
||||
log_dir = Path(args.save_dir) / "log/"
|
||||
log_dir.mkdir(exist_ok=True)
|
||||
filename = f"debug_{accelerator.process_index}.log"
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
handlers=[logging.FileHandler(log_dir / filename), logging.StreamHandler()],
|
||||
)
|
||||
if accelerator.is_main_process: # we only want to setup logging once
|
||||
wandb.init(project=project_name, config=args)
|
||||
run_name = wandb.run.name
|
||||
tb_writer = SummaryWriter()
|
||||
tb_writer.add_hparams(vars(args), {"0": 0})
|
||||
logger.setLevel(logging.INFO)
|
||||
datasets.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
else:
|
||||
tb_writer = None
|
||||
run_name = ""
|
||||
logger.setLevel(logging.ERROR)
|
||||
datasets.utils.logging.set_verbosity_error()
|
||||
transformers.utils.logging.set_verbosity_error()
|
||||
return logger, tb_writer, run_name
|
||||
|
||||
|
||||
def create_dataloaders(args):
|
||||
ds_kwargs = {"streaming": True}
|
||||
train_data = load_dataset(args.dataset_name_train, split="train", **ds_kwargs)
|
||||
train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=args.seed)
|
||||
valid_data = load_dataset(args.dataset_name_valid, split="train", **ds_kwargs)
|
||||
train_dataset = ConstantLengthDataset(tokenizer, train_data, infinite=True, seq_length=args.seq_length)
|
||||
valid_dataset = ConstantLengthDataset(tokenizer, valid_data, infinite=False, seq_length=args.seq_length)
|
||||
train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size)
|
||||
eval_dataloader = DataLoader(valid_dataset, batch_size=args.valid_batch_size)
|
||||
return train_dataloader, eval_dataloader
|
||||
|
||||
|
||||
def get_grouped_params(model, args, no_decay=["bias", "LayerNorm.weight"]):
|
||||
params_with_wd, params_without_wd = [], []
|
||||
for n, p in model.named_parameters():
|
||||
if any(nd in n for nd in no_decay):
|
||||
params_without_wd.append(p)
|
||||
else:
|
||||
params_with_wd.append(p)
|
||||
return [
|
||||
{"params": params_with_wd, "weight_decay": args.weight_decay},
|
||||
{"params": params_without_wd, "weight_decay": 0.0},
|
||||
]
|
||||
|
||||
|
||||
def log_metrics(step, metrics):
|
||||
logger.info(f"Step {step}: {metrics}")
|
||||
if accelerator.is_main_process:
|
||||
wandb.log(metrics)
|
||||
[tb_writer.add_scalar(k, v, step) for k, v in metrics.items()]
|
||||
|
||||
|
||||
def evaluate(args):
|
||||
model.eval()
|
||||
losses = []
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
with torch.no_grad():
|
||||
outputs = model(batch, labels=batch)
|
||||
loss = outputs.loss.repeat(args.valid_batch_size)
|
||||
losses.append(accelerator.gather(loss))
|
||||
if args.max_eval_steps > 0 and step >= args.max_eval_steps:
|
||||
break
|
||||
loss = torch.mean(torch.cat(losses))
|
||||
try:
|
||||
perplexity = torch.exp(loss)
|
||||
except OverflowError:
|
||||
perplexity = float("inf")
|
||||
return loss.item(), perplexity.item()
|
||||
|
||||
|
||||
# Accelerator
|
||||
accelerator = Accelerator()
|
||||
acc_state = {str(k): str(v) for k, v in accelerator.state.__dict__.items()}
|
||||
|
||||
# Settings
|
||||
parser = HfArgumentParser(TrainingArguments)
|
||||
args = parser.parse_args()
|
||||
|
||||
args = Namespace(**vars(args), **acc_state)
|
||||
samples_per_step = accelerator.state.num_processes * args.train_batch_size
|
||||
set_seed(args.seed)
|
||||
|
||||
# Clone model repository
|
||||
if accelerator.is_main_process:
|
||||
hf_repo = Repository(args.save_dir, clone_from=args.model_ckpt)
|
||||
|
||||
# Logging
|
||||
logger, tb_writer, run_name = setup_logging(args)
|
||||
logger.info(accelerator.state)
|
||||
|
||||
# Checkout new branch on repo
|
||||
if accelerator.is_main_process:
|
||||
hf_repo.git_checkout(run_name, create_branch_ok=True)
|
||||
|
||||
# Load model and tokenizer
|
||||
model = AutoModelForCausalLM.from_pretrained(args.save_dir)
|
||||
if args.gradient_checkpointing:
|
||||
model.gradient_checkpointing_enable()
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.save_dir)
|
||||
|
||||
# Load dataset and dataloader
|
||||
train_dataloader, eval_dataloader = create_dataloaders(args)
|
||||
|
||||
# Prepare the optimizer and learning rate scheduler
|
||||
optimizer = AdamW(get_grouped_params(model, args), lr=args.learning_rate)
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
)
|
||||
|
||||
|
||||
def get_lr():
|
||||
return optimizer.param_groups[0]["lr"]
|
||||
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
||||
model, optimizer, train_dataloader, eval_dataloader
|
||||
)
|
||||
|
||||
# Train model
|
||||
model.train()
|
||||
completed_steps = 0
|
||||
for step, batch in enumerate(train_dataloader, start=1):
|
||||
loss = model(batch, labels=batch, use_cache=False).loss
|
||||
log_metrics(
|
||||
step, {"lr": get_lr(), "samples": step * samples_per_step, "steps": completed_steps, "loss/train": loss.item()}
|
||||
)
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % args.gradient_accumulation_steps == 0:
|
||||
accelerator.clip_grad_norm_(model.parameters(), 1.0)
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
completed_steps += 1
|
||||
if step % args.save_checkpoint_steps == 0:
|
||||
logger.info("Evaluating and saving model checkpoint")
|
||||
eval_loss, perplexity = evaluate(args)
|
||||
log_metrics(step, {"loss/eval": eval_loss, "perplexity": perplexity})
|
||||
accelerator.wait_for_everyone()
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
unwrapped_model.save_pretrained(args.save_dir, save_function=accelerator.save)
|
||||
if accelerator.is_main_process:
|
||||
hf_repo.push_to_hub(commit_message=f"step {step}")
|
||||
model.train()
|
||||
if completed_steps >= args.max_train_steps:
|
||||
break
|
||||
|
||||
# Evaluate and save the last checkpoint
|
||||
logger.info("Evaluating and saving model after training")
|
||||
eval_loss, perplexity = evaluate(args)
|
||||
log_metrics(step, {"loss/eval": eval_loss, "perplexity": perplexity})
|
||||
accelerator.wait_for_everyone()
|
||||
unwrapped_model = accelerator.unwrap_model(model)
|
||||
unwrapped_model.save_pretrained(args.save_dir, save_function=accelerator.save)
|
||||
if accelerator.is_main_process:
|
||||
hf_repo.push_to_hub(commit_message="final model")
|
||||
87
examples/research_projects/codeparrot/scripts/human_eval.py
Normal file
87
examples/research_projects/codeparrot/scripts/human_eval.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import json
|
||||
import multiprocessing
|
||||
import os
|
||||
import re
|
||||
|
||||
from datasets import load_dataset, load_metric
|
||||
from tqdm import tqdm
|
||||
|
||||
import transformers
|
||||
from arguments import HumanEvalArguments
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, pipeline, set_seed
|
||||
|
||||
|
||||
def first_block(string):
|
||||
"""Split off first block of code by scanning for class, def etc. on newlines."""
|
||||
return re.split("\nclass|\ndef|\n#|\n@|\nprint|\nif", string)[0].rstrip()
|
||||
|
||||
|
||||
def complete_code(pipe, prompt, num_completions=1, **gen_kwargs):
|
||||
"""Complete prompt with text generation pipeline and return num_completions."""
|
||||
prompt = pipe.tokenizer.eos_token + prompt
|
||||
code_gens = pipe(prompt, num_return_sequences=num_completions, **gen_kwargs)
|
||||
return [first_block(code_gen["generated_text"][len(prompt) :]) for code_gen in code_gens]
|
||||
|
||||
|
||||
def main():
|
||||
# Setup configuration
|
||||
parser = HfArgumentParser(HumanEvalArguments)
|
||||
args = parser.parse_args()
|
||||
|
||||
transformers.logging.set_verbosity_error()
|
||||
# enables code execution in code_eval metric
|
||||
os.environ["HF_ALLOW_CODE_EVAL"] = args.HF_ALLOW_CODE_EVAL
|
||||
# make sure tokenizer plays nice with multiprocessing
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
if args.num_workers is None:
|
||||
args.num_workers = multiprocessing.cpu_count()
|
||||
|
||||
set_seed(args.seed)
|
||||
|
||||
# Generation settings
|
||||
gen_kwargs = {
|
||||
"do_sample": args.do_sample,
|
||||
"temperature": args.temperature,
|
||||
"max_new_tokens": args.max_new_tokens,
|
||||
"top_p": args.top_p,
|
||||
"top_k": args.top_k,
|
||||
}
|
||||
|
||||
# Load model and tokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
|
||||
model = AutoModelForCausalLM.from_pretrained(args.model_ckpt)
|
||||
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
|
||||
|
||||
# Load evaluation dataset and metric
|
||||
human_eval = load_dataset("openai_humaneval")
|
||||
code_eval_metric = load_metric("code_eval")
|
||||
|
||||
# Generate completions for evaluation set
|
||||
n_tasks = 4 # len(human_eval["test"])
|
||||
generations, references = [], []
|
||||
for task in tqdm(range(n_tasks)):
|
||||
task_generations = []
|
||||
prompt = human_eval["test"][task]["prompt"].strip()
|
||||
for batch in range(args.n_samples // args.batch_size):
|
||||
task_generations.extend(complete_code(pipe, prompt, num_completions=args.batch_size, **gen_kwargs))
|
||||
generations.append([prompt + gen for gen in task_generations])
|
||||
test_func = human_eval["test"][task]["test"]
|
||||
entry_point = f"check({human_eval['test'][task]['entry_point']})"
|
||||
references.append("\n" + test_func + "\n" + entry_point)
|
||||
|
||||
# Evaluate completions with "code_eval" metric
|
||||
pass_at_k, _ = code_eval_metric.compute(
|
||||
references=references, predictions=generations, num_workers=args.num_workers
|
||||
)
|
||||
print(f"Results: {pass_at_k}")
|
||||
|
||||
# Save results to json file
|
||||
with open(args.output_file, "w") as fp:
|
||||
json.dump(pass_at_k, fp)
|
||||
|
||||
|
||||
# For some reason the folliwng seems to be necessary sometimes for code_eval to work nice with multiprocessing
|
||||
# https://stackoverflow.com/questions/60804599/python-multiprocessing-keeps-spawning-the-whole-script
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,22 @@
|
||||
from arguments import InitializationArguments
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
|
||||
|
||||
|
||||
# Configuration
|
||||
parser = HfArgumentParser(InitializationArguments)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load codeparrot tokenizer trained for Python code tokenization
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
|
||||
|
||||
# Config: "scale_attn_by_layer_idx" and "reorder_and_upcast_attn" are Mistral stability tweaks
|
||||
config_kwargs = {"vocab_size": len(tokenizer), "scale_attn_by_layer_idx": True, "reorder_and_upcast_attn": True}
|
||||
|
||||
# Load model config (GPT-2 large in this case)
|
||||
config = AutoConfig.from_pretrained(args.config_name, **config_kwargs)
|
||||
|
||||
# Initialize new model with config
|
||||
model = AutoModelForCausalLM(config)
|
||||
|
||||
# Save model to the hub
|
||||
model.save_pretrained(args.model_name, push_to_hub=args.push_to_hub)
|
||||
122
examples/research_projects/codeparrot/scripts/preprocessing.py
Normal file
122
examples/research_projects/codeparrot/scripts/preprocessing.py
Normal file
@@ -0,0 +1,122 @@
|
||||
import gzip
|
||||
import multiprocessing
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
from datasets import load_dataset
|
||||
|
||||
from arguments import PreprocessingArguments
|
||||
from transformers import HfArgumentParser
|
||||
|
||||
|
||||
def get_hash(example):
|
||||
"""Get hash of content field."""
|
||||
return {"hash": hash(example["content"])}
|
||||
|
||||
|
||||
def line_stats(example):
|
||||
"""Calculates mean and max line length of file."""
|
||||
line_lengths = [len(line) for line in example["content"].splitlines()]
|
||||
return {"line_mean": np.mean(line_lengths), "line_max": max(line_lengths)}
|
||||
|
||||
|
||||
def alpha_stats(example):
|
||||
"""Calculates mean and max line length of file."""
|
||||
alpha_frac = np.mean([c.isalnum() for c in example["content"]])
|
||||
return {"alpha_frac": alpha_frac}
|
||||
|
||||
|
||||
def check_uniques(example, uniques):
|
||||
"""Check if current hash is still in set of unique hashes and remove if true."""
|
||||
if example["hash"] in uniques:
|
||||
uniques.remove(example["hash"])
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def is_autogenerated(example, scan_width=5):
|
||||
"""Check if file is autogenerated by looking for keywords in the first few lines of the file."""
|
||||
keywords = ["auto-generated", "autogenerated", "automatically generated"]
|
||||
lines = example["content"].splitlines()
|
||||
for _, line in zip(range(scan_width), lines):
|
||||
for keyword in keywords:
|
||||
if keyword in line.lower():
|
||||
return {"autogenerated": True}
|
||||
else:
|
||||
return {"autogenerated": False}
|
||||
|
||||
|
||||
def preprocess(example):
|
||||
"""Chain all preprocessing steps into one function to not fill cache."""
|
||||
results = dict()
|
||||
results.update(get_hash(example))
|
||||
results.update(line_stats(example))
|
||||
results.update(alpha_stats(example))
|
||||
results.update(is_autogenerated(example))
|
||||
return results
|
||||
|
||||
|
||||
def filter(example, uniques, args):
|
||||
"""Filter dataset with heuristics."""
|
||||
if not check_uniques(example, uniques):
|
||||
return False
|
||||
elif example["autogenerated"]:
|
||||
return False
|
||||
elif example["line_max"] > args.line_max:
|
||||
return False
|
||||
elif example["line_mean"] > args.line_mean:
|
||||
return False
|
||||
elif example["alpha_frac"] < args.alpha_frac:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
||||
def compress_file(file_path):
|
||||
"""Compress a file with g-zip."""
|
||||
with open(file_path, "rb") as f_in:
|
||||
with gzip.open(file_path + ".gz", "wb", compresslevel=6) as f_out:
|
||||
shutil.copyfileobj(f_in, f_out)
|
||||
os.unlink(file_path)
|
||||
|
||||
|
||||
# Settings
|
||||
parser = HfArgumentParser(PreprocessingArguments)
|
||||
args = parser.parse_args()
|
||||
if args.num_workers is None:
|
||||
args.num_workers = multiprocessing.cpu_count()
|
||||
|
||||
# Load dataset
|
||||
t_start = time.time()
|
||||
ds = load_dataset(args.dataset_name, split="train")
|
||||
print(f"Time to load dataset: {time.time()-t_start:.2f}")
|
||||
|
||||
# Run preprocessing
|
||||
t_start = time.time()
|
||||
ds = ds.map(preprocess, num_proc=args.num_workers)
|
||||
print(f"Time to preprocess dataset: {time.time()-t_start:.2f}")
|
||||
|
||||
# Deduplicate hashes
|
||||
uniques = set(ds.unique("hash"))
|
||||
frac = len(uniques) / len(ds)
|
||||
print(f"Fraction of duplicates: {1-frac:.2%}")
|
||||
|
||||
# Deduplicate data and apply heuristics
|
||||
t_start = time.time()
|
||||
ds_filter = ds.filter(filter, fn_kwargs={"uniques": uniques, "args": args})
|
||||
print(f"Time to filter dataset: {time.time()-t_start:.2f}")
|
||||
print(f"Size of filtered dataset: {len(ds_filter)}")
|
||||
|
||||
# Save data in batches of samples_per_file
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
t_start = time.time()
|
||||
for file_number, index in enumerate(range(0, len(ds_filter), args.samples_per_file)):
|
||||
file_path = f"{args.output_dir}/file-{file_number+1:012}.json"
|
||||
end_index = min(len(ds_filter), index + args.samples_per_file)
|
||||
ds_filter.select(list(range(index, end_index))).to_json(file_path)
|
||||
compress_file(file_path)
|
||||
print(f"Time to save dataset: {time.time()-t_start:.2f}")
|
||||
@@ -0,0 +1,99 @@
|
||||
import logging
|
||||
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from torch.utils.data import IterableDataset
|
||||
from torch.utils.data.dataloader import DataLoader
|
||||
|
||||
from accelerate import Accelerator
|
||||
from arguments import EvaluationArguments
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, set_seed
|
||||
|
||||
|
||||
class ConstantLengthDataset(IterableDataset):
|
||||
def __init__(self, tokenizer, dataset, seq_length=1024, num_of_sequences=1024, chars_per_token=3.6):
|
||||
self.tokenizer = tokenizer
|
||||
self.concat_token_id = tokenizer.bos_token_id
|
||||
self.dataset = dataset
|
||||
self.seq_length = seq_length
|
||||
self.input_characters = seq_length * chars_per_token * num_of_sequences
|
||||
|
||||
def __iter__(self):
|
||||
iterator = iter(self.dataset)
|
||||
more_examples = True
|
||||
while more_examples:
|
||||
buffer, buffer_len = [], 0
|
||||
while True:
|
||||
if buffer_len >= self.input_characters:
|
||||
break
|
||||
try:
|
||||
buffer.append(next(iterator)["content"])
|
||||
buffer_len += len(buffer[-1])
|
||||
except StopIteration:
|
||||
more_examples = False
|
||||
break
|
||||
tokenized_inputs = tokenizer(buffer, truncation=False)["input_ids"]
|
||||
all_token_ids = []
|
||||
for tokenized_input in tokenized_inputs:
|
||||
all_token_ids.extend(tokenized_input + [self.concat_token_id])
|
||||
for i in range(0, len(all_token_ids), self.seq_length):
|
||||
input_ids = all_token_ids[i : i + self.seq_length]
|
||||
if len(input_ids) == self.seq_length:
|
||||
yield torch.tensor(input_ids)
|
||||
|
||||
|
||||
def create_dataloader(args):
|
||||
ds_kwargs = {"streaming": True}
|
||||
valid_data = load_dataset(args.dataset_name, split="train", **ds_kwargs)
|
||||
valid_dataset = ConstantLengthDataset(tokenizer, valid_data, seq_length=args.seq_length)
|
||||
eval_dataloader = DataLoader(valid_dataset, batch_size=args.batch_size)
|
||||
return eval_dataloader
|
||||
|
||||
|
||||
def evaluate(args):
|
||||
model.eval()
|
||||
losses = []
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
with torch.no_grad():
|
||||
outputs = model(batch, labels=batch)
|
||||
loss = outputs.loss.repeat(args.batch_size)
|
||||
losses.append(accelerator.gather(loss))
|
||||
|
||||
if args.max_eval_steps > 0 and step >= args.max_eval_steps:
|
||||
break
|
||||
loss = torch.mean(torch.cat(losses))
|
||||
try:
|
||||
perplexity = torch.exp(loss)
|
||||
except OverflowError:
|
||||
perplexity = float("inf")
|
||||
return loss.item(), perplexity.item()
|
||||
|
||||
|
||||
# Setup Accelerator
|
||||
accelerator = Accelerator()
|
||||
|
||||
# Parse configuration
|
||||
parser = HfArgumentParser(EvaluationArguments)
|
||||
args = parser.parse_args()
|
||||
set_seed(args.seed)
|
||||
|
||||
# Logging
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
|
||||
)
|
||||
|
||||
# Load model and tokenizer
|
||||
model = AutoModelForCausalLM.from_pretrained(args.model_ckpt)
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
|
||||
|
||||
# Load dataset and dataloader
|
||||
eval_dataloader = create_dataloader(args)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
model, eval_dataloader = accelerator.prepare(model, eval_dataloader)
|
||||
|
||||
# Evaluate and save the last checkpoint
|
||||
logger.info("Evaluating and saving model after training")
|
||||
eval_loss, perplexity = evaluate(args)
|
||||
logger.info(f"loss/eval: {eval_loss}, perplexity: {perplexity}")
|
||||
Reference in New Issue
Block a user