From 48bf7e47a01f310ca8e76bd90be14e06dbd08329 Mon Sep 17 00:00:00 2001 From: Nathan Cooper Date: Mon, 13 Dec 2021 03:30:50 -0500 Subject: [PATCH] Code parrot minor fixes/niceties (#14666) * Add some nicety flags for better controlling evaluation. * Fix dependency issue with outdated requirement * Add additional flag to example to ensure eval is done * Wrap code into main function for accelerate launcher to find * Fix valid batch size flag in readme * Add note to install git-lfs when initializing/training the model * Update examples/research_projects/codeparrot/scripts/arguments.py Co-authored-by: Leandro von Werra * Update examples/research_projects/codeparrot/README.md Co-authored-by: Leandro von Werra * Revert "Wrap code into main function for accelerate launcher to find" This reverts commit ff11df1c810d4df198d04b827538eb4572147ba3. * Fix formatting issue * Move git-lfs instructions to installation section * Add a quick check before code generation for code evaluation * Fix styling issue * Update examples/research_projects/codeparrot/scripts/human_eval.py Co-authored-by: Leandro von Werra * Make iterable dataset use passed in tokenizer rather than globally defined one Co-authored-by: Leandro von Werra Co-authored-by: ncoop57 --- examples/research_projects/codeparrot/README.md | 7 +++++-- .../research_projects/codeparrot/requirements.txt | 2 +- .../codeparrot/scripts/arguments.py | 10 ++++++++++ .../codeparrot/scripts/codeparrot_training.py | 2 +- .../codeparrot/scripts/human_eval.py | 13 +++++++++++-- 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md index 858334f2f2..cf0b99345c 100644 --- a/examples/research_projects/codeparrot/README.md +++ b/examples/research_projects/codeparrot/README.md @@ -31,6 +31,8 @@ Before you run any of the scripts make sure you are logged in and can push to th huggingface-cli login ``` +Additionally, make sure you have git-lfs 
installed. You can find instructions for how to install it [here](https://git-lfs.github.com/). + ## Dataset The source of the dataset is the GitHub dump available on Google's [BigQuery](https://cloud.google.com/blog/topics/public-datasets/github-on-bigquery-analyze-all-the-open-source-code). The database was queried for all Python files with less than 1MB in size resulting in a 180GB dataset with over 20M files. The dataset is available on the Hugging Face Hub [here](https://huggingface.co/datasets/transformersbook/codeparrot). @@ -96,7 +98,7 @@ If you want to train the small model you need to make some modifications: accelerate launch scripts/codeparrot_training.py \ --model_ckpt lvwerra/codeparrot-small \ --train_batch_size 12 \ ---eval_batch_size 12 \ +--valid_batch_size 12 \ --learning_rate 5e-4 \ --num_warmup_steps 2000 \ --gradient_accumulation 1 \ @@ -125,7 +127,8 @@ python scripts/human_eval.py --model_ckpt lvwerra/codeparrot \ --do_sample True \ --temperature 0.2 \ --top_p 0.95 \ ---n_samples=200 +--n_samples=200 \ +--HF_ALLOW_CODE_EVAL="0" ``` The results as well as reference values are shown in the following table: diff --git a/examples/research_projects/codeparrot/requirements.txt b/examples/research_projects/codeparrot/requirements.txt index 7f1b864de2..3333e3dc37 100644 --- a/examples/research_projects/codeparrot/requirements.txt +++ b/examples/research_projects/codeparrot/requirements.txt @@ -4,4 +4,4 @@ accelerate==0.5.1 wandb==0.12.0 tensorboard==2.6.0 torch==1.9.0 -huggingface-hub==0.0.19 \ No newline at end of file +huggingface-hub==0.1.0 \ No newline at end of file diff --git a/examples/research_projects/codeparrot/scripts/arguments.py b/examples/research_projects/codeparrot/scripts/arguments.py index 894fc11604..ee4a23ec29 100644 --- a/examples/research_projects/codeparrot/scripts/arguments.py +++ b/examples/research_projects/codeparrot/scripts/arguments.py @@ -83,6 +83,10 @@ class HumanEvalArguments: metadata={"help": "Model name or path of 
model to be evaluated."}, ) num_workers: Optional[int] = field(default=None, metadata={"help": "Number of workers used for code evaluation."}) + num_tasks: Optional[int] = field( + default=None, + metadata={"help": "The number of human-eval tasks to run. If not included all tasks are evaluated."}, + ) do_sample: Optional[bool] = field( default=True, metadata={"help": "Sample from the language model's output distribution."} ) @@ -101,6 +105,12 @@ class HumanEvalArguments: HF_ALLOW_CODE_EVAL: Optional[str] = field( default="0", metadata={"help": "Allow `code_eval` to execute Python code on machine"} ) + device_int: Optional[int] = field( + default=-1, + metadata={ + "help": "Determine which device to run the `text-generation` Pipeline on. -1 is CPU and any zero or positive number corresponds to which GPU device id to run on." + }, + ) @dataclass diff --git a/examples/research_projects/codeparrot/scripts/codeparrot_training.py b/examples/research_projects/codeparrot/scripts/codeparrot_training.py index 771109b1d3..9cc42d3881 100644 --- a/examples/research_projects/codeparrot/scripts/codeparrot_training.py +++ b/examples/research_projects/codeparrot/scripts/codeparrot_training.py @@ -59,7 +59,7 @@ class ConstantLengthDataset(IterableDataset): else: more_examples = False break - tokenized_inputs = tokenizer(buffer, truncation=False)["input_ids"] + tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"] all_token_ids = [] for tokenized_input in tokenized_inputs: all_token_ids.extend(tokenized_input + [self.concat_token_id]) diff --git a/examples/research_projects/codeparrot/scripts/human_eval.py b/examples/research_projects/codeparrot/scripts/human_eval.py index 6812c8e7f0..d70655b996 100644 --- a/examples/research_projects/codeparrot/scripts/human_eval.py +++ b/examples/research_projects/codeparrot/scripts/human_eval.py @@ -51,14 +51,23 @@ def main(): # Load model and tokenizer tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) model = 
AutoModelForCausalLM.from_pretrained(args.model_ckpt) - pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=args.device_int) # Load evaluation dataset and metric human_eval = load_dataset("openai_humaneval") code_eval_metric = load_metric("code_eval") + # Run a quick test to see if code evaluation is enabled + try: + _ = code_eval_metric.compute(references=[""], predictions=[[""]]) + except ValueError as exception: + print( + 'Code evaluation not enabled. Read the warning below carefully and then use `--HF_ALLOW_CODE_EVAL="1"` flag to enable code evaluation.' + ) + raise exception + # Generate completions for evaluation set - n_tasks = 4 # len(human_eval["test"]) + n_tasks = args.num_tasks if args.num_tasks is not None else len(human_eval["test"]) generations, references = [], [] for task in tqdm(range(n_tasks)): task_generations = []