From 48bf7e47a01f310ca8e76bd90be14e06dbd08329 Mon Sep 17 00:00:00 2001
From: Nathan Cooper <nacooper01@email.wm.edu>
Date: Mon, 13 Dec 2021 03:30:50 -0500
Subject: [PATCH] Code parrot minor fixes/niceties (#14666)

* Add some nicety flags for better controlling evaluation.

* Fix dependency issue with outdated requirement

* Add additional flag to example to ensure eval is done

* Wrap code into main function for accelerate launcher to find

* Fix valid batch size flag in readme

* Add note to install git-lfs when initializing/training the model

* Update examples/research_projects/codeparrot/scripts/arguments.py

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* Update examples/research_projects/codeparrot/README.md

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* Revert "Wrap code into main function for accelerate launcher to find"

This reverts commit ff11df1c810d4df198d04b827538eb4572147ba3.

* Fix formatting issue

* Move git-lfs instructions to installation section

* Add a quick check before code generation for code evaluation

* Fix styling issue

* Update examples/research_projects/codeparrot/scripts/human_eval.py

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>

* Make iterable dataset use passed in tokenizer rather than globally defined one

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
Co-authored-by: ncoop57 <nac33@students.uwf.edu>
---
 examples/research_projects/codeparrot/README.md     |  7 +++++--
 .../research_projects/codeparrot/requirements.txt   |  2 +-
 .../codeparrot/scripts/arguments.py                 | 10 ++++++++++
 .../codeparrot/scripts/codeparrot_training.py       |  2 +-
 .../codeparrot/scripts/human_eval.py                | 13 +++++++++++--
 5 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md
index 858334f2f2..cf0b99345c 100644
--- a/examples/research_projects/codeparrot/README.md
+++ b/examples/research_projects/codeparrot/README.md
@@ -31,6 +31,8 @@ Before you run any of the scripts make sure you are logged in and can push to th
 huggingface-cli login
 ```
 
+Additionally, make sure you have git-lfs installed. You can find instructions for how to install it [here](https://git-lfs.github.com/).
+
 ## Dataset
 The source of the dataset is the GitHub dump available on Google's [BigQuery](https://cloud.google.com/blog/topics/public-datasets/github-on-bigquery-analyze-all-the-open-source-code). The database was queried for all Python files with less than 1MB in size resulting in a 180GB dataset with over 20M files. The dataset is available on the Hugging Face Hub [here](https://huggingface.co/datasets/transformersbook/codeparrot).
 
@@ -96,7 +98,7 @@ If you want to train the small model you need to make some modifications:
 accelerate launch scripts/codeparrot_training.py \
 --model_ckpt lvwerra/codeparrot-small \
 --train_batch_size 12 \
---eval_batch_size 12 \
+--valid_batch_size 12 \
 --learning_rate 5e-4 \
 --num_warmup_steps 2000 \
 --gradient_accumulation 1 \
@@ -125,7 +127,8 @@ python scripts/human_eval.py --model_ckpt lvwerra/codeparrot \
 --do_sample True \
 --temperature 0.2 \
 --top_p 0.95 \
---n_samples=200
+--n_samples=200 \
+--HF_ALLOW_CODE_EVAL="0"
 ```
 
 The results as well as reference values are shown in the following table:
diff --git a/examples/research_projects/codeparrot/requirements.txt b/examples/research_projects/codeparrot/requirements.txt
index 7f1b864de2..3333e3dc37 100644
--- a/examples/research_projects/codeparrot/requirements.txt
+++ b/examples/research_projects/codeparrot/requirements.txt
@@ -4,4 +4,4 @@ accelerate==0.5.1
 wandb==0.12.0
 tensorboard==2.6.0
 torch==1.9.0
-huggingface-hub==0.0.19
\ No newline at end of file
+huggingface-hub==0.1.0
\ No newline at end of file
diff --git a/examples/research_projects/codeparrot/scripts/arguments.py b/examples/research_projects/codeparrot/scripts/arguments.py
index 894fc11604..ee4a23ec29 100644
--- a/examples/research_projects/codeparrot/scripts/arguments.py
+++ b/examples/research_projects/codeparrot/scripts/arguments.py
@@ -83,6 +83,10 @@ class HumanEvalArguments:
         metadata={"help": "Model name or path of model to be evaluated."},
     )
     num_workers: Optional[int] = field(default=None, metadata={"help": "Number of workers used for code evaluation."})
+    num_tasks: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of human-eval tasks to run. If not included all tasks are evaluated."},
+    )
     do_sample: Optional[bool] = field(
         default=True, metadata={"help": "Sample from the language model's output distribution."}
     )
@@ -101,6 +105,12 @@ class HumanEvalArguments:
     HF_ALLOW_CODE_EVAL: Optional[str] = field(
         default="0", metadata={"help": "Allow `code_eval` to execute Python code on machine"}
     )
+    device_int: Optional[int] = field(
+        default=-1,
+        metadata={
+            "help": "Determine which device to run the `text-generation` Pipeline on. -1 is CPU and any zero or positive number corresponds to which GPU device id to run on."
+        },
+    )
 
 
 @dataclass
diff --git a/examples/research_projects/codeparrot/scripts/codeparrot_training.py b/examples/research_projects/codeparrot/scripts/codeparrot_training.py
index 771109b1d3..9cc42d3881 100644
--- a/examples/research_projects/codeparrot/scripts/codeparrot_training.py
+++ b/examples/research_projects/codeparrot/scripts/codeparrot_training.py
@@ -59,7 +59,7 @@ class ConstantLengthDataset(IterableDataset):
                     else:
                         more_examples = False
                         break
-            tokenized_inputs = tokenizer(buffer, truncation=False)["input_ids"]
+            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
             all_token_ids = []
             for tokenized_input in tokenized_inputs:
                 all_token_ids.extend(tokenized_input + [self.concat_token_id])
diff --git a/examples/research_projects/codeparrot/scripts/human_eval.py b/examples/research_projects/codeparrot/scripts/human_eval.py
index 6812c8e7f0..d70655b996 100644
--- a/examples/research_projects/codeparrot/scripts/human_eval.py
+++ b/examples/research_projects/codeparrot/scripts/human_eval.py
@@ -51,14 +51,23 @@ def main():
     # Load model and tokenizer
     tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
     model = AutoModelForCausalLM.from_pretrained(args.model_ckpt)
-    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
+    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=args.device_int)
 
     # Load evaluation dataset and metric
     human_eval = load_dataset("openai_humaneval")
     code_eval_metric = load_metric("code_eval")
 
+    # Run a quick test to see if code evaluation is enabled
+    try:
+        _ = code_eval_metric.compute(references=[""], predictions=[[""]])
+    except ValueError as exception:
+        print(
+            'Code evaluation not enabled. Read the warning below carefully and then use `--HF_ALLOW_CODE_EVAL="1"` flag to enable code evaluation.'
+        )
+        raise exception
+
     # Generate completions for evaluation set
-    n_tasks = 4  # len(human_eval["test"])
+    n_tasks = args.num_tasks if args.num_tasks is not None else len(human_eval["test"])
     generations, references = [], []
     for task in tqdm(range(n_tasks)):
         task_generations = []