From f90bc44d9a8f16862b5118cca06ab3e997b6cdce Mon Sep 17 00:00:00 2001 From: Sam Shleifer Date: Tue, 9 Jun 2020 17:38:28 -0400 Subject: [PATCH] [examples] Cleanup summarization docs (#4876) --- examples/summarization/README.md | 28 +--------------- .../summarization/download_cnn_daily_mail.py | 32 ------------------- examples/summarization/finetune_bart.sh | 2 +- examples/summarization/finetune_bart_tiny.sh | 2 +- examples/summarization/finetune_t5.sh | 2 +- 5 files changed, 4 insertions(+), 62 deletions(-) delete mode 100644 examples/summarization/download_cnn_daily_mail.py diff --git a/examples/summarization/README.md b/examples/summarization/README.md index 730d29400c..ad4adc4b55 100644 --- a/examples/summarization/README.md +++ b/examples/summarization/README.md @@ -1,7 +1,4 @@ ### Get CNN Data -Both types of models do require CNN data and follow different procedures of obtaining so. - -#### For BART models To be able to reproduce the authors' results on the CNN/Daily Mail dataset you first need to download both CNN and Daily Mail datasets [from Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the links next to "Stories") in the same folder. Then uncompress the archives by running: ```bash @@ -12,40 +9,17 @@ tar -xzvf cnn_dm.tgz this should make a directory called cnn_dm/ with files like `test.source`. To use your own data, copy that files format. Each article to be summarized is on its own line. -#### For T5 models -First, you need to download the CNN data. It's about ~400 MB and can be downloaded by -running - -```bash -python download_cnn_daily_mail.py cnn_articles_input_data.txt cnn_articles_reference_summaries.txt -``` - -You should confirm that each file has 11490 lines: - -```bash -wc -l cnn_articles_input_data.txt # should print 11490 -wc -l cnn_articles_reference_summaries.txt # should print 11490 -``` - ### Evaluation To create summaries for each article in dataset, run: ```bash -python evaluate_cnn.py test_generations.txt +python evaluate_cnn.py test_generations.txt --score_path rouge_scores.txt ``` The default batch size, 8, fits in 16GB GPU memory, but may need to be adjusted to fit your system. ### Training Run/modify `finetune_bart.sh` or `finetune_t5.sh` -## (WIP) Rouge Scores - -To create summaries for each article in dataset and also calculate rouge scores run: -```bash -python evaluate_cnn.py test_generations.txt --reference_path --score_path -``` -The rouge scores "rouge1, rouge2, rougeL" are automatically created and saved in ````. - ### Stanford CoreNLP Setup ``` ptb_tokenize () { diff --git a/examples/summarization/download_cnn_daily_mail.py b/examples/summarization/download_cnn_daily_mail.py deleted file mode 100644 index 4fd0edfa98..0000000000 --- a/examples/summarization/download_cnn_daily_mail.py +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -import argparse -from pathlib import Path - -import tensorflow_datasets as tfds - - -def main(input_path, reference_path, data_dir): - cnn_ds = tfds.load("cnn_dailymail", split="test", shuffle_files=False, data_dir=data_dir) - cnn_ds_iter = tfds.as_numpy(cnn_ds) - - test_articles_file = Path(input_path).open("w", encoding="utf-8") - test_summaries_file = Path(reference_path).open("w", encoding="utf-8") - - for example in cnn_ds_iter: - test_articles_file.write(example["article"].decode("utf-8") + "\n") - test_articles_file.flush() - test_summaries_file.write(example["highlights"].decode("utf-8").replace("\n", " ") + "\n") - test_summaries_file.flush() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("input_path", type=str, help="where to save the articles input data") - parser.add_argument( - "reference_path", type=str, help="where to save the reference summaries", - ) - parser.add_argument( - "--data_dir", type=str, default="~/tensorflow_datasets", help="where to save the tensorflow datasets.", - ) - args = parser.parse_args() - main(args.input_path, args.reference_path, args.data_dir) diff --git a/examples/summarization/finetune_bart.sh b/examples/summarization/finetune_bart.sh index 608047fca3..b37888f5f4 100644 --- a/examples/summarization/finetune_bart.sh +++ b/examples/summarization/finetune_bart.sh @@ -6,7 +6,7 @@ export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} mkdir -p $OUTPUT_DIR # Add parent directory to python path to access lightning_base.py -export PYTHONPATH="../../":"${PYTHONPATH}" +export PYTHONPATH="../":"${PYTHONPATH}" python finetune.py \ --data_dir=./cnn-dailymail/cnn_dm \ diff --git a/examples/summarization/finetune_bart_tiny.sh b/examples/summarization/finetune_bart_tiny.sh index b04bf40264..6799e9bb32 100644 --- a/examples/summarization/finetune_bart_tiny.sh +++ b/examples/summarization/finetune_bart_tiny.sh @@ -13,7 +13,7 @@ export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} mkdir -p $OUTPUT_DIR # Add parent directory to python path to access lightning_base.py and utils.py -export PYTHONPATH="../../":"${PYTHONPATH}" +export PYTHONPATH="../":"${PYTHONPATH}" python finetune.py \ --data_dir=cnn_tiny/ \ --model_type=bart \ diff --git a/examples/summarization/finetune_t5.sh b/examples/summarization/finetune_t5.sh index 37d356e19d..6f831fae2b 100644 --- a/examples/summarization/finetune_t5.sh +++ b/examples/summarization/finetune_t5.sh @@ -6,7 +6,7 @@ export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME} mkdir -p $OUTPUT_DIR # Add parent directory to python path to access lightning_base.py -export PYTHONPATH="../../":"${PYTHONPATH}" +export PYTHONPATH="../":"${PYTHONPATH}" python finetune.py \ --data_dir=./cnn-dailymail/cnn_dm \