Add POS tagging and Phrase chunking token classification examples (#6457)

* Add more token classification examples

* POS tagging example

* Phrase chunking example

* PR review fixes

* Add conllu to third party list (used in token classification examples)
This commit is contained in:
vblagoje
2020-08-13 12:09:51 -04:00
committed by GitHub
parent f51161e230
commit eda07efaa5
10 changed files with 473 additions and 204 deletions

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env bash
#
# Fine-tune a BERT model for part-of-speech tagging on the Universal
# Dependencies English-EWT treebank by invoking run_pl_ner.py with
# --task_type POS.
#
# Run this from the directory that contains run_pl_ner.py: the dataset
# splits are downloaded into, and read from, the current directory.

# Stop on the first failed command, unset variable, or failed pipeline stage
# so that training never starts on missing or broken data.
set -euo pipefail

readonly UD_BASE_URL='https://github.com/UniversalDependencies/UD_English-EWT/raw/master'

#######################################
# Download one dataset split unless ./<split>.txt already exists.
# Globals:   UD_BASE_URL (read)
# Arguments: $1 - split name (dev, test or train)
# Outputs:   progress message on stdout; writes ./<split>.txt
# Returns:   0 if the file exists or was downloaded, non-zero otherwise
#######################################
fetch_split() {
  local split=$1
  local dest="./${split}.txt"
  local partial="${dest}.part"

  if [[ -f "${dest}" ]]; then
    return 0
  fi

  echo "Download ${split} dataset...."
  # -f turns HTTP errors into a non-zero exit instead of saving the error
  # page as the dataset. Downloading to a side file and renaming afterwards
  # keeps an interrupted transfer from being mistaken for a complete file on
  # the next run.
  if ! curl -fL -o "${partial}" "${UD_BASE_URL}/en_ewt-ud-${split}.conllu"; then
    rm -f -- "${partial}"
    echo "Failed to download ${split} dataset" >&2
    return 1
  fi
  mv -- "${partial}" "${dest}"
}

for split in dev test train; do
  fetch_split "${split}"
done

export MAX_LENGTH=200
export BERT_MODEL=bert-base-uncased
export OUTPUT_DIR=postagger-model
export BATCH_SIZE=32
export NUM_EPOCHS=3
# NOTE(review): SAVE_STEPS is exported but not passed on the command line
# below; kept in case the Python side reads it from the environment.
export SAVE_STEPS=750
export SEED=1

# Add parent directory to python path to access lightning_base.py.
# The ${VAR:+...} form appends the previous value only when it is non-empty,
# which avoids a trailing empty path entry and works under `set -u`.
export PYTHONPATH="../${PYTHONPATH:+:${PYTHONPATH}}"

python3 run_pl_ner.py --data_dir ./ \
  --task_type POS \
  --model_name_or_path "${BERT_MODEL}" \
  --output_dir "${OUTPUT_DIR}" \
  --max_seq_length "${MAX_LENGTH}" \
  --num_train_epochs "${NUM_EPOCHS}" \
  --train_batch_size "${BATCH_SIZE}" \
  --seed "${SEED}" \
  --gpus 1 \
  --do_train \
  --do_predict