From f5658732d5bd38657b6cf3138f69373a94866c61 Mon Sep 17 00:00:00 2001
From: JINO ROHIT <find.jinorohit@gmail.com>
Date: Mon, 8 Apr 2024 17:19:59 +0530
Subject: [PATCH] fixing issue 30034 - adding data format for run_ner.py
 (#30088)

---
 examples/pytorch/token-classification/README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/examples/pytorch/token-classification/README.md b/examples/pytorch/token-classification/README.md
index 568e5242fe..b880b82030 100644
--- a/examples/pytorch/token-classification/README.md
+++ b/examples/pytorch/token-classification/README.md
@@ -25,6 +25,20 @@ customize it to your needs if you need extra processing on your datasets.
 It will either run on a datasets hosted on our [hub](https://huggingface.co/datasets) or with your own text files for
 training and validation, you might just need to add some tweaks in the data preprocessing.
 
+### Using your own data
+
+If you use your own data, the script expects the following format of the data -
+
+```bash
+{
+    "chunk_tags": [11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 13, 11, 21, 22, 11, 12, 17, 11, 21, 17, 11, 12, 12, 21, 22, 22, 13, 11, 0],
+    "id": "0",
+    "ner_tags": [0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    "pos_tags": [12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 35, 24, 35, 37, 16, 21, 15, 24, 41, 15, 16, 21, 21, 20, 37, 40, 35, 21, 7],
+    "tokens": ["The", "European", "Commission", "said", "on", "Thursday", "it", "disagreed", "with", "German", "advice", "to", "consumers", "to", "shun", "British", "lamb", "until", "scientists", "determine", "whether", "mad", "cow", "disease", "can", "be", "transmitted", "to", "sheep", "."]
+}
+```
+
 The following example fine-tunes BERT on CoNLL-2003:
 
 ```bash