Fix donut token2json multiline (#30300)
* Fix multiline processing
* Update test for token2json
This commit is contained in:
committed by
GitHub
parent
b65df514d1
commit
7915a25976
@@ -149,7 +149,9 @@ class DonutProcessor(ProcessorMixin):
|
||||
end_token = end_token.group()
|
||||
start_token_escaped = re.escape(start_token)
|
||||
end_token_escaped = re.escape(end_token)
|
||||
content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE)
|
||||
content = re.search(
|
||||
f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE | re.DOTALL
|
||||
)
|
||||
if content is not None:
|
||||
content = content.group(1).strip()
|
||||
if r"<s_" in content and r"</s_" in content: # non-leaf node
|
||||
|
||||
@@ -35,6 +35,8 @@ class DonutProcessorTest(unittest.TestCase):
|
||||
"zip": "30301",
|
||||
"phone": "123-4567",
|
||||
"nicknames": [{"nickname": "Johnny"}, {"nickname": "JD"}],
|
||||
"multiline": "text\nwith\nnewlines",
|
||||
"empty": "",
|
||||
}
|
||||
|
||||
sequence = (
|
||||
@@ -42,6 +44,8 @@ class DonutProcessorTest(unittest.TestCase):
|
||||
"<s_state>GA</s_state><s_zip>30301</s_zip><s_phone>123-4567</s_phone>"
|
||||
"<s_nicknames><s_nickname>Johnny</s_nickname>"
|
||||
"<sep/><s_nickname>JD</s_nickname></s_nicknames>"
|
||||
"<s_multiline>text\nwith\nnewlines</s_multiline>"
|
||||
"<s_empty></s_empty>"
|
||||
)
|
||||
actual_json = self.processor.token2json(sequence)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user