From 7915a25976a6000f71b9197d963e56868f42e77e Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Thu, 18 Apr 2024 09:30:40 +0100 Subject: [PATCH] Fix donut token2json multiline (#30300) * Fix multiline processing * Update test for token2json --- src/transformers/models/donut/processing_donut.py | 4 +++- tests/models/donut/test_processing_donut.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 5636ecb943..1f03fd6306 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -149,7 +149,9 @@ class DonutProcessor(ProcessorMixin): end_token = end_token.group() start_token_escaped = re.escape(start_token) end_token_escaped = re.escape(end_token) - content = re.search(f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE) + content = re.search( + f"{start_token_escaped}(.*?){end_token_escaped}", tokens, re.IGNORECASE | re.DOTALL + ) if content is not None: content = content.group(1).strip() if r"GA30301123-4567" "Johnny" "JD" + "text\nwith\nnewlines" + "" ) actual_json = self.processor.token2json(sequence)