Skip to content

Commit f0d05eb

Browse files
authored
Docx reader: Recognize media inside textboxes (#11515)
Closes #11053.
1 parent d39ca21 commit f0d05eb

File tree

6 files changed

+94
-9
lines changed

6 files changed

+94
-9
lines changed

src/Text/Pandoc/Readers/Docx/Parse.hs

Lines changed: 20 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -160,21 +160,32 @@ unwrapElement ns element
160160
| isElem ns "w" "smartTag" element
161161
= concatMap (unwrapElement ns) (elChildren element)
162162
| isElem ns "w" "p" element
163-
, textboxes@(_:_) <- findChildrenByName ns "w" "r" element >>=
164-
findChildrenByName ns "mc" "AlternateContent" >>=
165-
findChildrenByName ns "mc" "Fallback" >>=
166-
findChildrenByName ns "w" "pict" >>=
167-
(\e -> findChildrenByName ns "v" "shape" e <>
168-
findChildrenByName ns "v" "rect" e) >>=
169-
findChildrenByName ns "v" "textbox" >>=
170-
findChildrenByName ns "w" "txbxContent"
171-
= concatMap (unwrapElement ns) (concatMap elChildren textboxes) -- handle #9214
163+
, textboxes@(_:_) <- findChildrenByName ns "w" "r" element >>= findTextboxes
164+
= concatMap (unwrapElement ns) (concatMap elChildren textboxes) -- handle #9214, #11053
172165
| isElem ns "w" "r" element
173166
, Just fallback <- findChildByName ns "mc" "AlternateContent" element >>=
174167
findChildByName ns "mc" "Fallback"
175168
= [element{ elContent = concatMap (unwrapContent ns) (elContent fallback) }]
176169
| otherwise
177170
= [element{ elContent = concatMap (unwrapContent ns) (elContent element) }]
171+
where
172+
-- Search textbox content in the run's effective children.
173+
-- If AlternateContent is present, use only the fallback branch,
174+
-- matching the w:r unwrapping logic and avoiding duplicate textbox
175+
-- extraction when both direct and fallback encodings are present.
176+
findRunFallback run =
177+
findChildByName ns "mc" "AlternateContent" run >>=
178+
findChildByName ns "mc" "Fallback"
179+
findTextboxes run =
180+
findTextboxContent =<<
181+
case findRunFallback run of
182+
Just fallback -> findChildrenByName ns "w" "pict" fallback
183+
Nothing -> findChildrenByName ns "w" "pict" run
184+
findTextboxContent pict =
185+
(findChildrenByName ns "v" "shape" pict <>
186+
findChildrenByName ns "v" "rect" pict) >>=
187+
findChildrenByName ns "v" "textbox" >>=
188+
findChildrenByName ns "w" "txbxContent"
178189

179190
unwrapContent :: NameSpaces -> Content -> [Content]
180191
unwrapContent ns (Elem element) = map Elem $ unwrapElement ns element

test/Tests/Readers/Docx.hs

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -215,6 +215,14 @@ tests = [ testGroup "document"
215215
"text in shape format"
216216
"docx/text_in_shape_format.docx"
217217
"docx/text_in_shape_format.native"
218+
, testCompare
219+
"image inside textbox content"
220+
"docx/textbox_image.docx"
221+
"docx/textbox_image.native"
222+
, testCompare
223+
"image inside textbox content with duplicate encoding"
224+
"docx/textbox_image_duplicate_encoding.docx"
225+
"docx/textbox_image_duplicate_encoding.native"
218226
]
219227
, testGroup "blocks"
220228
[ testCompare
@@ -473,6 +481,12 @@ tests = [ testGroup "document"
473481
[ testMediaBag
474482
"image extraction"
475483
"docx/image.docx"
484+
, testMediaBag
485+
"image inside textbox content populates media bag"
486+
"docx/textbox_image.docx"
487+
, testMediaBag
488+
"image inside textbox content with duplicate encoding populates media bag"
489+
"docx/textbox_image_duplicate_encoding.docx"
476490
]
477491
, testGroup "custom styles"
478492
[ testCompare

test/docx/textbox_image.docx

39.4 KB
Binary file not shown.

test/docx/textbox_image.native

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,30 @@
1+
[ Para
2+
[ Str "The"
3+
, Space
4+
, Str "image"
5+
, Space
6+
, Str "is"
7+
, Space
8+
, Str "below."
9+
]
10+
, Para
11+
[ Image
12+
( ""
13+
, []
14+
, [ ( "width" , "4.543038057742782in" )
15+
, ( "height" , "2.9166666666666665in" )
16+
]
17+
)
18+
[]
19+
( "media/image1.png" , "" )
20+
]
21+
, Para
22+
[ Str "The"
23+
, Space
24+
, Str "image"
25+
, Space
26+
, Str "is"
27+
, Space
28+
, Str "above."
29+
]
30+
]
test/docx/textbox_image_duplicate_encoding.docx

35.9 KB
Binary file not shown.
test/docx/textbox_image_duplicate_encoding.native

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,30 @@
1+
[ Para
2+
[ Str "The"
3+
, Space
4+
, Str "image"
5+
, Space
6+
, Str "is"
7+
, Space
8+
, Str "below."
9+
]
10+
, Para
11+
[ Image
12+
( ""
13+
, []
14+
, [ ( "width" , "4.543038057742782in" )
15+
, ( "height" , "2.9166666666666665in" )
16+
]
17+
)
18+
[]
19+
( "media/image1.png" , "" )
20+
]
21+
, Para
22+
[ Str "The"
23+
, Space
24+
, Str "image"
25+
, Space
26+
, Str "is"
27+
, Space
28+
, Str "above."
29+
]
30+
]

0 commit comments

Comments
 (0)