Skip to content

Commit 244528e

Browse files
committed
Docx reader: properly handle media stored with packaged-rooted paths...
...such as /media. This can be produced by the online version of Word. (Desktop Word uses paths relative to word/.) With this change, media stored at an absolute path will be stored with its original name and not given a SHA1-based name. Closes #11518.
1 parent f0d05eb commit 244528e

File tree

1 file changed

+23
-15
lines changed

1 file changed

+23
-15
lines changed

src/Text/Pandoc/Readers/Docx/Parse.hs

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -540,30 +540,26 @@ filePathToRelType path docXmlPath =
540540
then Just InDocument
541541
else Nothing
542542

543-
relElemToRelationship :: FilePath -> DocumentLocation -> Element
543+
relElemToRelationship :: DocumentLocation -> Element
544544
-> Maybe Relationship
545-
relElemToRelationship fp relType element | qName (elName element) == "Relationship" =
545+
relElemToRelationship relType element | qName (elName element) == "Relationship" =
546546
do
547547
relId <- findAttr (QName "Id" Nothing Nothing) element
548548
target <- findAttr (QName "Target" Nothing Nothing) element
549-
-- target may be relative (media/image1.jpeg) or absolute
550-
-- (/word/media/image1.jpeg); we need to relativize it (see #7374)
551-
let frontOfFp = T.pack $ takeWhile (/= '_') fp
552-
let target' = fromMaybe target $
553-
T.stripPrefix frontOfFp $ T.dropWhile (== '/') target
554-
return $ Relationship relType relId target'
555-
relElemToRelationship _ _ _ = Nothing
549+
-- target may be relative (media/image1.jpeg) or absolute (/word/media/image1.jpg)
550+
return $ Relationship relType relId target
551+
relElemToRelationship _ _ = Nothing
556552

557553
extractTarget :: Element -> Maybe Target
558-
extractTarget element = do (Relationship _ _ target) <- relElemToRelationship "word/" InDocument element
554+
extractTarget element = do (Relationship _ _ target) <- relElemToRelationship InDocument element
559555
return target
560556

561557
filePathToRelationships :: Archive -> FilePath -> FilePath -> [Relationship]
562558
filePathToRelationships ar docXmlPath fp
563559
| Just relType <- filePathToRelType fp docXmlPath
564560
, Just entry <- findEntryByPath fp ar
565561
, Just relElems <- parseXMLFromEntry entry =
566-
mapMaybe (relElemToRelationship fp relType) $ elChildren relElems
562+
mapMaybe (relElemToRelationship relType) $ elChildren relElems
567563
filePathToRelationships _ _ _ = []
568564

569565
archiveToRelationships :: Archive -> FilePath -> [Relationship]
@@ -934,11 +930,23 @@ expandDrawingId s = do
934930
case target of
935931
Just filepath -> do
936932
media <- asks envMedia
937-
let filepath' = case filepath of
938-
('/':rest) -> rest
939-
_ -> "word/" ++ filepath
933+
-- the mediaName is the name we store it under in the mediabag.
934+
-- This is derived from the path, which might be absolute, e.g. /media/foo.jpg,
935+
-- or relative, media/foo.jpg (interpreted as /word/media/foo.jpg).
936+
-- The mediaName will strip off any leading `/` or `word/`.
937+
-- We assume here that the media will not be stored *both* in
938+
-- /media and in /word/media, which would lead to a name conflict
939+
-- given the scheme here for generating a mediaName.
940+
let (filepath', mediaName) =
941+
case filepath of
942+
('/':rest) -- absolute path e.g. /media/foo.jpg
943+
-> (rest, case stripPrefix "word/" rest of
944+
Just rest' -> rest'
945+
Nothing -> rest)
946+
_ -- rel path to word, e.g. media/foo.jpg
947+
-> ("word/" ++ filepath, filepath)
940948
case lookup filepath' media of
941-
Just bs -> return (filepath, bs)
949+
Just bs -> return (mediaName, bs)
942950
Nothing -> throwError DocxError
943951
Nothing -> throwError DocxError
944952

0 commit comments

Comments
 (0)