Skip to content

Commit 9455e2b

Browse files
authored
Atom: use correct xml:base for decoded elements (#222)
* Atom: use correct xml:base for decoded elements. In order to keep tracking xml:base correctly, goxpp's `DecodeElement` pops the BaseStack if the start element added a base (if any). That means the Atom parser needs to keep track of the base *before* calling `DecodeElement`, so that it can be used for resolving relative URLs within the decoded element. Without this fix, elements with xml:base attributes are erroneously resolved against the parent xml:base. * Depend on an updated goxpp version without the xml:base bug. * Resolve xml:base URLs without switching out the BaseStack. This provides an equivalent fix that doesn't do any inelegant swapping out of the BaseStack. It also doesn't change `goxpp`'s public API, since it essentially copies `XmlBaseResolveUrl` into `gofeed`.
1 parent 8340fbd commit 9455e2b

File tree

4 files changed

+34
-11
lines changed

4 files changed

+34
-11
lines changed

atom/parser.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,8 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
658658
InnerXML string `xml:",innerxml"`
659659
}
660660

661+
// get current base URL before it is clobbered by DecodeElement
662+
base := p.BaseStack.Top()
661663
err := p.DecodeElement(&text)
662664
if err != nil {
663665
return "", err
@@ -672,7 +674,7 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
672674
if strings.Contains(result, "<![CDATA[") {
673675
result = shared.StripCDATA(result)
674676
if lowerType == "html" || strings.Contains(lowerType, "xhtml") {
675-
result, _ = shared.ResolveHTML(p, result)
677+
result, _ = shared.ResolveHTML(base, result)
676678
}
677679
} else {
678680
// decode non-CDATA contents depending on type
@@ -683,12 +685,12 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
683685
result, err = shared.DecodeEntities(result)
684686
} else if strings.Contains(lowerType, "xhtml") {
685687
result = ap.stripWrappingDiv(result)
686-
result, _ = shared.ResolveHTML(p, result)
688+
result, _ = shared.ResolveHTML(base, result)
687689
} else if lowerType == "html" {
688690
result = ap.stripWrappingDiv(result)
689691
result, err = shared.DecodeEntities(result)
690692
if err == nil {
691-
result, _ = shared.ResolveHTML(p, result)
693+
result, _ = shared.ResolveHTML(base, result)
692694
}
693695
} else {
694696
decodedStr, err := base64.StdEncoding.DecodeString(result)
@@ -701,7 +703,7 @@ func (ap *Parser) parseAtomText(p *xpp.XMLPullParser) (string, error) {
701703
// resolve relative URIs in URI-containing elements according to xml:base
702704
name := strings.ToLower(p.Name)
703705
if atomUriElements[name] {
704-
resolved, err := p.XmlBaseResolveUrl(result)
706+
resolved, err := shared.XmlBaseResolveUrl(base, result)
705707
if resolved != nil && err == nil {
706708
result = resolved.String()
707709
}

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ go 1.19
55
require (
66
github.com/PuerkitoBio/goquery v1.8.0
77
github.com/json-iterator/go v1.1.12
8-
github.com/mmcdole/goxpp v1.1.0
8+
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23
99
github.com/stretchr/testify v1.8.1
1010
github.com/urfave/cli v1.22.3
1111
golang.org/x/net v0.4.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs
1111
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
1212
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
1313
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
14-
github.com/mmcdole/goxpp v1.1.0 h1:WwslZNF7KNAXTFuzRtn/OKZxFLJAAyOA9w82mDz2ZGI=
15-
github.com/mmcdole/goxpp v1.1.0/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
14+
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk=
15+
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23/go.mod h1:v+25+lT2ViuQ7mVxcncQ8ch1URund48oH+jhjiwEgS8=
1616
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
1717
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
1818
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=

internal/shared/xmlbase.go

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package shared
33
import (
44
"bytes"
55
"fmt"
6+
"net/url"
67
"strings"
78

89
xpp "github.com/mmcdole/goxpp"
@@ -83,7 +84,7 @@ func resolveAttrs(p *xpp.XMLPullParser) error {
8384
for i, attr := range p.Attrs {
8485
lowerName := strings.ToLower(attr.Name.Local)
8586
if uriAttrs[lowerName] {
86-
absURL, err := p.XmlBaseResolveUrl(attr.Value)
87+
absURL, err := XmlBaseResolveUrl(p.BaseStack.Top(), attr.Value)
8788
if err != nil {
8889
return err
8990
}
@@ -95,11 +96,31 @@ func resolveAttrs(p *xpp.XMLPullParser) error {
9596
return nil
9697
}
9798

99+
// XmlBaseResolveUrl resolves the URL reference u against the base URL b.
//
// If u cannot be parsed, a nil URL and the parse error are returned. If b is
// nil, the parsed form of u is returned unchanged. Otherwise u is resolved
// against b using (*url.URL).ResolveReference.
//
// A non-empty base path that does not end in "/" is treated as a directory
// when u is non-empty: a trailing slash is added before resolving, so a base
// of "http://host/feed" and a reference of "img.png" yield
// "http://host/feed/img.png". The slash is applied to a local copy of the
// base, so the URL pointed to by b is never modified.
func XmlBaseResolveUrl(b *url.URL, u string) (*url.URL, error) {
	relURL, err := url.Parse(u)
	if err != nil {
		return nil, err
	}

	if b == nil {
		return relURL, nil
	}

	if b.Path != "" && u != "" && !strings.HasSuffix(b.Path, "/") {
		// There's no reason someone would use a path in xml:base if they
		// didn't mean for it to be a directory. Work on a copy: b is usually
		// a shared entry of the parser's base stack, and writing the slash
		// back into it would make later resolutions depend on call order.
		dirBase := *b
		dirBase.Path += "/"
		b = &dirBase
	}

	return b.ResolveReference(relURL), nil
}
118+
98119
// Transforms html by resolving any relative URIs in attributes
99120
// if an error occurs during parsing or serialization, then the original string
100121
// is returned along with the error.
101-
func ResolveHTML(p *xpp.XMLPullParser, relHTML string) (string, error) {
102-
if p.BaseStack.Top() == nil {
122+
func ResolveHTML(base *url.URL, relHTML string) (string, error) {
123+
if base == nil {
103124
return relHTML, nil
104125
}
105126

@@ -117,7 +138,7 @@ func ResolveHTML(p *xpp.XMLPullParser, relHTML string) (string, error) {
117138
if n.Type == html.ElementNode {
118139
for i, a := range n.Attr {
119140
if htmlURIAttrs[a.Key] {
120-
absVal, err := p.XmlBaseResolveUrl(a.Val)
141+
absVal, err := XmlBaseResolveUrl(base, a.Val)
121142
if absVal != nil && err == nil {
122143
n.Attr[i].Val = absVal.String()
123144
}

0 commit comments

Comments
 (0)