Skip to content

Commit 66c99a0

Browse files
transform external bodyXML to content-tree in Go
1 parent 67054db commit 66c99a0

File tree

9 files changed

+1428
-52
lines changed

9 files changed

+1428
-52
lines changed

content_tree.go

Lines changed: 683 additions & 45 deletions
Large diffs are not rendered by default.

go.mod

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
11
module github.com/Financial-Times/content-tree
22

3-
go 1.23.5
3+
go 1.25
4+
5+
require github.com/beevik/etree v1.6.0
6+
7+
require github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
github.com/beevik/etree v1.6.0 h1:u8Kwy8pp9D9XeITj2Z0XtA5qqZEmtJtuXZRQi+j03eE=
2+
github.com/beevik/etree v1.6.0/go.mod h1:bh4zJxiIr62SOf9pRzN7UUYaEDa9HEKafK25+sLc0Gc=
3+
github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1 h1:dOYG7LS/WK00RWZc8XGgcUTlTxpp3mKhdR2Q9z9HbXM=
4+
github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1/go.mod h1:mpRZBD8SJ55OIICQ3iWH0Yz3cjzA61JdqMLoWXeB2+8=
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# XML to Content Tree Transformer
2+
3+
## Overview
4+
The Transformer converts an external XHTML-formatted document into a content tree.
5+
It supports the format stored in the **internalComponent** collection as well as the one returned by the **Internal Content API**.
6+
The latter is produced by the content-public-read service after applying certain transformations to the bodyXML it retrieves from the internalComponents collection.
7+
These transformations include renaming the content, related, and concept tags to ft-content, ft-related, and ft-concept, respectively, and replacing the id attribute with url, with a few caveats.
8+
9+
## Usage
10+
11+
```go
12+
package main
13+
14+
import (
15+
"fmt"
16+
"log"
17+
18+
tocontenttree "github.com/Financial-Times/content-tree"
19+
)
20+
21+
func main() {
22+
xmlInput := `<body><p>Hello World</p></body>`
23+
24+
out, err := tocontenttree.Transform(xmlInput)
25+
if err != nil {
26+
log.Fatalf("Transform (XmlToTree) failed: %v", err)
27+
}
28+
29+
fmt.Printf("Transformed content tree: %+v\n", out)
30+
}
31+
```
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
package tocontenttree
2+
3+
import (
4+
"strings"
5+
6+
"github.com/beevik/etree"
7+
)
8+
9+
// layoutwidth is the string type returned by the layout width validators
// in this file.
type layoutwidth string

// toValidLayoutWidth converts w to a layoutwidth. Values outside the
// accepted set, including the empty string, become "full-width".
func toValidLayoutWidth(w string) layoutwidth {
	accepted := [...]string{
		"auto", "in-line", "inset-left", "inset-right",
		"full-bleed", "full-grid", "mid-grid", "full-width",
	}
	for _, candidate := range accepted {
		if w == candidate {
			return layoutwidth(candidate)
		}
	}
	return "full-width"
}
20+
21+
func toValidClipLayoutWidth(w string) layoutwidth {
22+
switch w {
23+
case "in-line", "full-grid", "mid-grid":
24+
return layoutwidth(w)
25+
default:
26+
return "in-line"
27+
}
28+
29+
}
30+
31+
func findChild(el *etree.Element, tag string) *etree.Element {
32+
for _, ch := range el.ChildElements() {
33+
if ch.Tag == tag {
34+
return ch
35+
}
36+
if found := findChild(ch, tag); found != nil {
37+
return found
38+
}
39+
}
40+
return nil
41+
}
42+
43+
func textContent(el *etree.Element) string {
44+
var b strings.Builder
45+
for _, tok := range el.Child {
46+
switch t := tok.(type) {
47+
case *etree.CharData:
48+
b.WriteString(t.Data)
49+
case *etree.Element:
50+
b.WriteString(textContent(t))
51+
}
52+
}
53+
return b.String()
54+
}
55+
56+
func flattenedChildren(el *etree.Element) []etree.Token {
57+
out := make([]etree.Token, 0, len(el.Child))
58+
for _, tok := range el.Child {
59+
if d, ok := tok.(*etree.Element); ok && d.Tag == "div" {
60+
out = append(out, d.Child...)
61+
} else {
62+
out = append(out, tok)
63+
}
64+
}
65+
return out
66+
}
67+
68+
// valueOr returns v, or fallback when v is the empty string.
func valueOr(v, fallback string) string {
	if v == "" {
		return fallback
	}
	return v
}
74+
75+
func attr(el *etree.Element, name string) string {
76+
return el.SelectAttrValue(name, "")
77+
}
78+
79+
// contentTypeTemplates maps a type URI to a path template in which the
// "{{id}}" placeholder stands for the identifier.
var contentTypeTemplates = map[string]string{
	// Content types use the /content path.
	"http://www.ft.com/ontology/content/Article":             "/content/{{id}}",
	"http://www.ft.com/ontology/content/Audio":               "/content/{{id}}",
	"http://www.ft.com/ontology/content/ClipSet":             "/content/{{id}}",
	"http://www.ft.com/ontology/content/Content":             "/content/{{id}}",
	"http://www.ft.com/ontology/content/ContentPackage":      "/content/{{id}}",
	"http://www.ft.com/ontology/content/CustomCodeComponent": "/content/{{id}}",
	"http://www.ft.com/ontology/content/DynamicContent":      "/content/{{id}}",
	"http://www.ft.com/ontology/content/Graphic":             "/content/{{id}}",
	"http://www.ft.com/ontology/content/Image":               "/content/{{id}}",
	"http://www.ft.com/ontology/content/ImageSet":            "/content/{{id}}",
	"http://www.ft.com/ontology/content/MediaResource":       "/content/{{id}}",
	"http://www.ft.com/ontology/content/Video":               "/content/{{id}}",

	// Company types use the /organisations path.
	"http://www.ft.com/ontology/company/Organisation":  "/organisations/{{id}}",
	"http://www.ft.com/ontology/company/PublicCompany": "/organisations/{{id}}",
}
95+
96+
func generateUrl(t, id string) string {
97+
const host = "http://api.ft.com"
98+
template, ok := contentTypeTemplates[t]
99+
if !ok {
100+
return ""
101+
}
102+
path := strings.Replace(template, "{{id}}", id, 1)
103+
return host + path
104+
}

0 commit comments

Comments
 (0)