-
Notifications
You must be signed in to change notification settings - Fork 0
transform bodyXML to external content-tree in Go #110
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,7 @@ | ||
| module github.com/Financial-Times/content-tree | ||
|
|
||
| go 1.23.5 | ||
| go 1.25 | ||
|
|
||
| require github.com/beevik/etree v1.6.0 | ||
|
|
||
| require github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| github.com/beevik/etree v1.6.0 h1:u8Kwy8pp9D9XeITj2Z0XtA5qqZEmtJtuXZRQi+j03eE= | ||
| github.com/beevik/etree v1.6.0/go.mod h1:bh4zJxiIr62SOf9pRzN7UUYaEDa9HEKafK25+sLc0Gc= | ||
| github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1 h1:dOYG7LS/WK00RWZc8XGgcUTlTxpp3mKhdR2Q9z9HbXM= | ||
| github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1/go.mod h1:mpRZBD8SJ55OIICQ3iWH0Yz3cjzA61JdqMLoWXeB2+8= |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,31 @@ | ||
| # XML to Content Tree Transformer | ||
|
|
||
| ## Overview | ||
| The Transformer converts an external XHTML-formatted document into a content tree. | ||
| It supports the format stored in the **internalComponents** collection as well as the one returned by the **Internal Content API**. | ||
| The latter is produced by the content-public-read service after applying certain transformations to the bodyXML it retrieves from the internalComponents collection. | ||
| These transformations include renaming the content, related, and concept tags to ft-content, ft-related, and ft-concept, respectively, and replacing the id attribute with url, with a few caveats. | ||
|
|
||
| ## Usage | ||
|
|
||
| ```go | ||
| package main | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "log" | ||
|
|
||
| tocontenttree "github.com/Financial-Times/content-tree" | ||
| ) | ||
|
|
||
| func main() { | ||
| xmlInput := `<body><p>Hello World</p></body>` | ||
|
|
||
| out, err := tocontenttree.Transform(xmlInput) | ||
| if err != nil { | ||
| log.Fatalf("Transform (XmlToTree) failed: %v", err) | ||
| } | ||
|
|
||
| fmt.Printf("Transformed content tree: %+v\n", out) | ||
| } | ||
| ``` | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| package tocontenttree | ||
|
|
||
| import ( | ||
| "strings" | ||
|
|
||
| "github.com/beevik/etree" | ||
| ) | ||
|
|
||
// layoutwidth is a string-backed name for the width a piece of content is
// laid out at.
type layoutwidth string

// toValidLayoutWidth converts w to a layoutwidth.
//
// The accepted values are "auto", "in-line", "inset-left", "inset-right",
// "full-bleed", "full-grid", "mid-grid" and "full-width"; they are returned
// unchanged. Any other input, including the empty string, yields "full-width".
// Matching is exact and case-sensitive.
func toValidLayoutWidth(w string) layoutwidth {
	const fallback layoutwidth = "full-width"

	switch candidate := layoutwidth(w); candidate {
	case "auto", "in-line", "inset-left", "inset-right",
		"full-bleed", "full-grid", "mid-grid", fallback:
		return candidate
	}
	return fallback
}
|
|
||
| func toValidClipLayoutWidth(w string) layoutwidth { | ||
| switch w { | ||
| case "in-line", "full-grid", "mid-grid": | ||
| return layoutwidth(w) | ||
| default: | ||
| return "in-line" | ||
| } | ||
|
|
||
| } | ||
|
|
||
| func findChild(el *etree.Element, tag string) *etree.Element { | ||
| for _, ch := range el.ChildElements() { | ||
| if ch.Tag == tag { | ||
| return ch | ||
| } | ||
| if found := findChild(ch, tag); found != nil { | ||
| return found | ||
| } | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func textContent(el *etree.Element) string { | ||
| var b strings.Builder | ||
| for _, tok := range el.Child { | ||
| switch t := tok.(type) { | ||
| case *etree.CharData: | ||
| b.WriteString(t.Data) | ||
| case *etree.Element: | ||
| b.WriteString(textContent(t)) | ||
| } | ||
| } | ||
| return b.String() | ||
| } | ||
|
|
||
| func flattenedChildren(el *etree.Element) []etree.Token { | ||
| out := make([]etree.Token, 0, len(el.Child)) | ||
| for _, tok := range el.Child { | ||
| if d, ok := tok.(*etree.Element); ok && d.Tag == "div" { | ||
| out = append(out, d.Child...) | ||
| } else { | ||
| out = append(out, tok) | ||
| } | ||
| } | ||
| return out | ||
| } | ||
|
|
||
// valueOr returns v, or fallback when v is the empty string.
func valueOr(v, fallback string) string {
	if v == "" {
		return fallback
	}
	return v
}
|
|
||
| func attr(el *etree.Element, name string) string { | ||
| return el.SelectAttrValue(name, "") | ||
| } | ||
|
|
||
| var contentTypeTemplates = map[string]string{ | ||
| "http://www.ft.com/ontology/content/Article": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/content/ImageSet": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/content/ClipSet": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/content/CustomCodeComponent": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/content/MediaResource": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/content/Video": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/company/PublicCompany": "/organisations/{{id}}", | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. suggestion: I am pretty sure that some old content pieces will have: There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I mimicked the transformation we perform in content-public-read. I don't see this URL there in the configs. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we map There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think my previous comment is not very useful. I saw that |
||
| "http://www.ft.com/ontology/content/ContentPackage": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/content/Content": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/content/Image": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/content/DynamicContent": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/content/Graphic": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/content/Audio": "/content/{{id}}", | ||
| "http://www.ft.com/ontology/company/Organisation": "/organisations/{{id}}", | ||
| } | ||
|
|
||
| func generateUrl(t, id string) string { | ||
| const host = "http://api.ft.com" | ||
| template, ok := contentTypeTemplates[t] | ||
| if !ok { | ||
| return "" | ||
| } | ||
| path := strings.Replace(template, "{{id}}", id, 1) | ||
| return host + path | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nitpick:
`internalcomponents` collection. nitpick: Maybe we can expand the description slightly to mention what the difference is between the two representations supported by the transformer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have added a bit more information on this.