Skip to content

Commit 86445db

Browse files
authored
Merge pull request #92 from Financial-Times/feature/improved-to-external-bodyxml-transformer
Feature/improved to external bodyxml transformer
2 parents 63b964d + a201e80 commit 86445db

File tree

7 files changed

+200
-124
lines changed

7 files changed

+200
-124
lines changed

content_tree.go

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1752,10 +1752,9 @@ func (n *Tweet) GetChildren() []Node {
17521752
}
17531753

17541754
type Video struct {
1755-
Type string `json:"type"`
1756-
Data interface{} `json:"data,omitempty"`
1757-
Embedded bool `json:"embedded,omitempty"`
1758-
ID string `json:"id,omitempty"`
1755+
Type string `json:"type"`
1756+
Data interface{} `json:"data,omitempty"`
1757+
ID string `json:"id,omitempty"`
17591758
}
17601759

17611760
func (n *Video) GetType() string {

libraries/to-external-bodyxml/go/transform.go

Lines changed: 15 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"encoding/json"
55
"errors"
66
"fmt"
7+
"html"
78
"strings"
89

910
contenttree "github.com/Financial-Times/content-tree"
@@ -62,7 +63,7 @@ func transformNode(n contenttree.Node) (string, error) {
6263
return fmt.Sprintf("<body>%s</body>", innerXML), nil
6364

6465
case *contenttree.Text:
65-
return node.Value, nil
66+
return html.EscapeString(node.Value), nil
6667

6768
case *contenttree.Break:
6869
return "<br>", nil
@@ -84,9 +85,11 @@ func transformNode(n contenttree.Node) (string, error) {
8485
if node.Level == "label" {
8586
tag = "h4"
8687
}
88+
8789
if tag == "" {
8890
return "", fmt.Errorf("failed to transform heading with level %s", node.Level)
8991
}
92+
9093
return fmt.Sprintf("<%[1]s>%s</%[1]s>", tag, innerXML), nil
9194

9295
case *contenttree.Strong:
@@ -98,27 +101,19 @@ func transformNode(n contenttree.Node) (string, error) {
98101
case *contenttree.Strikethrough:
99102
return fmt.Sprintf("<s>%s</s>", innerXML), nil
100103

101-
// TODO: This implementation is a placeholder. There are different types of links which need to be transformed to
102-
// different XHTML tags. For example, there are links that need to be transformed into "<ft-content>" or
103-
// "<ft-related>" tags, there are anchors links that shouldn't be transformed at all, and there are regular links
104-
// that should be transformed into <a> tags.
105-
// This implementation is a placeholder which handles only a link to an FT article.
106-
// It seems that the content tree link object at the moment does not provide enough information to distinguish
107-
// between different types of links.
108-
// Example(https://www.ft.com/content/069e537a-ffc2-11e7-9650-9c0ad2d7c5b5):
109-
// <ft-content type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/674697de-fbb5-11e7-9b32-d7d59aace167\" title=\"Apple pledges to invest $30bn and pay $38bn tax bill\">plans to spend $350bn</ft-content>
110104
case *contenttree.Link:
111-
parts := strings.Split(node.URL, "/")
112105
if node.Title != "" {
113-
return fmt.Sprintf("<ft-content type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/%s\" title=\"%s\">%s</ft-content>", parts[len(parts)-1], node.Title, innerXML), nil
106+
return fmt.Sprintf("<a href=\"%s\" title=\"%s\">%s</a>", node.URL, node.Title, innerXML), nil
114107
}
115-
return fmt.Sprintf("<ft-content type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/%s\">%s</ftcontent>", parts[len(parts)-1], innerXML), nil
108+
109+
return fmt.Sprintf("<a href=\"%s\">%s</a>", node.URL, innerXML), nil
116110

117111
case *contenttree.List:
118112
tag := "ul"
119113
if node.Ordered {
120114
tag = "ol"
121115
}
116+
122117
return fmt.Sprintf("<%[1]s>%s</%[1]s>", tag, innerXML), nil
123118

124119
case *contenttree.ListItem:
@@ -127,29 +122,14 @@ func transformNode(n contenttree.Node) (string, error) {
127122
case *contenttree.Blockquote:
128123
return fmt.Sprintf("<blockquote>%s</blockquote>", innerXML), nil
129124

130-
// TODO: The <pull-quote> tag is not a standard HTML tag, it is a custom tag used by the FT. It is worth
131-
// reconsidering whether external consumers should receive this tag or whether it should be transformed into standard HTML.
132-
// TODO: The pull-quote node doesn't support children called <pull-quote-image> in the HTML representation.
133-
// There is old content with <pull-quote-image> tags.
134-
// Example(https://www.ft.com/content/e76980da-3585-11e7-99bd-13beb0903fa3):
135-
// <pull-quote>
136-
// <pull-quote-text><p>Norwegian has been very lucky that as they’ve grown, the fuel price has halved. I think without that they wouldn’t be around</p></pull-quote-text>
137-
// <pull-quote-image><ft-content type=\"http://www.ft.com/ontology/content/ImageSet\" url=\"http://api.ft.com/content/1888fefa-3718-11e7-07db-84246ae494ea\" data-embedded=\"true\"></ft-content></pull-quote-image>
138-
// <pull-quote-source>Oliver Sleath, analyst at Barclays</pull-quote-source>
139-
// </pull-quote>
140125
case *contenttree.Pullquote:
141126
return fmt.Sprintf("<pull-quote><pull-quote-text><p>%s</p></pull-quote-text><pull-quote-source>%s</pull-quote-source></pull-quote>", node.Text, node.Source), nil
142127

143128
case *contenttree.ImageSet:
144129
return fmt.Sprintf("<ft-content type=\"http://www.ft.com/ontology/content/ImageSet\" url=\"http://api.ft.com/content/%s\" data-embedded=\"true\"></ft-content>", node.ID), nil
145130

146-
// TODO: The flourish tags are defined with alt attribute which seems not to be defined in the content tree.
147-
// TODO: There is a significant oversight in the content API related to flourish elements. When the content policy
148-
// "INCLUDE_RICH_CONTENT" is not applied, all <ft-content> tags are removed except for the flourish ones.
149-
// Example(https://www.ft.com/content/67b7e066-7bb4-4ae2-9557-77b9052279aa):
150-
// <ft-content type=\"http://www.ft.com/ontology/content/Content\" url=\"http://api.ft.com/content/20543674\" alt=\"\" data-asset-type=\"flourish\" data-embedded=\"true\" data-flourish-type=\"visualisation\" data-layout-width=\"\" data-time-stamp=\"\" id=\"20543674\"></ft-content>
151131
case *contenttree.Flourish:
152-
return fmt.Sprintf("<ft-content type=\"http://www.ft.com/ontology/content/Content\" url=\"http://api.ft.com/content/%[1]s\" alt=\"\" data-asset-type=\"flourish\" data-embedded=\"true\" data-flourish-type=\"%s\" data-layout-width=\"%s\" data-time-stamp=\"%s\" id=\"%[1]s\"></ft-content>", node.Id, node.FlourishType, node.LayoutWidth, node.Timestamp), nil
132+
return fmt.Sprintf("<ft-content type=\"http://www.ft.com/ontology/content/Content\" url=\"http://api.ft.com/content/%[1]s\" alt=\"%s\" data-asset-type=\"flourish\" data-embedded=\"true\" data-flourish-type=\"%s\" data-layout-width=\"%s\" data-time-stamp=\"%s\" id=\"%[1]s\"></ft-content>", node.Id, node.Description, node.FlourishType, node.LayoutWidth, node.Timestamp), nil
153133

154134
case *contenttree.TableCaption:
155135
return fmt.Sprintf("<caption>%s</caption>", innerXML), nil
@@ -166,7 +146,8 @@ func transformNode(n contenttree.Node) (string, error) {
166146
case *contenttree.TableFooter:
167147
return fmt.Sprintf("<tfoot>%s</tfoot>", innerXML), nil
168148

169-
// TODO: The tables have multiple attributes such as
149+
// TODO: Additional work on table tags will be required as per the resolution of https://github.com/Financial-Times/content-tree/issues/71
150+
// The tables have multiple attributes such as
170151
// class=\"data-table\"
171152
// data-table-collapse-rownum=\"\"
172153
// data-table-layout-largescreen=\"auto\"
@@ -176,24 +157,14 @@ func transformNode(n contenttree.Node) (string, error) {
176157
case *contenttree.Table:
177158
return fmt.Sprintf("<table>%s</table>", innerXML), nil
178159

179-
// Example(https://www.ft.com/content/9c0516cf-dd12-4665-aa22-712de854fe2f):
180-
// <ft-content type=\"http://www.ft.com/ontology/content/Video\" url=\"http://api.ft.com/content/1c199563-e2cd-4817-990f-79972f3828fb\" data-embedded=\"true\"></ft-content>
181160
case *contenttree.Video:
182-
return fmt.Sprintf("<ft-content type=\"http://www.ft.com/ontology/content/Video\" url=\"http://api.ft.com/content/%s\" data-embedded=\"%t\"></ft-content>", node.ID, node.Embedded), nil
161+
return fmt.Sprintf("<ft-content type=\"http://www.ft.com/ontology/content/Video\" url=\"http://api.ft.com/content/%s\" data-embedded=\"true\"></ft-content>", node.ID), nil
183162

184-
// TODO: The XHTML representation is more generic, applicable to any video source. Nothing specifies that the video
185-
// source is YouTube.
186-
// Example(https://www.ft.com/content/4d9396e4-cb4b-4937-baa3-97fce6f5cb94):
187-
// <a data-asset-type=\"video\" data-embedded=\"true\" href=\"https://www.youtube.com/watch?v=Y_uIs_Z9z4w\"></a>
188163
case *contenttree.YoutubeVideo:
189164
return fmt.Sprintf("<a data-asset-type=\"video\" data-embedded=\"true\" href=\"%s\"></a>", node.URL), nil
190165

191-
// TODO: The tweets were represented as anchor tags which require href url to the tweet.
192-
// The current content tree definition does not include the url attribute.
193-
// Example (https://www.ft.com/content/b2899d25-9b16-461d-b406-89cfcadf3afc):
194-
// <a data-asset-type=\"tweet\" data-embedded=\"true\" href=\"https://x.com/sama/status/1882106524090482701\">https://x.com/sama/status/1882106524090482701</a>
195166
case *contenttree.Tweet:
196-
return fmt.Sprintf("<a data-asset-type=\"tweet\" data-embedded=\"true\" href=\"%[1]s\">%[1]s</a>", "unknown url from the tweet"), nil
167+
return fmt.Sprintf("<a data-asset-type=\"tweet\" data-embedded=\"true\" href=\"%[1]s\">%[1]s</a>", node.ID), nil
197168

198169
// Example from the Native Store to keep the translucent namespace (https://www.ft.com/content/9675cf79-f16d-4132-ab73-8bafa22ee4fc):
199170
// <tr:scrollable-block theme=\"1\">
@@ -231,38 +202,19 @@ func transformNode(n contenttree.Node) (string, error) {
231202
// translucent:scrollable-text. It is not clear how this behaviour is to be replicated.
232203

233204
// content tree nodes that were published inside experimental tag and as such are not supported in the "external"
234-
// body XML format
205+
// body XML format for now
235206
case *contenttree.Layout:
236207
return "", nil
237208
case *contenttree.LayoutSlot:
238209
return "", nil
239210
case *contenttree.LayoutImage:
240211
return "", nil
241212

242-
// Example(https://www.ft.com/content/bb94946c-1c76-11e8-aaca-4574d7dabfb6):
243-
// <recommended>
244-
// <recommended-title>Recommended</recommended-title>
245-
// <ul>
246-
// <li><ft-content type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/c46e915c-1bde-11e8-956a-43db76e69936\">Brussels primes political ‘grenades’ in first draft of Brexit treaty</ft-content></li>
247-
// <li><ft-content type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/63423938-1bde-11e8-aaca-4574d7dabfb6\">Michel Barnier expresses frustration with David Davis over Brexit talks</ft-content></li>
248-
// <li><ft-content type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/515a2b2c-1c60-11e8-aaca-4574d7dabfb6\">Irish border ‘being used to keep UK in EU’ says Johnson</ft-content></li>
249-
// </ul>
250-
// </recommended>
251-
// TODO: It seems that <recommended> tags are always published as opaque tags.
252213
case *contenttree.Recommended:
253214
return "", nil
254215

255-
// Example(https://www.ft.com/content/0107f2fa-f75c-11e6-9516-2d969e0d3b65)
256-
// <big-number>
257-
// <big-number-headline>
258-
// <p>34%</p>
259-
// </big-number-headline>
260-
// <big-number-intro>
261-
// <p>Gender pay gap in financial services, the largest in any sector according to Pwc</p>
262-
// </big-number-intro>
263-
// </big-number>
264216
case *contenttree.BigNumber:
265-
return fmt.Sprintf("<big-number><big-number-headline><p>%s</p></big-number-headline><big-number-intro><p>%s</p></big-number-intro></big-number>", node.Number, node.Description), nil
217+
return fmt.Sprintf("<big-number><big-number-headline>%s</big-number-headline><big-number-intro>%s</big-number-intro></big-number>", node.Number, node.Description), nil
266218

267219
// CCC nodes won't be available in the "external" body XML format.
268220
case *contenttree.CustomCodeComponent:
@@ -291,50 +243,3 @@ func transformNode(n contenttree.Node) (string, error) {
291243

292244
return "", nil
293245
}
294-
295-
// TODO: The namespaces "opaque" and "translucent" are not part of the content tree definition.
296-
// However they are important for the transformation to the "external" XHTML. By definition each opaque tag
297-
// is stripped along with all its children from the XHTML. The translucent tags are stripped but their children
298-
// are kept in the XHTML.
299-
// Example of opaque tag(https://www.ft.com/content/6a858bff-476a-44f7-91af-da636f0d6b93):
300-
// <opaque:recommended>
301-
// <recommended-title>Recommended</recommended-title>
302-
// <ul>
303-
// <li><content type=\"http://www.ft.com/ontology/content/Content\" id=\"52946dd2-7316-420b-aa0b-2ac13ea5ea68\">Syria caught up in Lebanon fallout</content></li>
304-
// </ul>
305-
// </opaque:recommended>
306-
// Example of translucent is the scrollable block example above.
307-
// The recent implementation of anchor tags relies on the "translucent" namespace as well.
308-
309-
// TODO: The content tree definition lacks "concept" nodes. However, there are many content pieces published
310-
// with <concept>/<ft-concept> tags in the past.
311-
// Example(https://www.ft.com/content/a2868e64-4e37-11e4-bfda-00144feab7de):
312-
// <ft-concept type=\"http://www.ft.com/ontology/company/PublicCompany\" url=\"http://api.ft.com/organisations/897610dc-fc82-3257-a4f7-c26abfca3bb6\">edX</ft-concept>
313-
314-
// TODO: The content tree definition lacks "related" nodes. There are very few old content pieces (2) published
315-
// with <related>/<ft-related> tags.
316-
// Example(https://www.ft.com/content/fa6de70c-e9b8-11e6-893c-082c54a7f539 and https://www.ft.com/content/8885c026-8a1b-11e6-8cb7-e7ada1d123b1):
317-
// <ft-related type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/cc5795be-bb25-11e6-8b45-b8b81dd5d080\">
318-
// <title>Analysis</title>
319-
// <headline>Venezuela struggles to tame triple-digit inflation\n</headline>
320-
// <media><ft-content type=\"http://www.ft.com/ontology/content/ImageSet\" url=\"http://api.ft.com/content/aee2c57e-dc0b-11e6-18ca-65c901033a8f\" data-embedded=\"true\"></ft-content></media>
321-
// <intro><p>Shopkeepers resort to weighing banknotes in echo of Weimar Germany</p></intro>
322-
// </ft-related>
323-
324-
// TODO: There is a tag <promo-box> which doesn't seem to be used anymore but there is still old content using it.
325-
// Example(https://www.ft.com/content/50c06966-277c-11e3-ae16-00144feab7de):
326-
// <promo-box>
327-
// <promo-title><p>In depth</p></promo-title>
328-
// <promo-headline><p><a href=\"http://www.ft.com/indepth/libor-scandal\" title=\"Libor scandal in depth - FT.com\">Libor scandal</a>\n</p></promo-headline>
329-
// <promo-image><ft-content type=\"http://www.ft.com/ontology/content/ImageSet\" url=\"http://api.ft.com/content/ecec1d70-6b95-11e1-3243-978e959e1fd3\" data-embedded=\"true\"></ft-content></promo-image>
330-
// <promo-intro><p>Regulators across the globe probe alleged manipulation by US and European banks of the London interbank offered rate and other key benchmark lending rates</p></promo-intro>
331-
// </promo-box>
332-
333-
// TODO: We introduced a new tag for CCC - <fallback> which is not part of the content tree definition.
334-
335-
// TODO: There are many node types which have property "Data". It is not clear how it is utilised and whether it
336-
// should be taken into consideration when transforming to the "external" XHTML version of the content.
337-
// Such nodes are Blockquote, Link, Table, Video, YoutubeVideo and more.
338-
339-
// TODO: The bodyXML of many articles contains "\n" which is not represented in the content tree. The transformer won't
340-
// be able to produce exactly the same bodyXML as the one published with "\n" in it.

libraries/to-external-bodyxml/go/transform_test.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,11 @@ func TestTransform(t *testing.T) {
2121
return
2222
}
2323

24-
if got != strings.TrimSpace(test.output) {
25-
t.Errorf("got: %s\n\n want: %s\n", got, test.output)
24+
want := strings.TrimSpace(test.output)
25+
got = strings.TrimSpace(got)
26+
27+
if got != want {
28+
t.Errorf("got: %s\n\n want: %s\n", got, want)
2629
}
2730
})
2831
}

0 commit comments

Comments
 (0)