44 "encoding/json"
55 "errors"
66 "fmt"
7+ "html"
78 "strings"
89
910 contenttree "github.com/Financial-Times/content-tree"
@@ -62,7 +63,7 @@ func transformNode(n contenttree.Node) (string, error) {
6263 return fmt .Sprintf ("<body>%s</body>" , innerXML ), nil
6364
6465 case * contenttree.Text :
65- return node .Value , nil
66+ return html . EscapeString ( node .Value ) , nil
6667
6768 case * contenttree.Break :
6869 return "<br>" , nil
@@ -84,9 +85,11 @@ func transformNode(n contenttree.Node) (string, error) {
8485 if node .Level == "label" {
8586 tag = "h4"
8687 }
88+
8789 if tag == "" {
8890 return "" , fmt .Errorf ("failed to transform heading with level %s" , node .Level )
8991 }
92+
9093 return fmt .Sprintf ("<%[1]s>%s</%[1]s>" , tag , innerXML ), nil
9194
9295 case * contenttree.Strong :
@@ -98,27 +101,19 @@ func transformNode(n contenttree.Node) (string, error) {
98101 case * contenttree.Strikethrough :
99102 return fmt .Sprintf ("<s>%s</s>" , innerXML ), nil
100103
101- // TODO: This implementation is a placeholder. There are different types of links which need to be transformed to
102- // different XHTML tags. For example, there are links that need to be transformed into "<ft-content>" or
103- // "<ft-related>" tags, there are anchors links that shouldn't be transformed at all, and there are regular links
104- // that should be transformed into <a> tags.
105- // This implementation is a placeholder which handles only a link to an FT article.
106- // It seems that the content tree link object at the moment does not provide enough information to distinguish
107- // between different types of links.
108- // Example(https://www.ft.com/content/069e537a-ffc2-11e7-9650-9c0ad2d7c5b5):
109- // <ft-content type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/674697de-fbb5-11e7-9b32-d7d59aace167\" title=\"Apple pledges to invest $30bn and pay $38bn tax bill\">plans to spend $350bn</ft-content>
110104 case * contenttree.Link :
111- parts := strings .Split (node .URL , "/" )
112105 if node .Title != "" {
113- return fmt .Sprintf ("<ft-content type =\" http://www.ft.com/ontology/content/Article \" url= \" http://api.ft.com/content/ %s\" title=\" %s\" >%s</ft-content >" , parts [ len ( parts ) - 1 ] , node .Title , innerXML ), nil
106+ return fmt .Sprintf ("<a href =\" %s\" title=\" %s\" >%s</a >" , node . URL , node .Title , innerXML ), nil
114107 }
115- return fmt .Sprintf ("<ft-content type=\" http://www.ft.com/ontology/content/Article\" url=\" http://api.ft.com/content/%s\" >%s</ftcontent>" , parts [len (parts )- 1 ], innerXML ), nil
108+
109+ return fmt .Sprintf ("<a href=\" %s\" >%s</a>" , node .URL , innerXML ), nil
116110
117111 case * contenttree.List :
118112 tag := "ul"
119113 if node .Ordered {
120114 tag = "ol"
121115 }
116+
122117 return fmt .Sprintf ("<%[1]s>%s</%[1]s>" , tag , innerXML ), nil
123118
124119 case * contenttree.ListItem :
@@ -127,29 +122,14 @@ func transformNode(n contenttree.Node) (string, error) {
127122 case * contenttree.Blockquote :
128123 return fmt .Sprintf ("<blockquote>%s</blockquote>" , innerXML ), nil
129124
130- // TODO: The <pull-quote> tag is not a standard HTML tag, it is a custom tag used by the FT. It is worth
131- // reconsidering whether external consumers should receive this tag or whether it should be transformed into standard HTML.
132- // TODO: The pull-quote node doesn't support children called <pull-quote-image> in the HTML representation.
133- // There is old content with <pull-quote-image> tags.
134- // Example(https://www.ft.com/content/e76980da-3585-11e7-99bd-13beb0903fa3):
135- // <pull-quote>
136- // <pull-quote-text><p>Norwegian has been very lucky that as they’ve grown, the fuel price has halved. I think without that they wouldn’t be around</p></pull-quote-text>
137- // <pull-quote-image><ft-content type=\"http://www.ft.com/ontology/content/ImageSet\" url=\"http://api.ft.com/content/1888fefa-3718-11e7-07db-84246ae494ea\" data-embedded=\"true\"></ft-content></pull-quote-image>
138- // <pull-quote-source>Oliver Sleath, analyst at Barclays</pull-quote-source>
139- // </pull-quote>
140125 case * contenttree.Pullquote :
141126 return fmt .Sprintf ("<pull-quote><pull-quote-text><p>%s</p></pull-quote-text><pull-quote-source>%s</pull-quote-source></pull-quote>" , node .Text , node .Source ), nil
142127
143128 case * contenttree.ImageSet :
144129 return fmt .Sprintf ("<ft-content type=\" http://www.ft.com/ontology/content/ImageSet\" url=\" http://api.ft.com/content/%s\" data-embedded=\" true\" ></ft-content>" , node .ID ), nil
145130
146- // TODO: The flourish tags are defined with alt attribute which seems not to be defined in the content tree.
147- // TODO: There is a significant oversight in the content API related to flourish elements. When the content policy
148- // "INCLUDE_RICH_CONTENT" is not applied, all <ft-content> tags are removed except for the flourish ones.
149- // Example(https://www.ft.com/content/67b7e066-7bb4-4ae2-9557-77b9052279aa):
150- // <ft-content type=\"http://www.ft.com/ontology/content/Content\" url=\"http://api.ft.com/content/20543674\" alt=\"\" data-asset-type=\"flourish\" data-embedded=\"true\" data-flourish-type=\"visualisation\" data-layout-width=\"\" data-time-stamp=\"\" id=\"20543674\"></ft-content>
151131 case * contenttree.Flourish :
152- return fmt .Sprintf ("<ft-content type=\" http://www.ft.com/ontology/content/Content\" url=\" http://api.ft.com/content/%[1]s\" alt=\" \" data-asset-type=\" flourish\" data-embedded=\" true\" data-flourish-type=\" %s\" data-layout-width=\" %s\" data-time-stamp=\" %s\" id=\" %[1]s\" ></ft-content>" , node .Id , node .FlourishType , node .LayoutWidth , node .Timestamp ), nil
132+ return fmt .Sprintf ("<ft-content type=\" http://www.ft.com/ontology/content/Content\" url=\" http://api.ft.com/content/%[1]s\" alt=\" %s \" data-asset-type=\" flourish\" data-embedded=\" true\" data-flourish-type=\" %s\" data-layout-width=\" %s\" data-time-stamp=\" %s\" id=\" %[1]s\" ></ft-content>" , node .Id , node . Description , node .FlourishType , node .LayoutWidth , node .Timestamp ), nil
153133
154134 case * contenttree.TableCaption :
155135 return fmt .Sprintf ("<caption>%s</caption>" , innerXML ), nil
@@ -166,7 +146,8 @@ func transformNode(n contenttree.Node) (string, error) {
166146 case * contenttree.TableFooter :
167147 return fmt .Sprintf ("<tfoot>%s</tfoot>" , innerXML ), nil
168148
169- // TODO: The tables have multiple attributes such as
149+ // TODO: Additional work on table tags will be required as per the resolution of https://github.com/Financial-Times/content-tree/issues/71
150+ // The tables have multiple attributes such as
170151 // class=\"data-table\"
171152 // data-table-collapse-rownum=\"\"
172153 // data-table-layout-largescreen=\"auto\"
@@ -176,24 +157,14 @@ func transformNode(n contenttree.Node) (string, error) {
176157 case * contenttree.Table :
177158 return fmt .Sprintf ("<table>%s</table>" , innerXML ), nil
178159
179- // Example(https://www.ft.com/content/9c0516cf-dd12-4665-aa22-712de854fe2f):
180- // <ft-content type=\"http://www.ft.com/ontology/content/Video\" url=\"http://api.ft.com/content/1c199563-e2cd-4817-990f-79972f3828fb\" data-embedded=\"true\"></ft-content>
181160 case * contenttree.Video :
182- return fmt .Sprintf ("<ft-content type=\" http://www.ft.com/ontology/content/Video\" url=\" http://api.ft.com/content/%s\" data-embedded=\" %t \" ></ft-content>" , node .ID , node . Embedded ), nil
161+ return fmt .Sprintf ("<ft-content type=\" http://www.ft.com/ontology/content/Video\" url=\" http://api.ft.com/content/%s\" data-embedded=\" true \" ></ft-content>" , node .ID ), nil
183162
184- // TODO: The XHTML representation is more generic, applicable to any video source. Nothing specifies that the video
185- // source is YouTube.
186- // Example(https://www.ft.com/content/4d9396e4-cb4b-4937-baa3-97fce6f5cb94):
187- // <a data-asset-type=\"video\" data-embedded=\"true\" href=\"https://www.youtube.com/watch?v=Y_uIs_Z9z4w\"></a>
188163 case * contenttree.YoutubeVideo :
189164 return fmt .Sprintf ("<a data-asset-type=\" video\" data-embedded=\" true\" href=\" %s\" ></a>" , node .URL ), nil
190165
191- // TODO: The tweets were represented as anchor tags which require href url to the tweet.
192- // The current content tree definition does not include the url attribute.
193- // Example (https://www.ft.com/content/b2899d25-9b16-461d-b406-89cfcadf3afc):
194- // <a data-asset-type=\"tweet\" data-embedded=\"true\" href=\"https://x.com/sama/status/1882106524090482701\">https://x.com/sama/status/1882106524090482701</a>
195166 case * contenttree.Tweet :
196- return fmt .Sprintf ("<a data-asset-type=\" tweet\" data-embedded=\" true\" href=\" %[1]s\" >%[1]s</a>" , "unknown url from the tweet" ), nil
167+ return fmt .Sprintf ("<a data-asset-type=\" tweet\" data-embedded=\" true\" href=\" %[1]s\" >%[1]s</a>" , node . ID ), nil
197168
198169 // Example from the Native Store to keep the translucent namespace (https://www.ft.com/content/9675cf79-f16d-4132-ab73-8bafa22ee4fc):
199170 // <tr:scrollable-block theme=\"1\">
@@ -231,38 +202,19 @@ func transformNode(n contenttree.Node) (string, error) {
231202 // translucent:scrollable-text. It is not clear how this behaviour is to be replicated.
232203
233204 // content tree nodes that were published inside experimental tag and as such are not supported in the "external"
234- // body XML format
205+ // body XML format for now
235206 case * contenttree.Layout :
236207 return "" , nil
237208 case * contenttree.LayoutSlot :
238209 return "" , nil
239210 case * contenttree.LayoutImage :
240211 return "" , nil
241212
242- // Example(https://www.ft.com/content/bb94946c-1c76-11e8-aaca-4574d7dabfb6):
243- // <recommended>
244- // <recommended-title>Recommended</recommended-title>
245- // <ul>
246- // <li><ft-content type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/c46e915c-1bde-11e8-956a-43db76e69936\">Brussels primes political ‘grenades’ in first draft of Brexit treaty</ft-content></li>
247- // <li><ft-content type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/63423938-1bde-11e8-aaca-4574d7dabfb6\">Michel Barnier expresses frustration with David Davis over Brexit talks</ft-content></li>
248- // <li><ft-content type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/515a2b2c-1c60-11e8-aaca-4574d7dabfb6\">Irish border ‘being used to keep UK in EU’ says Johnson</ft-content></li>
249- // </ul>
250- // </recommended>
251- // TODO: It seems that <recommended> tags are always published as opaque tags.
252213 case * contenttree.Recommended :
253214 return "" , nil
254215
255- // Example(https://www.ft.com/content/0107f2fa-f75c-11e6-9516-2d969e0d3b65)
256- // <big-number>
257- // <big-number-headline>
258- // <p>34%</p>
259- // </big-number-headline>
260- // <big-number-intro>
261- // <p>Gender pay gap in financial services, the largest in any sector according to Pwc</p>
262- // </big-number-intro>
263- // </big-number>
264216 case * contenttree.BigNumber :
265- return fmt .Sprintf ("<big-number><big-number-headline><p> %s</p></ big-number-headline><big-number-intro><p>%s</p> </big-number-intro></big-number>" , node .Number , node .Description ), nil
217+ return fmt .Sprintf ("<big-number><big-number-headline>%s</big-number-headline><big-number-intro>%s </big-number-intro></big-number>" , node .Number , node .Description ), nil
266218
267219 // CCC nodes won't be available in the "external" body XML format.
268220 case * contenttree.CustomCodeComponent :
@@ -291,50 +243,3 @@ func transformNode(n contenttree.Node) (string, error) {
291243
292244 return "" , nil
293245}
294-
295- // TODO: The namespaces "opaque" and "translucent" are not part of the content tree definition.
296- // However they are important for the transformation to the "external" XHTML. By definition each opaque tag
297- // is stripped along with all its children from the XHTML. The translucent tags are stripped but their children
298- // are kept in the XHTML.
299- // Example of opaque tag(https://www.ft.com/content/6a858bff-476a-44f7-91af-da636f0d6b93):
300- // <opaque:recommended>
301- // <recommended-title>Recommended</recommended-title>
302- // <ul>
303- // <li><content type=\"http://www.ft.com/ontology/content/Content\" id=\"52946dd2-7316-420b-aa0b-2ac13ea5ea68\">Syria caught up in Lebanon fallout</content></li>
304- // </ul>
305- // </opaque:recommended>
306- // Example of translucent is the scrollable block example above.
307- // The recent implementation of anchor tags relies on the "translucent" namespace as well.
308-
309- // TODO: The content tree definition lacks "concept" nodes. However, there are many content pieces published
310- // with <concept>/<ft-concept> tags in the past.
311- // Example(https://www.ft.com/content/a2868e64-4e37-11e4-bfda-00144feab7de):
312- // <ft-concept type=\"http://www.ft.com/ontology/company/PublicCompany\" url=\"http://api.ft.com/organisations/897610dc-fc82-3257-a4f7-c26abfca3bb6\">edX</ft-concept>
313-
314- // TODO: The content tree definition lacks "related" nodes. There are very few old content pieces (2) published
315- // with <related>/<ft-related> tags.
316- // Example(https://www.ft.com/content/fa6de70c-e9b8-11e6-893c-082c54a7f539 and https://www.ft.com/content/8885c026-8a1b-11e6-8cb7-e7ada1d123b1):
317- // <ft-related type=\"http://www.ft.com/ontology/content/Article\" url=\"http://api.ft.com/content/cc5795be-bb25-11e6-8b45-b8b81dd5d080\">
318- // <title>Analysis</title>
319- // <headline>Venezuela struggles to tame triple-digit inflation\n</headline>
320- // <media><ft-content type=\"http://www.ft.com/ontology/content/ImageSet\" url=\"http://api.ft.com/content/aee2c57e-dc0b-11e6-18ca-65c901033a8f\" data-embedded=\"true\"></ft-content></media>
321- // <intro><p>Shopkeepers resort to weighing banknotes in echo of Weimar Germany</p></intro>
322- // </ft-related>
323-
324- // TODO: There is a tag <promo-box> which doesn't seem to be used anymore but there is still old content using it.
325- // Example(https://www.ft.com/content/50c06966-277c-11e3-ae16-00144feab7de):
326- // <promo-box>
327- // <promo-title><p>In depth</p></promo-title>
328- // <promo-headline><p><a href=\"http://www.ft.com/indepth/libor-scandal\" title=\"Libor scandal in depth - FT.com\">Libor scandal</a>\n</p></promo-headline>
329- // <promo-image><ft-content type=\"http://www.ft.com/ontology/content/ImageSet\" url=\"http://api.ft.com/content/ecec1d70-6b95-11e1-3243-978e959e1fd3\" data-embedded=\"true\"></ft-content></promo-image>
330- // <promo-intro><p>Regulators across the globe probe alleged manipulation by US and European banks of the London interbank offered rate and other key benchmark lending rates</p></promo-intro>
331- // </promo-box>
332-
333- // TODO: We introduced a new tag for CCC - <fallback> which is not part of the content tree definition.
334-
335- // TODO: There are many node types which have property "Data". It is not clear how it is utilised and whether it
336- // should be taken in consideration when transforming to the "external" XHTML version of the content.
337- // Such nodes are Blockquote, Link, Table, Video, YoutubeVideo and more.
338-
339- // TODO: The bodyXML of many articles contains "\n" which is not represented in the content tree. The transformer won't
340- // be able to produce exactly the same bodyXML as the one published with "\n" in it.
0 commit comments