From 66c99a00529d5e3cfe3fad5a260bf1c01437008c Mon Sep 17 00:00:00 2001 From: lokendersinghft Date: Thu, 25 Sep 2025 13:02:54 +0300 Subject: [PATCH] transform external bodyXML to content-tree in Go --- content_tree.go | 728 ++++++++++++++++-- go.mod | 6 +- go.sum | 4 + libraries/from-bodyxml/go/README.md | 31 + libraries/from-bodyxml/go/helpers.go | 104 +++ .../from-bodyxml/go/html_transformers.go | 366 +++++++++ libraries/from-bodyxml/go/transform.go | 134 ++++ libraries/from-bodyxml/go/transform_test.go | 95 +++ libraries/from-bodyxml/index.js | 12 +- 9 files changed, 1428 insertions(+), 52 deletions(-) create mode 100644 libraries/from-bodyxml/go/README.md create mode 100644 libraries/from-bodyxml/go/helpers.go create mode 100644 libraries/from-bodyxml/go/html_transformers.go create mode 100644 libraries/from-bodyxml/go/transform.go create mode 100644 libraries/from-bodyxml/go/transform_test.go diff --git a/content_tree.go b/content_tree.go index 08223d5..b832d3f 100644 --- a/content_tree.go +++ b/content_tree.go @@ -13,6 +13,11 @@ because the embedded structs contain a field with the same name, the field "Type According to the official Go documentation (https://pkg.go.dev/encoding/json), when multiple fields with the same name exist, during unmarshalling they are all ignored, and no error is returned. As a result, the objects are not unmarshalled correctly unless custom unmarshalling logic is applied. + +A custom MarshalJSON method is required for union wrapper structs (e.g. BodyBlock, Phrasing, BlockquoteChild, etc.) because they embed +multiple anonymous pointer fields that all export overlapping JSON field names like "type" and "data". The encoding/json package ignores conflicting +fields when marshalling, which results in empty "{}" objects. These MarshalJSON methods ensure only the active +(non-nil) embedded node is serialized. 
*/ package contenttree @@ -60,6 +65,7 @@ const ( TableFooterType = "table-footer" TableType = "table" CustomCodeComponentType = "custom-code-component" + ClipSetType = "clip-set" BodyBlockType = "body-block" BlockquoteChildType = "blockquote-child" @@ -72,6 +78,13 @@ const ( TableChildType = "table-child" ) +var ( + // returned when calling AppendChild on a node that doesn't own a Children slice + ErrCannotHaveChildren = errors.New("node cannot have children") + // returned when a child is not one of the allowed types for a parent + ErrInvalidChildType = errors.New("invalid child type for this parent") +) + // Node represents a unified interface for different types of content tree nodes. // It facilitates easy traversal of the tree structure without requiring type casting. type Node interface { @@ -82,6 +95,16 @@ type Node interface { // GetEmbedded returns the embedded node, if applicable. // It is useful for traversing node structs which embed other node structs. GetEmbedded() Node + // AppendChild attempts to append a child node, returning an error if not allowed. + AppendChild(child Node) error +} + +// typed() is a small utility to read a node's type without full unmarshal. +func typed(v any) string { + if n, ok := v.(Node); ok && n != nil { + return n.GetType() + } + return "" } // typedNode is a lightweight struct that holds only the type information of a content tree node. 
@@ -103,9 +126,9 @@ type ColumnSettingsItems struct { type BigNumber struct { Type string `json:"type"` - Data interface{} `json:"data,omitempty"` - Description string `json:"description,omitempty"` - Number string `json:"number,omitempty"` + Data interface{} `json:"data,omitempty"` + Description string `json:"description"` + Number string `json:"number"` } func (n *BigNumber) GetType() string { @@ -120,9 +143,11 @@ func (n *BigNumber) GetChildren() []Node { return nil } +func (n *BigNumber) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type Blockquote struct { Type string `json:"type"` - Children []*BlockquoteChild `json:"children,omitempty"` + Children []*BlockquoteChild `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -142,6 +167,15 @@ func (n *Blockquote) GetChildren() []Node { return result } +func (n *Blockquote) AppendChild(child Node) error { + c, err := makeBlockquoteChild(child) + if err != nil { + return err + } + n.Children = append(n.Children, c) + return nil +} + type BlockquoteChild struct { *Paragraph *Text @@ -206,6 +240,8 @@ func (n *BlockquoteChild) GetChildren() []Node { return nil } +func (n *BlockquoteChild) AppendChild(child Node) error { return ErrCannotHaveChildren } + func (n *BlockquoteChild) UnmarshalJSON(data []byte) error { var tn typedNode if err := json.Unmarshal(data, &tn); err != nil { @@ -261,9 +297,52 @@ func (n *BlockquoteChild) UnmarshalJSON(data []byte) error { return nil } +func (n *BlockquoteChild) MarshalJSON() ([]byte, error) { + switch { + case n.Paragraph != nil: + return json.Marshal(n.Paragraph) + case n.Text != nil: + return json.Marshal(n.Text) + case n.Break != nil: + return json.Marshal(n.Break) + case n.Strong != nil: + return json.Marshal(n.Strong) + case n.Emphasis != nil: + return json.Marshal(n.Emphasis) + case n.Strikethrough != nil: + return json.Marshal(n.Strikethrough) + case n.Link != nil: + return json.Marshal(n.Link) + default: + return []byte(`{}`), nil + } +} + +// Build 
a BlockquoteChild wrapper. +func makeBlockquoteChild(n Node) (*BlockquoteChild, error) { + switch n.GetType() { + case ParagraphType: + return &BlockquoteChild{Paragraph: n.(*Paragraph)}, nil + case TextType: + return &BlockquoteChild{Text: n.(*Text)}, nil + case BreakType: + return &BlockquoteChild{Break: n.(*Break)}, nil + case StrongType: + return &BlockquoteChild{Strong: n.(*Strong)}, nil + case EmphasisType: + return &BlockquoteChild{Emphasis: n.(*Emphasis)}, nil + case StrikethroughType: + return &BlockquoteChild{Strikethrough: n.(*Strikethrough)}, nil + case LinkType: + return &BlockquoteChild{Link: n.(*Link)}, nil + default: + return nil, ErrInvalidChildType + } +} + type Body struct { Type string `json:"type"` - Children []*BodyBlock `json:"children,omitempty"` + Children []*BodyBlock `json:"children"` Data interface{} `json:"data,omitempty"` Version float64 `json:"version,omitempty"` } @@ -284,6 +363,18 @@ func (n *Body) GetChildren() []Node { return result } +func (n *Body) AppendChild(child Node) error { + if n == nil { + return fmt.Errorf("nil Body: %w", ErrCannotHaveChildren) + } + bb, err := makeBodyBlock(child) + if err != nil { + return err + } + n.Children = append(n.Children, bb) + return nil +} + type BodyBlock struct { *Paragraph *Flourish @@ -303,6 +394,7 @@ type BodyBlock struct { *Video *YoutubeVideo *CustomCodeComponent + *ClipSet } func (n *BodyBlock) GetType() string { @@ -364,6 +456,9 @@ func (n *BodyBlock) GetEmbedded() Node { if n.CustomCodeComponent != nil { return n.CustomCodeComponent } + if n.ClipSet != nil { + return n.ClipSet + } return nil } @@ -422,9 +517,14 @@ func (n *BodyBlock) GetChildren() []Node { if n.CustomCodeComponent != nil { return n.CustomCodeComponent.GetChildren() } + if n.ClipSet != nil { + return n.ClipSet.GetChildren() + } return nil } +func (n *BodyBlock) AppendChild(_ Node) error { return ErrCannotHaveChildren } + func (n *BodyBlock) UnmarshalJSON(data []byte) error { var tn typedNode if err := 
json.Unmarshal(data, &tn); err != nil { @@ -540,12 +640,109 @@ func (n *BodyBlock) UnmarshalJSON(data []byte) error { return err } n.CustomCodeComponent = &v + case ClipSetType: + var v ClipSet + if err := json.Unmarshal(data, &v); err != nil { + return err + } + n.ClipSet = &v default: return fmt.Errorf("failed to unmarshal BodyBlock from %s: %w", data, ErrUnmarshalInvalidNode) } return nil } +func (n *BodyBlock) MarshalJSON() ([]byte, error) { + switch { + case n.Paragraph != nil: + return json.Marshal(n.Paragraph) + case n.Flourish != nil: + return json.Marshal(n.Flourish) + case n.Heading != nil: + return json.Marshal(n.Heading) + case n.ImageSet != nil: + return json.Marshal(n.ImageSet) + case n.BigNumber != nil: + return json.Marshal(n.BigNumber) + case n.Layout != nil: + return json.Marshal(n.Layout) + case n.List != nil: + return json.Marshal(n.List) + case n.Blockquote != nil: + return json.Marshal(n.Blockquote) + case n.Pullquote != nil: + return json.Marshal(n.Pullquote) + case n.ScrollyBlock != nil: + return json.Marshal(n.ScrollyBlock) + case n.ThematicBreak != nil: + return json.Marshal(n.ThematicBreak) + case n.Table != nil: + return json.Marshal(n.Table) + case n.Text != nil: + return json.Marshal(n.Text) + case n.Recommended != nil: + return json.Marshal(n.Recommended) + case n.Tweet != nil: + return json.Marshal(n.Tweet) + case n.Video != nil: + return json.Marshal(n.Video) + case n.YoutubeVideo != nil: + return json.Marshal(n.YoutubeVideo) + case n.CustomCodeComponent != nil: + return json.Marshal(n.CustomCodeComponent) + case n.ClipSet != nil: + return json.Marshal(n.ClipSet) + default: + return []byte(`{}`), nil + } +} + +// Build a BodyBlock wrapper from any allowed top-level block node. 
+func makeBodyBlock(n Node) (*BodyBlock, error) { + switch n.GetType() { + case ParagraphType: + return &BodyBlock{Paragraph: n.(*Paragraph)}, nil + case FlourishType: + return &BodyBlock{Flourish: n.(*Flourish)}, nil + case HeadingType: + return &BodyBlock{Heading: n.(*Heading)}, nil + case ImageSetType: + return &BodyBlock{ImageSet: n.(*ImageSet)}, nil + case BigNumberType: + return &BodyBlock{BigNumber: n.(*BigNumber)}, nil + case LayoutType: + return &BodyBlock{Layout: n.(*Layout)}, nil + case ListType: + return &BodyBlock{List: n.(*List)}, nil + case BlockquoteType: + return &BodyBlock{Blockquote: n.(*Blockquote)}, nil + case PullquoteType: + return &BodyBlock{Pullquote: n.(*Pullquote)}, nil + case ScrollyBlockType: + return &BodyBlock{ScrollyBlock: n.(*ScrollyBlock)}, nil + case ThematicBreakType: + return &BodyBlock{ThematicBreak: n.(*ThematicBreak)}, nil + case TableType: + return &BodyBlock{Table: n.(*Table)}, nil + case TextType: + return &BodyBlock{Text: n.(*Text)}, nil + case RecommendedType: + return &BodyBlock{Recommended: n.(*Recommended)}, nil + case TweetType: + return &BodyBlock{Tweet: n.(*Tweet)}, nil + case VideoType: + return &BodyBlock{Video: n.(*Video)}, nil + case YoutubeVideoType: + return &BodyBlock{YoutubeVideo: n.(*YoutubeVideo)}, nil + case CustomCodeComponentType: + return &BodyBlock{CustomCodeComponent: n.(*CustomCodeComponent)}, nil + case ClipSetType: + return &BodyBlock{ClipSet: n.(*ClipSet)}, nil + default: + return nil, ErrInvalidChildType + } +} + type Break struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` @@ -563,9 +760,11 @@ func (n *Break) GetChildren() []Node { return nil } +func (n *Break) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type Emphasis struct { Type string `json:"type"` - Children []*Phrasing `json:"children,omitempty"` + Children []*Phrasing `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -585,15 +784,24 @@ func (n *Emphasis) GetChildren() []Node { 
return result } +func (n *Emphasis) AppendChild(child Node) error { + p, err := makePhrasing(child) + if err != nil { + return err + } + n.Children = append(n.Children, p) + return nil +} + type Flourish struct { - Type string `json:"type"` + Type string `json:"type"` Data interface{} `json:"data,omitempty"` - Description string `json:"description,omitempty"` + Description string `json:"description"` FallbackImage *FlourishFallbackImage `json:"fallbackImage,omitempty"` FlourishType string `json:"flourishType,omitempty"` Id string `json:"id,omitempty"` - LayoutWidth string `json:"layoutWidth,omitempty"` - Timestamp string `json:"timestamp,omitempty"` + LayoutWidth string `json:"layoutWidth"` + Timestamp string `json:"timestamp"` FragmentIdentifier string `json:"fragmentIdentifier,omitempty"` } @@ -609,6 +817,8 @@ func (n *Flourish) GetChildren() []Node { return nil } +func (n *Flourish) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type FlourishFallbackImage struct { Format string `json:"format,omitempty"` Height float64 `json:"height,omitempty"` @@ -626,7 +836,7 @@ type FlourishFallbackImageSourceSetElem struct { type Heading struct { Type string `json:"type"` - Children []*Text `json:"children,omitempty"` + Children []*Text `json:"children"` Data interface{} `json:"data,omitempty"` Level string `json:"level,omitempty"` FragmentIdentifier string `json:"fragmentIdentifier,omitempty"` @@ -648,10 +858,18 @@ func (n *Heading) GetChildren() []Node { return result } +func (n *Heading) AppendChild(child Node) error { + if child.GetType() != TextType { + return ErrInvalidChildType + } + n.Children = append(n.Children, child.(*Text)) + return nil +} + type ImageSet struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` - ID string `json:"id,omitempty"` + ID string `json:"id"` Picture *Picture `json:"picture,omitempty"` FragmentIdentifier string `json:"fragmentIdentifier,omitempty"` } @@ -668,9 +886,11 @@ func (n *ImageSet) GetChildren() 
[]Node { return nil } +func (n *ImageSet) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type Layout struct { Type string `json:"type"` - Children []*LayoutChild `json:"children,omitempty"` + Children []*LayoutChild `json:"children"` Data interface{} `json:"data,omitempty"` LayoutName string `json:"layoutName,omitempty"` LayoutWidth string `json:"layoutWidth,omitempty"` @@ -692,6 +912,15 @@ func (n *Layout) GetChildren() []Node { return result } +func (n *Layout) AppendChild(child Node) error { + c, err := makeLayoutChild(child) + if err != nil { + return err + } + n.Children = append(n.Children, c) + return nil +} + type LayoutChild struct { *LayoutSlot *Heading @@ -728,6 +957,8 @@ func (n *LayoutChild) GetChildren() []Node { return nil } +func (n *LayoutChild) AppendChild(_ Node) error { return ErrCannotHaveChildren } + func (n *LayoutChild) UnmarshalJSON(data []byte) error { var tn typedNode if err := json.Unmarshal(data, &tn); err != nil { @@ -759,13 +990,40 @@ func (n *LayoutChild) UnmarshalJSON(data []byte) error { return nil } +func (n *LayoutChild) MarshalJSON() ([]byte, error) { + switch { + case n.LayoutSlot != nil: + return json.Marshal(n.LayoutSlot) + case n.Heading != nil: + return json.Marshal(n.Heading) + case n.LayoutImage != nil: + return json.Marshal(n.LayoutImage) + default: + return []byte(`{}`), nil + } +} + +// Build LayoutChild wrapper. 
+func makeLayoutChild(n Node) (*LayoutChild, error) { + switch n.GetType() { + case LayoutSlotType: + return &LayoutChild{LayoutSlot: n.(*LayoutSlot)}, nil + case HeadingType: + return &LayoutChild{Heading: n.(*Heading)}, nil + case LayoutImageType: + return &LayoutChild{LayoutImage: n.(*LayoutImage)}, nil + default: + return nil, ErrInvalidChildType + } +} + type LayoutImage struct { Type string `json:"type"` - Alt string `json:"alt,omitempty"` - Caption string `json:"caption,omitempty"` - Credit string `json:"credit,omitempty"` + Alt string `json:"alt"` + Caption string `json:"caption"` + Credit string `json:"credit"` Data interface{} `json:"data,omitempty"` - ID string `json:"id,omitempty"` + ID string `json:"id"` Picture *Picture `json:"picture,omitempty"` } @@ -781,9 +1039,11 @@ func (n *LayoutImage) GetChildren() []Node { return nil } +func (n *LayoutImage) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type LayoutSlot struct { Type string `json:"type"` - Children []*LayoutSlotChild `json:"children,omitempty"` + Children []*LayoutSlotChild `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -803,6 +1063,15 @@ func (n *LayoutSlot) GetChildren() []Node { return result } +func (n *LayoutSlot) AppendChild(child Node) error { + c, err := makeLayoutSlotChild(child) + if err != nil { + return err + } + n.Children = append(n.Children, c) + return nil +} + type LayoutSlotChild struct { *Paragraph *Heading @@ -839,6 +1108,8 @@ func (n *LayoutSlotChild) GetChildren() []Node { return nil } +func (n *LayoutSlotChild) AppendChild(_ Node) error { return ErrCannotHaveChildren } + func (n *LayoutSlotChild) UnmarshalJSON(data []byte) error { var tn typedNode if err := json.Unmarshal(data, &tn); err != nil { @@ -870,12 +1141,39 @@ func (n *LayoutSlotChild) UnmarshalJSON(data []byte) error { return nil } +func (n *LayoutSlotChild) MarshalJSON() ([]byte, error) { + switch { + case n.Paragraph != nil: + return json.Marshal(n.Paragraph) + case n.Heading != 
nil: + return json.Marshal(n.Heading) + case n.LayoutImage != nil: + return json.Marshal(n.LayoutImage) + default: + return []byte(`{}`), nil + } +} + +// Build LayoutSlotChild wrapper. +func makeLayoutSlotChild(n Node) (*LayoutSlotChild, error) { + switch n.GetType() { + case ParagraphType: + return &LayoutSlotChild{Paragraph: n.(*Paragraph)}, nil + case HeadingType: + return &LayoutSlotChild{Heading: n.(*Heading)}, nil + case LayoutImageType: + return &LayoutSlotChild{LayoutImage: n.(*LayoutImage)}, nil + default: + return nil, ErrInvalidChildType + } +} + type Link struct { Type string `json:"type"` - Children []*Phrasing `json:"children,omitempty"` + Children []*Phrasing `json:"children"` Data interface{} `json:"data,omitempty"` - Title string `json:"title,omitempty"` - URL string `json:"url,omitempty"` + Title string `json:"title"` + URL string `json:"url"` } func (n *Link) GetType() string { @@ -894,11 +1192,20 @@ func (n *Link) GetChildren() []Node { return result } +func (n *Link) AppendChild(child Node) error { + p, err := makePhrasing(child) + if err != nil { + return err + } + n.Children = append(n.Children, p) + return nil +} + type List struct { Type string `json:"type"` - Children []*ListItem `json:"children,omitempty"` + Children []*ListItem `json:"children"` Data interface{} `json:"data,omitempty"` - Ordered bool `json:"ordered,omitempty"` + Ordered bool `json:"ordered"` } func (n *List) GetType() string { @@ -917,9 +1224,18 @@ func (n *List) GetChildren() []Node { return result } +func (n *List) AppendChild(child Node) error { + // Keep strict: only accept ListItem + if child.GetType() != ListItemType { + return ErrInvalidChildType + } + n.Children = append(n.Children, child.(*ListItem)) + return nil +} + type ListItem struct { Type string `json:"type"` - Children []*ListItemChild `json:"children,omitempty"` + Children []*ListItemChild `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -939,6 +1255,15 @@ func (n *ListItem) 
GetChildren() []Node { return result } +func (n *ListItem) AppendChild(child Node) error { + c, err := makeListItemChild(child) + if err != nil { + return err + } + n.Children = append(n.Children, c) + return nil +} + type ListItemChild struct { *Paragraph *Text @@ -1003,6 +1328,8 @@ func (n *ListItemChild) GetChildren() []Node { return nil } +func (n *ListItemChild) AppendChild(child Node) error { return ErrCannotHaveChildren } + func (n *ListItemChild) UnmarshalJSON(data []byte) error { type node struct { Type string `json:"type"` @@ -1061,9 +1388,52 @@ func (n *ListItemChild) UnmarshalJSON(data []byte) error { return nil } +func (n *ListItemChild) MarshalJSON() ([]byte, error) { + switch { + case n.Paragraph != nil: + return json.Marshal(n.Paragraph) + case n.Text != nil: + return json.Marshal(n.Text) + case n.Break != nil: + return json.Marshal(n.Break) + case n.Strong != nil: + return json.Marshal(n.Strong) + case n.Emphasis != nil: + return json.Marshal(n.Emphasis) + case n.Strikethrough != nil: + return json.Marshal(n.Strikethrough) + case n.Link != nil: + return json.Marshal(n.Link) + default: + return []byte(`{}`), nil + } +} + +// Build a ListItemChild wrapper. 
+func makeListItemChild(n Node) (*ListItemChild, error) { + switch n.GetType() { + case ParagraphType: + return &ListItemChild{Paragraph: n.(*Paragraph)}, nil + case TextType: + return &ListItemChild{Text: n.(*Text)}, nil + case BreakType: + return &ListItemChild{Break: n.(*Break)}, nil + case StrongType: + return &ListItemChild{Strong: n.(*Strong)}, nil + case EmphasisType: + return &ListItemChild{Emphasis: n.(*Emphasis)}, nil + case StrikethroughType: + return &ListItemChild{Strikethrough: n.(*Strikethrough)}, nil + case LinkType: + return &ListItemChild{Link: n.(*Link)}, nil + default: + return nil, ErrInvalidChildType + } +} + type Paragraph struct { Type string `json:"type"` - Children []*Phrasing `json:"children,omitempty"` + Children []*Phrasing `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -1083,6 +1453,15 @@ func (n *Paragraph) GetChildren() []Node { return result } +func (n *Paragraph) AppendChild(child Node) error { + p, err := makePhrasing(child) + if err != nil { + return err + } + n.Children = append(n.Children, p) + return nil +} + type Phrasing struct { *Text *Break @@ -1140,6 +1519,8 @@ func (n *Phrasing) GetChildren() []Node { return nil } +func (n *Phrasing) AppendChild(_ Node) error { return ErrCannotHaveChildren } + func (n *Phrasing) UnmarshalJSON(data []byte) error { var tn typedNode if err := json.Unmarshal(data, &tn); err != nil { @@ -1189,11 +1570,50 @@ func (n *Phrasing) UnmarshalJSON(data []byte) error { return nil } +func (n *Phrasing) MarshalJSON() ([]byte, error) { + switch { + case n.Text != nil: + return json.Marshal(n.Text) + case n.Break != nil: + return json.Marshal(n.Break) + case n.Strong != nil: + return json.Marshal(n.Strong) + case n.Emphasis != nil: + return json.Marshal(n.Emphasis) + case n.Strikethrough != nil: + return json.Marshal(n.Strikethrough) + case n.Link != nil: + return json.Marshal(n.Link) + default: + return []byte(`{}`), nil + } +} + +// Build a Phrasing wrapper for 
paragraph/phrasing-bearing parents. +func makePhrasing(n Node) (*Phrasing, error) { + switch n.GetType() { + case TextType: + return &Phrasing{Text: n.(*Text)}, nil + case BreakType: + return &Phrasing{Break: n.(*Break)}, nil + case StrongType: + return &Phrasing{Strong: n.(*Strong)}, nil + case EmphasisType: + return &Phrasing{Emphasis: n.(*Emphasis)}, nil + case StrikethroughType: + return &Phrasing{Strikethrough: n.(*Strikethrough)}, nil + case LinkType: + return &Phrasing{Link: n.(*Link)}, nil + default: + return nil, ErrInvalidChildType + } +} + type Pullquote struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` - Source string `json:"source,omitempty"` - Text string `json:"text,omitempty"` + Source string `json:"source"` + Text string `json:"text"` } func (n *Pullquote) GetType() string { @@ -1208,13 +1628,15 @@ func (n *Pullquote) GetChildren() []Node { return nil } +func (n *Pullquote) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type Recommended struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` - Heading string `json:"heading,omitempty"` - ID string `json:"id,omitempty"` + Heading string `json:"heading"` + ID string `json:"id"` Teaser *Teaser `json:"teaser,omitempty"` - TeaserTitleOverride string `json:"teaserTitleOverride,omitempty"` + TeaserTitleOverride string `json:"teaserTitleOverride"` } func (n *Recommended) GetType() string { @@ -1229,9 +1651,11 @@ func (n *Recommended) GetChildren() []Node { return nil } +func (n *Recommended) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type ScrollyBlock struct { Type string `json:"type"` - Children []*ScrollySection `json:"children,omitempty"` + Children []*ScrollySection `json:"children"` Data interface{} `json:"data,omitempty"` Theme string `json:"theme,omitempty"` } @@ -1252,9 +1676,17 @@ func (n *ScrollyBlock) GetChildren() []Node { return result } +func (n *ScrollyBlock) AppendChild(child Node) error { + if child.GetType() != 
ScrollySectionType { + return ErrInvalidChildType + } + n.Children = append(n.Children, child.(*ScrollySection)) + return nil +} + type ScrollyCopy struct { Type string `json:"type"` - Children []*ScrollyCopyChild `json:"children,omitempty"` + Children []*ScrollyCopyChild `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -1274,6 +1706,15 @@ func (n *ScrollyCopy) GetChildren() []Node { return result } +func (n *ScrollyCopy) AppendChild(child Node) error { + c, err := makeScrollyCopyChild(child) + if err != nil { + return err + } + n.Children = append(n.Children, c) + return nil +} + type ScrollyCopyChild struct { *Paragraph *ScrollyHeading @@ -1303,6 +1744,8 @@ func (n *ScrollyCopyChild) GetChildren() []Node { return nil } +func (n *ScrollyCopyChild) AppendChild(child Node) error { return ErrCannotHaveChildren } + func (n *ScrollyCopyChild) UnmarshalJSON(data []byte) error { var tn typedNode if err := json.Unmarshal(data, &tn); err != nil { @@ -1328,9 +1771,32 @@ func (n *ScrollyCopyChild) UnmarshalJSON(data []byte) error { return nil } +func (n *ScrollyCopyChild) MarshalJSON() ([]byte, error) { + switch { + case n.Paragraph != nil: + return json.Marshal(n.Paragraph) + case n.ScrollyHeading != nil: + return json.Marshal(n.ScrollyHeading) + default: + return []byte(`{}`), nil + } +} + +// Build ScrollyCopyChild wrapper. 
+func makeScrollyCopyChild(n Node) (*ScrollyCopyChild, error) { + switch n.GetType() { + case ParagraphType: + return &ScrollyCopyChild{Paragraph: n.(*Paragraph)}, nil + case ScrollyHeadingType: + return &ScrollyCopyChild{ScrollyHeading: n.(*ScrollyHeading)}, nil + default: + return nil, ErrInvalidChildType + } +} + type ScrollyHeading struct { Type string `json:"type"` - Children []*Text `json:"children,omitempty"` + Children []*Text `json:"children"` Data interface{} `json:"data,omitempty"` Level string `json:"level,omitempty"` } @@ -1351,6 +1817,8 @@ func (n *ScrollyHeading) GetChildren() []Node { return result } +func (n *ScrollyHeading) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type ScrollyImage struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` @@ -1370,9 +1838,11 @@ func (n *ScrollyImage) GetChildren() []Node { return nil } +func (n *ScrollyImage) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type ScrollySection struct { Type string `json:"type"` - Children []*ScrollySectionChild `json:"children,omitempty"` + Children []*ScrollySectionChild `json:"children"` Data interface{} `json:"data,omitempty"` Display string `json:"display,omitempty"` NoBox bool `json:"noBox,omitempty"` @@ -1396,6 +1866,15 @@ func (n *ScrollySection) GetChildren() []Node { return result } +func (n *ScrollySection) AppendChild(child Node) error { + c, err := makeScrollySectionChild(child) + if err != nil { + return err + } + n.Children = append(n.Children, c) + return nil +} + type ScrollySectionChild struct { *ScrollyCopy *ScrollyImage @@ -1425,6 +1904,8 @@ func (n *ScrollySectionChild) GetChildren() []Node { return nil } +func (n *ScrollySectionChild) AppendChild(_ Node) error { return ErrCannotHaveChildren } + func (n *ScrollySectionChild) UnmarshalJSON(data []byte) error { var tn typedNode if err := json.Unmarshal(data, &tn); err != nil { @@ -1450,9 +1931,32 @@ func (n *ScrollySectionChild) UnmarshalJSON(data []byte) error { 
return nil } +func (n *ScrollySectionChild) MarshalJSON() ([]byte, error) { + switch { + case n.ScrollyCopy != nil: + return json.Marshal(n.ScrollyCopy) + case n.ScrollyImage != nil: + return json.Marshal(n.ScrollyImage) + default: + return []byte(`{}`), nil + } +} + +// Build ScrollySectionChild wrapper. +func makeScrollySectionChild(n Node) (*ScrollySectionChild, error) { + switch n.GetType() { + case ScrollyCopyType: + return &ScrollySectionChild{ScrollyCopy: n.(*ScrollyCopy)}, nil + case ScrollyImageType: + return &ScrollySectionChild{ScrollyImage: n.(*ScrollyImage)}, nil + default: + return nil, ErrInvalidChildType + } +} + type Strikethrough struct { Type string `json:"type"` - Children []*Phrasing `json:"children,omitempty"` + Children []*Phrasing `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -1472,9 +1976,18 @@ func (n *Strikethrough) GetChildren() []Node { return result } +func (n *Strikethrough) AppendChild(child Node) error { + p, err := makePhrasing(child) + if err != nil { + return err + } + n.Children = append(n.Children, p) + return nil +} + type Strong struct { Type string `json:"type"` - Children []*Phrasing `json:"children,omitempty"` + Children []*Phrasing `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -1494,9 +2007,18 @@ func (n *Strong) GetChildren() []Node { return result } +func (n *Strong) AppendChild(child Node) error { + p, err := makePhrasing(child) + if err != nil { + return err + } + n.Children = append(n.Children, p) + return nil +} + type Table struct { Type string `json:"type"` - Children []*TableChild `json:"children,omitempty"` + Children []*TableChild `json:"children"` CollapseAfterHowManyRows float64 `json:"collapseAfterHowManyRows,omitempty"` ColumnSettings []*ColumnSettingsItems `json:"columnSettings,omitempty"` Compact bool `json:"compact,omitempty"` @@ -1522,6 +2044,15 @@ func (n *Table) GetChildren() []Node { return result } +func (n *Table) AppendChild(child Node) error { + c, err := 
makeTableChild(child) + if err != nil { + return err + } + n.Children = append(n.Children, c) + return nil +} + type TableChild struct { *TableCaption *TableBody @@ -1558,6 +2089,8 @@ func (n *TableChild) GetChildren() []Node { return nil } +func (n *TableChild) AppendChild(_ Node) error { return ErrCannotHaveChildren } + func (n *TableChild) UnmarshalJSON(data []byte) error { var tn typedNode if err := json.Unmarshal(data, &tn); err != nil { @@ -1589,9 +2122,36 @@ func (n *TableChild) UnmarshalJSON(data []byte) error { return nil } +func (n *TableChild) MarshalJSON() ([]byte, error) { + switch { + case n.TableCaption != nil: + return json.Marshal(n.TableCaption) + case n.TableBody != nil: + return json.Marshal(n.TableBody) + case n.TableFooter != nil: + return json.Marshal(n.TableFooter) + default: + return []byte(`{}`), nil + } +} + +// Build TableChild wrapper. +func makeTableChild(n Node) (*TableChild, error) { + switch n.GetType() { + case TableCaptionType: + return &TableChild{TableCaption: n.(*TableCaption)}, nil + case TableBodyType: + return &TableChild{TableBody: n.(*TableBody)}, nil + case TableFooterType: + return &TableChild{TableFooter: n.(*TableFooter)}, nil + default: + return nil, ErrInvalidChildType + } +} + type TableBody struct { Type string `json:"type"` - Children []*TableRow `json:"children,omitempty"` + Children []*TableRow `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -1611,9 +2171,17 @@ func (n *TableBody) GetChildren() []Node { return result } +func (n *TableBody) AppendChild(child Node) error { + if child.GetType() != TableRowType { + return ErrInvalidChildType + } + n.Children = append(n.Children, child.(*TableRow)) + return nil +} + type TableCaption struct { Type string `json:"type"` - Children []*Table `json:"children,omitempty"` + Children []*Table `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -1633,9 +2201,17 @@ func (n *TableCaption) GetChildren() []Node { return result } +func (n *TableCaption) 
AppendChild(child Node) error { + if child.GetType() != TableType { + return ErrInvalidChildType + } + n.Children = append(n.Children, child.(*Table)) + return nil +} + type TableCell struct { Type string `json:"type"` - Children []*Table `json:"children,omitempty"` + Children []*Table `json:"children"` Data interface{} `json:"data,omitempty"` Heading bool `json:"heading,omitempty"` } @@ -1656,9 +2232,17 @@ func (n *TableCell) GetChildren() []Node { return result } +func (n *TableCell) AppendChild(child Node) error { + if child.GetType() != TableType { + return ErrInvalidChildType + } + n.Children = append(n.Children, child.(*Table)) + return nil +} + type TableFooter struct { Type string `json:"type"` - Children []*Table `json:"children,omitempty"` + Children []*Table `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -1678,9 +2262,17 @@ func (n *TableFooter) GetChildren() []Node { return result } +func (n *TableFooter) AppendChild(child Node) error { + if child.GetType() != TableType { + return ErrInvalidChildType + } + n.Children = append(n.Children, child.(*Table)) + return nil +} + type TableRow struct { Type string `json:"type"` - Children []*TableCell `json:"children,omitempty"` + Children []*TableCell `json:"children"` Data interface{} `json:"data,omitempty"` } @@ -1700,6 +2292,14 @@ func (n *TableRow) GetChildren() []Node { return result } +func (n *TableRow) AppendChild(child Node) error { + if child.GetType() != TableCellType { + return ErrInvalidChildType + } + n.Children = append(n.Children, child.(*TableCell)) + return nil +} + type Text struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` @@ -1718,6 +2318,8 @@ func (n *Text) GetChildren() []Node { return nil } +func (n *Text) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type ThematicBreak struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` @@ -1735,6 +2337,8 @@ func (n *ThematicBreak) GetChildren() []Node { return nil } +func 
(n *ThematicBreak) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type Tweet struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` @@ -1754,10 +2358,12 @@ func (n *Tweet) GetChildren() []Node { return nil } +func (n *Tweet) AppendChild(child Node) error { return ErrCannotHaveChildren } + type Video struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` - ID string `json:"id,omitempty"` + ID string `json:"id"` } func (n *Video) GetType() string { @@ -1772,6 +2378,8 @@ func (n *Video) GetChildren() []Node { return nil } +func (n *Video) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type YoutubeVideo struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` @@ -1790,11 +2398,13 @@ func (n *YoutubeVideo) GetChildren() []Node { return nil } +func (n *YoutubeVideo) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type CustomCodeComponent struct { Type string `json:"type"` Data interface{} `json:"data,omitempty"` - ID string `json:"id,omitempty"` - LayoutWidth string `json:"layoutWidth,omitempty"` + ID string `json:"id"` + LayoutWidth string `json:"layoutWidth"` Attributes map[string]interface{} `json:"attributes,omitempty"` AttributesLastModified string `json:"attributesLastModified,omitempty"` Path string `json:"path,omitempty"` @@ -1813,6 +2423,32 @@ func (n *CustomCodeComponent) GetChildren() []Node { return nil } +func (n *CustomCodeComponent) AppendChild(_ Node) error { return ErrCannotHaveChildren } + +type ClipSet struct { + Type string `json:"type"` + Data interface{} `json:"data,omitempty"` + ID string `json:"id,omitempty"` + LayoutWidth string `json:"layoutWidth,omitempty"` + Autoplay bool `json:"autoplay,omitempty"` + Loop bool `json:"loop,omitempty"` + Muted bool `json:"muted,omitempty"` +} + +func (n *ClipSet) GetType() string { + return n.Type +} + +func (n *ClipSet) GetEmbedded() Node { + return nil +} + +func (n *ClipSet) GetChildren() []Node { + return 
nil +} + +func (n *ClipSet) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type FallbackImage struct { Format string `json:"format,omitempty"` Height float64 `json:"height,omitempty"` @@ -1924,6 +2560,8 @@ func (n *Root) GetChildren() []Node { return nil } +func (n *Root) AppendChild(_ Node) error { return ErrCannotHaveChildren } + type SourceSetItems struct { Dpr float64 `json:"dpr,omitempty"` URL string `json:"url,omitempty"` diff --git a/go.mod b/go.mod index caddc00..0a93172 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,7 @@ module github.com/Financial-Times/content-tree -go 1.23.5 +go 1.25 + +require github.com/beevik/etree v1.6.0 + +require github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1 diff --git a/go.sum b/go.sum index e69de29..9bd5ab4 100644 --- a/go.sum +++ b/go.sum @@ -0,0 +1,4 @@ +github.com/beevik/etree v1.6.0 h1:u8Kwy8pp9D9XeITj2Z0XtA5qqZEmtJtuXZRQi+j03eE= +github.com/beevik/etree v1.6.0/go.mod h1:bh4zJxiIr62SOf9pRzN7UUYaEDa9HEKafK25+sLc0Gc= +github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1 h1:dOYG7LS/WK00RWZc8XGgcUTlTxpp3mKhdR2Q9z9HbXM= +github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1/go.mod h1:mpRZBD8SJ55OIICQ3iWH0Yz3cjzA61JdqMLoWXeB2+8= diff --git a/libraries/from-bodyxml/go/README.md b/libraries/from-bodyxml/go/README.md new file mode 100644 index 0000000..3f3cca3 --- /dev/null +++ b/libraries/from-bodyxml/go/README.md @@ -0,0 +1,31 @@ +# XML to Content Tree Transformer + +## Overview +The Transformer converts an external XHTML-formatted document into a content tree. +It supports the format stored in the **internalComponents** collection as well as the one returned by the **Internal Content API**. +The latter is produced by the content-public-read service after applying certain transformations to the bodyXML it retrieves from the internalComponents collection.
+These transformations include renaming the content, related, and concept tags to ft-content, ft-related, and ft-concept, respectively, and replacing the id attribute with url, with a few caveats. + +## Usage + +```go +package main + +import ( + "fmt" + "log" + + tocontenttree "github.com/Financial-Times/content-tree/libraries/from-bodyxml/go" +) + +func main() { + xmlInput := `

<body><p>Hello World</p></body>

` + + out, err := tocontenttree.Transform(xmlInput) + if err != nil { + log.Fatalf("Transform (XmlToTree) failed: %v", err) + } + + fmt.Printf("Transformed content tree: %+v\n", out) +} +``` \ No newline at end of file diff --git a/libraries/from-bodyxml/go/helpers.go b/libraries/from-bodyxml/go/helpers.go new file mode 100644 index 0000000..22f3bd0 --- /dev/null +++ b/libraries/from-bodyxml/go/helpers.go @@ -0,0 +1,104 @@ +package tocontenttree + +import ( + "strings" + + "github.com/beevik/etree" +) + +type layoutwidth string + +func toValidLayoutWidth(w string) layoutwidth { + switch w { + case "auto", "in-line", "inset-left", "inset-right", + "full-bleed", "full-grid", "mid-grid", "full-width": + return layoutwidth(w) + default: + return "full-width" + } +} + +func toValidClipLayoutWidth(w string) layoutwidth { + switch w { + case "in-line", "full-grid", "mid-grid": + return layoutwidth(w) + default: + return "in-line" + } + +} + +func findChild(el *etree.Element, tag string) *etree.Element { + for _, ch := range el.ChildElements() { + if ch.Tag == tag { + return ch + } + if found := findChild(ch, tag); found != nil { + return found + } + } + return nil +} + +func textContent(el *etree.Element) string { + var b strings.Builder + for _, tok := range el.Child { + switch t := tok.(type) { + case *etree.CharData: + b.WriteString(t.Data) + case *etree.Element: + b.WriteString(textContent(t)) + } + } + return b.String() +} + +func flattenedChildren(el *etree.Element) []etree.Token { + out := make([]etree.Token, 0, len(el.Child)) + for _, tok := range el.Child { + if d, ok := tok.(*etree.Element); ok && d.Tag == "div" { + out = append(out, d.Child...) 
+ } else { + out = append(out, tok) + } + } + return out +} + +func valueOr(v, fallback string) string { + if v != "" { + return v + } + return fallback +} + +func attr(el *etree.Element, name string) string { + return el.SelectAttrValue(name, "") +} + +var contentTypeTemplates = map[string]string{ + "http://www.ft.com/ontology/content/Article": "/content/{{id}}", + "http://www.ft.com/ontology/content/ImageSet": "/content/{{id}}", + "http://www.ft.com/ontology/content/ClipSet": "/content/{{id}}", + "http://www.ft.com/ontology/content/CustomCodeComponent": "/content/{{id}}", + "http://www.ft.com/ontology/content/MediaResource": "/content/{{id}}", + "http://www.ft.com/ontology/content/Video": "/content/{{id}}", + "http://www.ft.com/ontology/company/PublicCompany": "/organisations/{{id}}", + "http://www.ft.com/ontology/content/ContentPackage": "/content/{{id}}", + "http://www.ft.com/ontology/content/Content": "/content/{{id}}", + "http://www.ft.com/ontology/content/Image": "/content/{{id}}", + "http://www.ft.com/ontology/content/DynamicContent": "/content/{{id}}", + "http://www.ft.com/ontology/content/Graphic": "/content/{{id}}", + "http://www.ft.com/ontology/content/Audio": "/content/{{id}}", + "http://www.ft.com/ontology/company/Organisation": "/organisations/{{id}}", +} + +func generateUrl(t, id string) string { + const host = "http://api.ft.com" + template, ok := contentTypeTemplates[t] + if !ok { + return "" + } + path := strings.Replace(template, "{{id}}", id, 1) + return host + path +} diff --git a/libraries/from-bodyxml/go/html_transformers.go b/libraries/from-bodyxml/go/html_transformers.go new file mode 100644 index 0000000..dcc8d6f --- /dev/null +++ b/libraries/from-bodyxml/go/html_transformers.go @@ -0,0 +1,366 @@ +package tocontenttree + +import ( + "strings" + + contenttree "github.com/Financial-Times/content-tree" + "github.com/beevik/etree" +) + +type unknownNode struct { + Type string `json:"type"` + Data *etree.Element `json:"data,omitempty"` + Class 
string `json:"class,omitempty"` +} + +func (n *unknownNode) GetType() string { return n.Type } +func (n *unknownNode) GetEmbedded() contenttree.Node { return nil } +func (n *unknownNode) GetChildren() []contenttree.Node { return nil } +func (n *unknownNode) AppendChild(_ contenttree.Node) error { return contenttree.ErrCannotHaveChildren } + +func newUnknownNode(class string, data *etree.Element) *unknownNode { + return &unknownNode{ + Type: "__UNKNOWN__", + Class: class, + Data: data, + } +} + +type liftChildrenNode struct { + Type string `json:"type"` + Data interface{} `json:"data,omitempty"` + Class string `json:"class,omitempty"` +} + +func (n *liftChildrenNode) GetType() string { return n.Type } +func (n *liftChildrenNode) GetEmbedded() contenttree.Node { return nil } +func (n *liftChildrenNode) GetChildren() []contenttree.Node { return nil } +func (n *liftChildrenNode) AppendChild(child contenttree.Node) error { + return contenttree.ErrCannotHaveChildren +} + +func newLiftChildrenNode() *liftChildrenNode { + return &liftChildrenNode{ + Type: "__LIFT_CHILDREN__", + } +} + +var contentType = struct { + ImageSet string + Video string + Content string + Article string + CustomCodeComponent string + ClipSet string +}{ + ImageSet: "http://www.ft.com/ontology/content/ImageSet", + Video: "http://www.ft.com/ontology/content/Video", + Content: "http://www.ft.com/ontology/content/Content", + Article: "http://www.ft.com/ontology/content/Article", + CustomCodeComponent: "http://www.ft.com/ontology/content/CustomCodeComponent", + ClipSet: "http://www.ft.com/ontology/content/ClipSet", +} + +type transformer func(el *etree.Element) contenttree.Node + +var defaultTransformers = map[string]transformer{ + "h1": func(h1 *etree.Element) contenttree.Node { + dfrgId := valueOr(attr(h1, "data-fragment-identifier"), attr(h1, "id")) + heading := &contenttree.Heading{ + Type: contenttree.HeadingType, + Level: "chapter", + Children: []*contenttree.Text{}, + FragmentIdentifier: dfrgId, + 
} + return heading + }, + "h2": func(h2 *etree.Element) contenttree.Node { + dfrgId := valueOr(attr(h2, "data-fragment-identifier"), attr(h2, "id")) + return &contenttree.Heading{ + Type: contenttree.HeadingType, + Level: "subheading", + Children: []*contenttree.Text{}, + FragmentIdentifier: dfrgId, + } + }, + "h3": func(h3 *etree.Element) contenttree.Node { + dfrgId := valueOr(attr(h3, "data-fragment-identifier"), attr(h3, "id")) + return &contenttree.Heading{ + Type: contenttree.HeadingType, + Level: "subheading", + Children: []*contenttree.Text{}, + FragmentIdentifier: dfrgId, + } + }, + "h4": func(h4 *etree.Element) contenttree.Node { + dfrgId := valueOr(attr(h4, "data-fragment-identifier"), attr(h4, "id")) + return &contenttree.Heading{ + Type: contenttree.HeadingType, + Level: "label", + Children: []*contenttree.Text{}, + FragmentIdentifier: dfrgId, + } + }, + "p": func(p *etree.Element) contenttree.Node { + return &contenttree.Paragraph{ + Type: contenttree.ParagraphType, + Children: []*contenttree.Phrasing{}, + } + }, + "em": func(em *etree.Element) contenttree.Node { + return &contenttree.Emphasis{ + Type: contenttree.EmphasisType, + Children: []*contenttree.Phrasing{}, + } + }, + "strong": func(strong *etree.Element) contenttree.Node { + return &contenttree.Strong{ + Type: contenttree.StrongType, + Children: []*contenttree.Phrasing{}, + } + }, + "s": func(s *etree.Element) contenttree.Node { + return &contenttree.Strikethrough{ + Type: contenttree.StrikethroughType, + Children: []*contenttree.Phrasing{}, + } + }, + "br": func(br *etree.Element) contenttree.Node { + return &contenttree.Break{ + Type: contenttree.BreakType, + } + }, + "hr": func(hr *etree.Element) contenttree.Node { + return &contenttree.ThematicBreak{ + Type: contenttree.ThematicBreakType, + } + }, + "a": func(a *etree.Element) contenttree.Node { + if attr(a, "data-asset-type") == "video" { + url := attr(a, "href") + if strings.Contains(url, "youtube.com") { + return 
&contenttree.YoutubeVideo{ + Type: contenttree.YoutubeVideoType, + URL: url, + } + } + // NOTE: Vimeo not yet in spec + } else if attr(a, "data-asset-type") == "tweet" { + url := attr(a, "href") + return &contenttree.Tweet{ + Type: contenttree.TweetType, + ID: url, + } + } + return &contenttree.Link{ + Type: contenttree.LinkType, + Title: attr(a, "title"), + URL: attr(a, "href"), + Children: []*contenttree.Phrasing{}, + } + }, + "ol": func(ol *etree.Element) contenttree.Node { + return &contenttree.List{ + Type: contenttree.ListType, + Ordered: true, + Children: []*contenttree.ListItem{}, + } + }, + "ul": func(ul *etree.Element) contenttree.Node { + return &contenttree.List{ + Type: contenttree.ListType, + Ordered: false, + Children: []*contenttree.ListItem{}, + } + }, + "li": func(li *etree.Element) contenttree.Node { + return &contenttree.ListItem{ + Type: contenttree.ListItemType, + Children: []*contenttree.ListItemChild{}, + } + }, + "blockquote": func(bq *etree.Element) contenttree.Node { + return &contenttree.Blockquote{ + Type: contenttree.BlockquoteType, + Children: []*contenttree.BlockquoteChild{}, + } + }, + "pull-quote": func(pq *etree.Element) contenttree.Node { + textEl := findChild(pq, "pull-quote-text") + sourceEl := findChild(pq, "pull-quote-source") + return &contenttree.Pullquote{ + Type: contenttree.PullquoteType, + Text: func() string { + if textEl != nil { + return textContent(textEl) + } + return "" + }(), + Source: func() string { + if sourceEl != nil { + return textContent(sourceEl) + } + return "" + }(), + } + }, + "big-number": func(bn *etree.Element) contenttree.Node { + numEl := findChild(bn, "big-number-headline") + descEl := findChild(bn, "big-number-intro") + return &contenttree.BigNumber{ + Type: contenttree.BigNumberType, + Number: func() string { + if numEl != nil { + return textContent(numEl) + } + return "" + }(), + Description: func() string { + if descEl != nil { + return textContent(descEl) + } + return "" + }(), + } + }, + 
"img": func(img *etree.Element) contenttree.Node { + return &contenttree.LayoutImage{ + Type: contenttree.LayoutImageType, + ID: attr(img, "src"), + Credit: attr(img, "data-copyright"), + Alt: attr(img, "alt"), + Caption: attr(img, "longdesc"), + } + }, + + contentType.ImageSet: func(content *etree.Element) contenttree.Node { + dfrgId := valueOr(attr(content, "data-fragment-identifier"), attr(content, "id")) + return &contenttree.ImageSet{ + Type: contenttree.ImageSetType, + ID: attr(content, "url"), + FragmentIdentifier: dfrgId, + } + }, + contentType.Video: func(content *etree.Element) contenttree.Node { + return &contenttree.Video{ + Type: contenttree.VideoType, + ID: attr(content, "url"), + } + }, + contentType.Content: func(content *etree.Element) contenttree.Node { + id := attr(content, "url") + parts := strings.Split(id, "/") + uuid := "" + if len(parts) > 0 { + uuid = parts[len(parts)-1] + } + dfrgId := valueOr(attr(content, "data-fragment-identifier"), attr(content, "id")) + if attr(content, "data-asset-type") == "flourish" { + return &contenttree.Flourish{ + Type: contenttree.FlourishType, + Id: uuid, + FlourishType: attr(content, "data-flourish-type"), + LayoutWidth: string(toValidLayoutWidth(attr(content, "data-layout-width"))), + Description: attr(content, "alt"), + Timestamp: attr(content, "data-time-stamp"), + FragmentIdentifier: dfrgId, + } + } + return &contenttree.Link{ + Type: contenttree.LinkType, + URL: "https://www.ft.com/content/" + uuid, + Title: attr(content, "dataTitle"), + Children: []*contenttree.Phrasing{}, + } + }, + contentType.Article: func(content *etree.Element) contenttree.Node { + id := attr(content, "url") + parts := strings.Split(id, "/") + uuid := "" + if len(parts) > 0 { + uuid = parts[len(parts)-1] + } + return &contenttree.Link{ + Type: contenttree.LinkType, + URL: "https://www.ft.com/content/" + uuid, + Title: attr(content, "dataTitle"), + Children: []*contenttree.Phrasing{}, + } + }, + contentType.CustomCodeComponent: 
func(content *etree.Element) contenttree.Node { + id := attr(content, "url") + parts := strings.Split(id, "/") + uuid := "" + if len(parts) > 0 { + uuid = parts[len(parts)-1] + } + return &contenttree.CustomCodeComponent{ + Type: contenttree.CustomCodeComponentType, + ID: uuid, + LayoutWidth: string(toValidLayoutWidth(attr(content, "data-layout-width"))), + } + }, + contentType.ClipSet: func(content *etree.Element) contenttree.Node { + id := attr(content, "url") + parts := strings.Split(id, "/") + uuid := "" + if len(parts) > 0 { + uuid = parts[len(parts)-1] + } + return &contenttree.ClipSet{ + Type: contenttree.ClipSetType, + ID: uuid, + LayoutWidth: string(toValidClipLayoutWidth(attr(content, "data-layout-width"))), + Autoplay: attr(content, "autoplay") == "true", + Loop: attr(content, "loop") == "true", + Muted: attr(content, "muted") == "true", + } + }, + "recommended": func(rl *etree.Element) contenttree.Node { + id := "" + teaser := "" + if link := findChild(rl, "content"); link != nil { + id = generateUrl(attr(link, "type"), attr(link, "id")) + teaser = textContent(link) + } else if link := findChild(rl, "ft-content"); link != nil { + id = attr(link, "url") + teaser = textContent(link) + } + heading := findChild(rl, "recommended-title") + return &contenttree.Recommended{ + Type: contenttree.RecommendedType, + ID: id, + Heading: func() string { + if heading != nil { + return textContent(heading) + } + return "" + }(), + TeaserTitleOverride: teaser, + } + }, + "div": func(div *etree.Element) contenttree.Node { + switch attr(div, "class") { + case "n-content-layout": + return &contenttree.Layout{ + Type: contenttree.LayoutType, + LayoutName: valueOr(attr(div, "data-layout-name"), "auto"), + LayoutWidth: string(toValidLayoutWidth(attr(div, "data-layout-width"))), + Children: []*contenttree.LayoutChild{}, + } + case "n-content-layout__container": + return newLiftChildrenNode() + case "n-content-layout__slot": + div.Child = flattenedChildren(div) + return 
&contenttree.LayoutSlot{ + Type: contenttree.LayoutSlotType, + Children: []*contenttree.LayoutSlotChild{}, + } + default: + return newUnknownNode(attr(div, "class"), div) + } + }, + "experimental": func(_ *etree.Element) contenttree.Node { + return newLiftChildrenNode() + }, +} diff --git a/libraries/from-bodyxml/go/transform.go b/libraries/from-bodyxml/go/transform.go new file mode 100644 index 0000000..a3cb33b --- /dev/null +++ b/libraries/from-bodyxml/go/transform.go @@ -0,0 +1,134 @@ +package tocontenttree + +import ( + "fmt" + "io" + "strings" + + contenttree "github.com/Financial-Times/content-tree" + "github.com/beevik/etree" +) + +// Transform converts an external XHTML-formatted document into a content tree. +// It returns an error if the input contains unsupported HTML elements +// or does not comply with the content tree schema. +func Transform(bodyXML string) (*contenttree.Root, error) { + return fromETreeReader(strings.NewReader(bodyXML)) +} + +func fromETreeReader(r io.Reader) (*contenttree.Root, error) { + doc := etree.NewDocument() + _, err := doc.ReadFrom(r) + if err != nil { + return nil, err + } + + root := doc.Root() + if root == nil { + return nil, fmt.Errorf("no root element found") + } + + m := &contenttree.Body{Type: contenttree.BodyType, Version: 1} + err = convertToContentTree(root, m) + if err != nil { + return nil, err + } + + out := &contenttree.Root{ + Type: contenttree.RootType, + Body: m, + } + + return out, nil +} + +func convertToContentTree(elem etree.Token, m contenttree.Node) error { + switch t := elem.(type) { + case *etree.Element: + if t.Tag == "body" { + for _, child := range t.Child { + err := convertToContentTree(child, m) + if err != nil { + return err + } + } + return nil + } + + if t.Tag == "content" || t.Tag == "related" || t.Tag == "concept" { + id := attr(t, "id") + typeAttr := attr(t, "type") + if id != "" { + t.CreateAttr("url", generateUrl(typeAttr, id)) + if attr(t, "data-asset-type") != "flourish" { + 
t.RemoveAttr("id") + } + } + } + + tag := t.Tag + if t.Tag == "content" || t.Tag == "ft-content" { + for _, attr := range t.Attr { + if attr.Key == "type" { + tag = attr.Value + break + } + } + } + + transformer, ok := defaultTransformers[tag] + if !ok { + return fmt.Errorf("unknownNode transformer for tag <%s>", t.Tag) + } + + switch transformed := transformer(t).(type) { + case *unknownNode: + { + return fmt.Errorf("unknownNode div node with class '%s'", transformed.Class) + } + case *liftChildrenNode: + { + for _, child := range t.Child { + err := convertToContentTree(child, m) + if err != nil { + return err + } + } + return nil + } + default: + { + err := m.AppendChild(transformed) + if err != nil { + return fmt.Errorf( + "failed to append transformed child of type <%s> for parent <%s>: %w", + transformed.GetType(), + m.GetType(), + err, + ) + } + if transformed.GetChildren() != nil { + for _, child := range t.Child { + err := convertToContentTree(child, transformed) + if err != nil { + return err + } + } + } + return nil + } + + } + case *etree.CharData: + data := t.Data + tx := &contenttree.Text{ + Value: data, + Type: contenttree.TextType, + } + err := m.AppendChild(tx) + if err != nil { + return err + } + } + return nil +} diff --git a/libraries/from-bodyxml/go/transform_test.go b/libraries/from-bodyxml/go/transform_test.go new file mode 100644 index 0000000..fa8516c --- /dev/null +++ b/libraries/from-bodyxml/go/transform_test.go @@ -0,0 +1,95 @@ +package tocontenttree + +import ( + "encoding/json" + "errors" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/nsf/jsondiff" +) + +func TestTransform(t *testing.T) { + for _, test := range getTestCases(t) { + t.Run(test.name, func(t *testing.T) { + bodyTree, err := Transform(test.input) + + if err != nil && !test.wantErr { + t.Errorf("Failed with unexpected error: %v", err) + } + if err != nil && test.wantErr { + return + } + + want := strings.TrimSpace(test.output) + got, err := 
json.Marshal(bodyTree) + opts := jsondiff.DefaultJSONOptions() + diffQuery, _ := jsondiff.Compare(got, []byte(want), &opts) + if diffQuery != jsondiff.FullMatch { + t.Errorf("got: %s\n\n want: %s\n", got, want) + } + }) + } +} + +type TestCase struct { + name string + input string + output string + wantErr bool +} + +func getTestCases(t *testing.T) []TestCase { + t.Helper() + + inputPath := "../../../tests/bodyxml-to-content-tree/input" + outputPath := "../../../tests/bodyxml-to-content-tree/output" + + entries, err := os.ReadDir(inputPath) + if err != nil { + t.Fatal(err) + } + + testCases := make([]TestCase, 0, len(entries)) + + for _, entry := range entries { + if entry.IsDir() { + continue + } + + inputFile := filepath.Join(inputPath, entry.Name()) + + input, err := os.ReadFile(inputFile) + if err != nil { + t.Fatalf("Failed to read file %s: %s", inputFile, err) + } + + caseName := strings.TrimSuffix(entry.Name(), filepath.Ext(entry.Name())) + outputFile := filepath.Join(outputPath, caseName+".json") + + if _, err := os.Stat(outputFile); errors.Is(err, os.ErrNotExist) { + testCases = append(testCases, TestCase{ + name: caseName, + input: string(input), + output: "", + wantErr: true, + }) + } else { + output, err := os.ReadFile(outputFile) + if err != nil { + t.Fatalf("Failed to read file %s: %s", outputFile, err) + } + + testCases = append(testCases, TestCase{ + name: caseName, + input: string(input), + output: string(output), + wantErr: false, + }) + } + } + + return testCases +} diff --git a/libraries/from-bodyxml/index.js b/libraries/from-bodyxml/index.js index 8afb2cd..417b290 100644 --- a/libraries/from-bodyxml/index.js +++ b/libraries/from-bodyxml/index.js @@ -56,7 +56,7 @@ export let defaultTransformers = { * @type {Transformer} */ h1(h1) { - const fragmentId = h1.attributes["data-fragment-id"] || h1.attributes["id"]; + const fragmentId = h1.attributes["data-fragment-identifier"] || h1.attributes["id"]; return { type: "heading", level: "chapter", @@ 
-67,7 +67,7 @@ export let defaultTransformers = { * @type {Transformer} */ h2(h2) { - const fragmentId = h2.attributes["data-fragment-id"] || h2.attributes["id"]; + const fragmentId = h2.attributes["data-fragment-identifier"] || h2.attributes["id"]; return { type: "heading", level: "subheading", @@ -78,7 +78,7 @@ export let defaultTransformers = { * @type {Transformer} */ h3(h3) { - const fragmentId = h3.attributes["data-fragment-id"] || h3.attributes["id"]; + const fragmentId = h3.attributes["data-fragment-identifier"] || h3.attributes["id"]; return { type: "heading", level: "subheading", @@ -89,7 +89,7 @@ export let defaultTransformers = { * @type {Transformer} */ h4(h4) { - const fragmentId = h4.attributes["data-fragment-id"] || h4.attributes["id"]; + const fragmentId = h4.attributes["data-fragment-identifier"] || h4.attributes["id"]; return { type: "heading", level: "label", @@ -245,7 +245,7 @@ export let defaultTransformers = { * @type {Transformer} */ [ContentType.imageset](content) { - const fragmentId = content.attributes["data-fragment-id"] || content.attributes["id"]; + const fragmentId = content.attributes["data-fragment-identifier"] || content.attributes["id"]; return { type: "image-set", id: content.attributes.url ?? "", @@ -270,7 +270,7 @@ export let defaultTransformers = { [ContentType.content](content) { const id = content.attributes.url ?? ""; const uuid = id.split("/").pop(); - const fragmentId = content.attributes["data-fragment-id"] || content.attributes["id"]; + const fragmentId = content.attributes["data-fragment-identifier"] || content.attributes["id"]; if (content.attributes["data-asset-type"] == "flourish") { return /** @type {ContentTree.transit.Flourish} */ ({