11#!/usr/bin/env python3
2- """build_whats_new.py – regenerate RavenDB *What's New* pages
3-
4- The script lives in the project's **/scripts** folder, therefore the Docusaurus
5- root is assumed to be its parent directory (``../``).
2+ """build_whats_new.py – regenerate RavenDB *What's New* pages
63
74What the script does
85--------------------
961. **Downloads** changelog entries for one or more RavenDB branches via the
107 public Documentation API.
11- 2. **Converts** each entry's HTML body to Markdown using *markdownify*.
12- 3. **Sorts** the entries strictly by their ``buildDate`` field (newest → oldest).
13- 4. **Writes** them to ``whats-new.mdx`` files with front‑matter already in place.
8+ 2. **Sorts** the entries strictly by their ``buildDate`` field (newest → oldest).
9+ 3. **Writes** them to ``whats-new.mdx`` files with front-matter already in place.
10+ 4. **Escapes** raw angle brackets outside code (inline/fenced), preserving only
11+ `<hr />` (and `<hr>` / `<hr/>`) as real HTML; everything else is escaped.
12+ Existing `&lt;` / `&gt;` entities aren’t double-escaped.
13+ Also logs any tag-like snippets that were escaped.
1414
1515File locations
1616--------------
2525
2626Environment variable
2727--------------------
28- Set your RavenDB docs API key in ``API_WEB_RAVENDB_NET_HOST `` before running.
28+ Set the API endpoint URL in the ``WHATS_NEW_URL`` environment variable before running.
2929
3030Examples
3131--------
3838
3939from __future__ import annotations
4040
41- import os
4241import re
4342import sys
43+ from collections import Counter
4444from datetime import datetime
4545from pathlib import Path
4646from typing import List , Dict , Any
5959SCRIPT_DIR = Path (__file__ ).resolve ().parent
6060PROJECT_ROOT = SCRIPT_DIR .parent # «../» relative to /scripts
6161
62- # Docusaurus front‑ matter block that prefixes every generated MDX file
62+ # Docusaurus front- matter block that prefixes every generated MDX file
6363FRONT_MATTER = (
6464 "---\n "
6565 'title: "What\' s New"\n '
@@ -81,9 +81,7 @@ def get_api_page(branch: str, page: int = 1) -> Dict[str, Any]:
8181 """Return a single paginated payload from the Documentation API."""
8282 response = requests .get (
8383 API_BASE_URL ,
84- headers = {
85- "Accept" : "application/json" ,
86- },
84+ headers = {"Accept" : "application/json" },
8785 params = {"version" : branch , "page" : page },
8886 timeout = 20 ,
8987 )
@@ -114,26 +112,106 @@ def fetch_branch_entries(branch: str) -> List[Dict[str, Any]]:
114112
115113 return entries
116114
# ============================================================================
# Escaping helpers (whitelisted tags stay HTML, tag-like escapes are logged)
# ============================================================================

# Fenced code blocks (``` or ~~~), with optional info string.
_FENCE_RE = re.compile(r"(^|\n)(?P<fence>```|~~~)[^\n]*\n.*?\n(?P=fence)(?=\n|$)", re.DOTALL)
# Inline code spans.
_INLINE_CODE_RE = re.compile(r"`[^`]*`")
# Tag-like matcher; allows attributes, self-closing, etc.
_HTML_TAG_RE = re.compile(r"</?\s*([A-Za-z][A-Za-z0-9:-]*)\b(?:\s+[^<>]*?)?/?>")
# '###Server' -> '### Server'
_HEADING_SPACE_RE = re.compile(r"(?m)^(#{1,6})(?!\s|#)")
# Lower-cased tag names that are kept as real HTML (opening, closing and
# self-closing forms): <hr>, <hr/>, <hr /> and <code> / </code>.
_WHITELIST_TAGS = {"hr", "code"}

# Per-run log of escaped tag-like snippets; callers clear and read this list.
_ESCAPED_TAG_EVENTS: list[str] = []


def _log_tag_escape(snippet: str) -> None:
    """Record the literal tag-like *snippet* that was escaped, for reporting."""
    _ESCAPED_TAG_EVENTS.append(snippet)


def _escape_angles(text: str) -> str:
    """Replace every ``<`` / ``>`` in *text* with ``&lt;`` / ``&gt;``.

    ``&`` is deliberately left alone, so entities that are already present
    (``&lt;``, ``&gt;``, ``&amp;`` ...) are never double-escaped.
    """
    # NOTE(review): this also rewrites a leading ``>`` (Markdown blockquote
    # marker) in plain text -- confirm that is intended.
    return text.replace("<", "&lt;").replace(">", "&gt;")


def _escape_preserving_hr_only(text: str) -> str:
    """Escape ``<`` and ``>`` in plain text, keeping whitelisted tags as HTML.

    Tags whose name is in ``_WHITELIST_TAGS`` are emitted unchanged. Any other
    tag-like snippet (e.g. ``<T>``, ``<div>``, ``<Foo>``) is escaped and
    recorded through ``_log_tag_escape``.
    """
    out: list[str] = []
    last = 0
    for match in _HTML_TAG_RE.finditer(text):
        # Plain text between the previous tag-like match and this one.
        out.append(_escape_angles(text[last:match.start()]))

        tag_full = match.group(0)
        # Group 1 (the tag name) is mandatory in the pattern, so never None.
        if match.group(1).lower() in _WHITELIST_TAGS:
            out.append(tag_full)
        else:
            _log_tag_escape(tag_full)
            out.append(_escape_angles(tag_full))

        last = match.end()

    out.append(_escape_angles(text[last:]))
    return "".join(out)


def _escape_plain_chunk(chunk: str, at_line_start: bool) -> str:
    """Normalise headings in a plain-text chunk, then escape its angle brackets.

    When *at_line_start* is false the chunk begins mid-line (directly after an
    inline code span), so its first line cannot be a heading and is skipped by
    the ``'###Server' -> '### Server'`` fix-up.
    """
    if at_line_start:
        chunk = _HEADING_SPACE_RE.sub(r"\1 ", chunk)
    else:
        head, newline, rest = chunk.partition("\n")
        chunk = head + newline + _HEADING_SPACE_RE.sub(r"\1 ", rest)
    return _escape_preserving_hr_only(chunk)


def _escape_outside_inline_code(text: str) -> str:
    """Within a non-fenced area, escape everything outside inline code spans."""
    out: list[str] = []
    last = 0
    for match in _INLINE_CODE_RE.finditer(text):
        # Only the very first chunk can start at a real line start; later
        # chunks always follow the closing backtick of an inline code span.
        out.append(_escape_plain_chunk(text[last:match.start()], at_line_start=last == 0))
        out.append(match.group(0))  # keep inline code as-is
        last = match.end()
    out.append(_escape_plain_chunk(text[last:], at_line_start=last == 0))
    return "".join(out)


def escape_angle_brackets(markdown: str) -> str:
    """Escape ``<`` and ``>`` everywhere except inside fenced/inline code.

    Fenced code blocks and inline code spans are copied through untouched.
    Outside of code, whitelisted tags (see ``_WHITELIST_TAGS``) stay as HTML,
    every other ``<`` / ``>`` becomes ``&lt;`` / ``&gt;``, and headings missing
    the space after the ``#`` run are normalised.

    Existing ``&lt;`` / ``&gt;`` entities need no special protection: only the
    raw bracket characters are replaced and ``&`` is never touched, so they
    pass through unchanged.
    """
    out: list[str] = []
    last = 0
    for match in _FENCE_RE.finditer(markdown):
        out.append(_escape_outside_inline_code(markdown[last:match.start()]))  # non-fenced
        out.append(match.group(0))  # keep fenced code intact
        last = match.end()
    out.append(_escape_outside_inline_code(markdown[last:]))
    return "".join(out)
194+
117195# ============================================================================
118196# Conversion helpers
119197# ============================================================================
120198
def mdx_heading(entry: Dict[str, Any]) -> str:
    """Create a level-2 MDX heading from an API entry.

    Reads ``entry["buildDate"]`` (parsed with ``API_DATE_FMT``) and
    ``entry["version"]`` and returns ``"## <version> - <YYYY/MM/DD>"``
    followed by a blank line.

    Raises:
        KeyError: if either key is missing from *entry*.
        ValueError: if ``buildDate`` does not match ``API_DATE_FMT``.
    """
    date_str = datetime.strptime(entry["buildDate"], API_DATE_FMT).strftime("%Y/%m/%d")
    return f"## {entry['version']} - {date_str}\n\n"
125203
126-
def mdx_block(entry: Dict[str, Any]) -> str:
    """Full MDX chunk for a single changelog entry (heading + body).

    The body is ``entry["changelogMarkdown"]`` passed through
    ``escape_angle_brackets``; a blank line is appended after the body.
    """
    safe_body = escape_angle_brackets(entry["changelogMarkdown"])
    return mdx_heading(entry) + safe_body + "\n\n"
130208
131209# ============================================================================
132210# Filesystem helpers
133211# ============================================================================
134212
135213def output_path_for (branch : str , is_primary : bool ) -> Path :
136- """Return where the *whats‑ new.mdx* for *branch* should live."""
214+ """Return where the *whats- new.mdx* for *branch* should live."""
137215 # We only need major.minor for the directory name – e.g. "6.2.1" → "6.2"
138216 major_minor = "." .join (branch .split ("." )[:2 ])
139217
@@ -142,7 +220,6 @@ def output_path_for(branch: str, is_primary: bool) -> Path:
142220
143221 return PROJECT_ROOT / "versioned_docs" / f"version-{ major_minor } " / "whats-new.mdx"
144222
145-
146223def write_whats_new_file (destination : Path , entries : List [Dict [str , Any ]]) -> None :
147224 """Write an MDX file sorted by *buildDate* (newest first)."""
148225 destination .parent .mkdir (parents = True , exist_ok = True )
@@ -156,7 +233,7 @@ def write_whats_new_file(destination: Path, entries: List[Dict[str, Any]]) -> No
156233 destination .write_text (FRONT_MATTER + body , encoding = "utf-8" )
157234
158235# ============================================================================
159- # Command‑ line interface
236+ # Command- line interface
160237# ============================================================================
161238
162239def main () -> None :
@@ -168,14 +245,23 @@ def main() -> None:
168245 requested_branches = sys .argv [1 :]
169246
170247 for branch in requested_branches :
248+ # reset log for this branch
249+ _ESCAPED_TAG_EVENTS .clear ()
250+
171251 is_primary = branch == primary_branch
172252 changelog_entries = fetch_branch_entries (branch )
173253 target_file = output_path_for (branch , is_primary )
174254 write_whats_new_file (target_file , changelog_entries )
255+
175256 print (f"✅ Wrote { target_file .relative_to (PROJECT_ROOT )} " )
176257
177- print ("🏁 Finished." )
258+ if _ESCAPED_TAG_EVENTS :
259+ counts = Counter (_ESCAPED_TAG_EVENTS )
260+ # print a concise per-branch summary for safe escapes
261+ summary = ", " .join (f"{ tag } ×{ n } " for tag , n in counts .most_common ())
262+ print (f" • Escaped non-whitelisted tag-like snippets: { summary } " )
178263
264+ print ("🏁 Finished." )
179265
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
0 commit comments