Skip to content

Commit 15659a4

Browse files
authored
Merge pull request #2115 from poissoncorp/RDoc-3472
RDoc-3472 Fix What's New building on TC
2 parents 2b6f5b2 + 9ac1ac5 commit 15659a4

File tree

1 file changed

+107
-21
lines changed

1 file changed

+107
-21
lines changed

scripts/build_whats_new.py

Lines changed: 107 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
#!/usr/bin/env python3
2-
"""build_whats_new.py – regenerate RavenDB *What's New* pages
3-
4-
The script lives in the project's **/scripts** folder, therefore the Docusaurus
5-
root is assumed to be its parent directory (``../``).
2+
"""build_whats_new.py – regenerate RavenDB *What's New* pages
63
74
What the script does
85
--------------------
96
1. **Downloads** changelog entries for one or more RavenDB branches via the
107
public Documentation API.
11-
2. **Converts** each entry's HTML body to Markdown using *markdownify*.
12-
3. **Sorts** the entries strictly by their ``buildDate`` field (newest → oldest).
13-
4. **Writes** them to ``whats-new.mdx`` files with front‑matter already in place.
8+
2. **Sorts** the entries strictly by their ``buildDate`` field (newest → oldest).
9+
3. **Writes** them to ``whats-new.mdx`` files with front-matter already in place.
10+
4. **Escapes** raw angle brackets outside code (inline/fenced), preserving `<code>` tags and
11+
`<hr />` (and `<hr>` / `<hr/>`) as real HTML; everything else is escaped.
12+
Existing `&lt;` / `&gt;` aren’t double-escaped.
13+
Also logs any tag-like snippets that were escaped.
1414
1515
File locations
1616
--------------
@@ -25,7 +25,7 @@
2525
2626
Environment variable
2727
--------------------
28-
Set your RavenDB docs API key in ``API_WEB_RAVENDB_NET_HOST`` before running.
28+
Set the API endpoint in ``WHATS_NEW_URL`` before running.
2929
3030
Examples
3131
--------
@@ -38,9 +38,9 @@
3838

3939
from __future__ import annotations
4040

41-
import os
4241
import re
4342
import sys
43+
from collections import Counter
4444
from datetime import datetime
4545
from pathlib import Path
4646
from typing import List, Dict, Any
@@ -59,7 +59,7 @@
5959
SCRIPT_DIR = Path(__file__).resolve().parent
6060
PROJECT_ROOT = SCRIPT_DIR.parent # «../» relative to /scripts
6161

62-
# Docusaurus frontmatter block that prefixes every generated MDX file
62+
# Docusaurus front-matter block that prefixes every generated MDX file
6363
FRONT_MATTER = (
6464
"---\n"
6565
'title: "What\'s New"\n'
@@ -81,9 +81,7 @@ def get_api_page(branch: str, page: int = 1) -> Dict[str, Any]:
8181
"""Return a single paginated payload from the Documentation API."""
8282
response = requests.get(
8383
API_BASE_URL,
84-
headers={
85-
"Accept": "application/json",
86-
},
84+
headers={"Accept": "application/json"},
8785
params={"version": branch, "page": page},
8886
timeout=20,
8987
)
@@ -114,26 +112,106 @@ def fetch_branch_entries(branch: str) -> List[Dict[str, Any]]:
114112

115113
return entries
116114

115+
# ============================================================================
116+
# Escaping helpers (whitelist <hr> and <code>, log tag-like escapes)
117+
# ============================================================================
118+
119+
# fenced code blocks (``` or ~~~), with optional info string
120+
_FENCE_RE = re.compile(r"(^|\n)(?P<fence>```|~~~)[^\n]*\n.*?\n(?P=fence)(?=\n|$)", re.DOTALL)
121+
# inline code spans
122+
_INLINE_CODE_RE = re.compile(r"`[^`]*`")
123+
# tag-like matcher; allows attributes, self-closing, etc.
124+
_HTML_TAG_RE = re.compile(r"</?\s*([A-Za-z][A-Za-z0-9:-]*)\b(?:\s+[^<>]*?)?/?>")
125+
# '###Server' -> '### Server'
126+
_HEADING_SPACE_RE = re.compile(r"(?m)^(#{1,6})(?!\s|#)")
127+
# whitelist: keep only <hr>, <hr/>, <hr /> (case-insensitive)
128+
_WHITELIST_TAGS = {"hr", "code"}
129+
130+
# per-run log of escaped tag-like snippets
131+
_ESCAPED_TAG_EVENTS: list[str] = []
132+
133+
def _log_tag_escape(snippet: str) -> None:
    """Record *snippet* verbatim in the module-level list of escaped tags.

    The literal text is stored (not just the tag name) so the caller can
    report exactly what was escaped.
    """
    _ESCAPED_TAG_EVENTS.extend((snippet,))
136+
137+
def _escape_angles(text: str) -> str:
138+
return text.replace("<", "&lt;").replace(">", "&gt;")
139+
140+
def _escape_preserving_hr_only(text: str) -> str:
    """Escape ``<`` and ``>`` in plain text, keeping whitelisted tags as HTML.

    Every tag-like snippet found by ``_HTML_TAG_RE`` whose lower-cased name is
    in ``_WHITELIST_TAGS`` is copied through untouched.  Any other tag-like
    snippet (e.g. ``<T>``, ``<div>``, ``<Foo>``) is logged and escaped, and so
    is every stray angle bracket between snippets.
    """
    pieces: list[str] = []
    cursor = 0
    for tag in _HTML_TAG_RE.finditer(text):
        # Plain text between the previous snippet and this one.
        pieces.append(_escape_angles(text[cursor:tag.start()]))

        snippet = tag.group(0)
        # Group 1 (the tag name) is a mandatory part of the pattern, so it is
        # always a non-empty string once the regex has matched.
        if tag.group(1).lower() in _WHITELIST_TAGS:
            pieces.append(snippet)
        else:
            _log_tag_escape(snippet)
            pieces.append(_escape_angles(snippet))

        cursor = tag.end()

    # Whatever trails the last tag-like snippet.
    pieces.append(_escape_angles(text[cursor:]))
    return "".join(pieces)
162+
163+
def _escape_outside_inline_code(text: str) -> str:
    """Within a non-fenced region, normalise headings and escape angle brackets.

    Inline code spans (single-backtick delimited) are copied through verbatim.
    Every plain-text slice between them gets the missing-space heading fix
    ('###Server' -> '### Server') and is then passed to
    ``_escape_preserving_hr_only``.

    The heading fix is anchored with a multiline ``^``, which also matches at
    the very start of a slice.  A slice that follows an inline code span starts
    mid-line, so its first (partial) line is excluded from the heading fix;
    otherwise text such as "`x`#123" would be rewritten to "`x`# 123".
    """

    def _process_plain(chunk: str, at_line_start: bool) -> str:
        # Fix headings only where `^` really denotes the start of a line.
        if at_line_start:
            chunk = _HEADING_SPACE_RE.sub(r"\1 ", chunk)
        else:
            first_line, newline, remainder = chunk.partition("\n")
            chunk = first_line + newline + _HEADING_SPACE_RE.sub(r"\1 ", remainder)
        return _escape_preserving_hr_only(chunk)

    out, last = [], 0
    for match in _INLINE_CODE_RE.finditer(text):
        # `last` is 0 only before the first inline span, i.e. at the start of
        # the region, which is treated as the start of a line.
        out.append(_process_plain(text[last:match.start()], last == 0))
        out.append(match.group(0))  # keep inline code as-is
        last = match.end()

    # Tail after the last inline span (or the whole text if there was none).
    out.append(_process_plain(text[last:], last == 0))
    return "".join(out)
178+
179+
def escape_angle_brackets(markdown: str) -> str:
    """Escape ``<`` and ``>`` everywhere except inside fenced or inline code.

    Fenced code blocks are copied through untouched; all other regions are
    handed to ``_escape_outside_inline_code``, which keeps whitelisted tags
    as real HTML.  Entities that are already present (``&lt;`` / ``&gt;``)
    come out unchanged.
    """
    # Temporarily swap existing entities for NUL-delimited sentinels while the
    # text is processed; they are swapped back just before returning.
    lt_mark, gt_mark = "\x00LT\x00", "\x00GT\x00"
    shielded = markdown.replace("&lt;", lt_mark).replace("&gt;", gt_mark)

    pieces = []
    cursor = 0
    for fence in _FENCE_RE.finditer(shielded):
        # Region before the fence is escaped; the fence itself is kept intact.
        pieces.append(_escape_outside_inline_code(shielded[cursor:fence.start()]))
        pieces.append(fence.group(0))
        cursor = fence.end()
    pieces.append(_escape_outside_inline_code(shielded[cursor:]))

    return "".join(pieces).replace(lt_mark, "&lt;").replace(gt_mark, "&gt;")
194+
117195
# ============================================================================
118196
# Conversion helpers
119197
# ============================================================================
120198

121199
def mdx_heading(entry: Dict[str, Any]) -> str:
    """Build the level-2 MDX heading line for one API changelog entry.

    The entry's ``buildDate`` is parsed with ``API_DATE_FMT`` and rendered as
    ``YYYY/MM/DD`` after the entry's ``version``; the heading ends with a
    blank line.
    """
    build_date = datetime.strptime(entry["buildDate"], API_DATE_FMT)
    formatted_date = build_date.strftime("%Y/%m/%d")
    return f"## {entry['version']} - {formatted_date}\n\n"
125203

126-
127204
def mdx_block(entry: Dict[str, Any]) -> str:
    """Return the complete MDX chunk for one changelog entry.

    The chunk is the heading, then the entry's ``changelogMarkdown`` passed
    through ``escape_angle_brackets``, then a blank line.
    """
    # Escape the body first, then build the heading (same order as before).
    escaped_body = escape_angle_brackets(entry["changelogMarkdown"])
    return f"{mdx_heading(entry)}{escaped_body}\n\n"
130208

131209
# ============================================================================
132210
# Filesystem helpers
133211
# ============================================================================
134212

135213
def output_path_for(branch: str, is_primary: bool) -> Path:
136-
"""Return where the *whatsnew.mdx* for *branch* should live."""
214+
"""Return where the *whats-new.mdx* for *branch* should live."""
137215
# We only need major.minor for the directory name – e.g. "6.2.1" → "6.2"
138216
major_minor = ".".join(branch.split(".")[:2])
139217

@@ -142,7 +220,6 @@ def output_path_for(branch: str, is_primary: bool) -> Path:
142220

143221
return PROJECT_ROOT / "versioned_docs" / f"version-{major_minor}" / "whats-new.mdx"
144222

145-
146223
def write_whats_new_file(destination: Path, entries: List[Dict[str, Any]]) -> None:
147224
"""Write an MDX file sorted by *buildDate* (newest first)."""
148225
destination.parent.mkdir(parents=True, exist_ok=True)
@@ -156,7 +233,7 @@ def write_whats_new_file(destination: Path, entries: List[Dict[str, Any]]) -> No
156233
destination.write_text(FRONT_MATTER + body, encoding="utf-8")
157234

158235
# ============================================================================
159-
# Commandline interface
236+
# Command-line interface
160237
# ============================================================================
161238

162239
def main() -> None:
@@ -168,14 +245,23 @@ def main() -> None:
168245
requested_branches = sys.argv[1:]
169246

170247
for branch in requested_branches:
248+
# reset log for this branch
249+
_ESCAPED_TAG_EVENTS.clear()
250+
171251
is_primary = branch == primary_branch
172252
changelog_entries = fetch_branch_entries(branch)
173253
target_file = output_path_for(branch, is_primary)
174254
write_whats_new_file(target_file, changelog_entries)
255+
175256
print(f"✅ Wrote {target_file.relative_to(PROJECT_ROOT)}")
176257

177-
print("🏁 Finished.")
258+
if _ESCAPED_TAG_EVENTS:
259+
counts = Counter(_ESCAPED_TAG_EVENTS)
260+
# print a concise per-branch summary for safe escapes
261+
summary = ", ".join(f"{tag}×{n}" for tag, n in counts.most_common())
262+
print(f" • Escaped non-whitelisted tag-like snippets: {summary}")
178263

264+
print("🏁 Finished.")
179265

180266
if __name__ == "__main__":
181-
main()
267+
main()

0 commit comments

Comments
 (0)