Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/strip-html-from-docs.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@smithy/service-client-documentation-generator": minor
---

Strip HTML tags from Smithy documentation traits during TypeScript codegen, producing clean plaintext JSDoc comments instead of raw HTML. This improves hover-doc readability in editors like VS Code and Neovim.
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package software.amazon.smithy.typescript.codegen;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import software.amazon.smithy.utils.SmithyUnstableApi;

/**
 * Converts HTML documentation strings from Smithy model {@code @documentation}
 * traits into plain-text suitable for JSDoc comments.
 *
 * <p>The Smithy documentation trait values often contain HTML markup (e.g.
 * {@code <p>}, {@code <a>}, {@code <code>}, {@code <ul>/<li>}). This class
 * strips that markup while preserving readable formatting so that IDE hover
 * docs are clean and legible.
 *
 * <p>Conversion is regex-based and runs in three phases: tags are rewritten or
 * removed, character references are decoded in a single pass, and whitespace
 * is normalized. Decoding happens after tag removal so that escaped markup
 * such as {@code &lt;p&gt;} survives as the literal text {@code <p>}.
 */
@SmithyUnstableApi
final class DocumentationConverter {

    // Block-level elements that should produce paragraph breaks.
    private static final Pattern BLOCK_BREAK = Pattern.compile(
        "<\\s*/?(p|br|h[1-6]|div|section|article|header|footer|nav|aside|main|blockquote|pre|hr|table|thead|tbody|tfoot|tr)\\b[^>]*/?>",
        Pattern.CASE_INSENSITIVE
    );

    // List items get a leading dash for readability.
    private static final Pattern LIST_ITEM_OPEN = Pattern.compile(
        "<\\s*li\\b[^>]*>",
        Pattern.CASE_INSENSITIVE
    );

    // Closing list item tags.
    private static final Pattern LIST_ITEM_CLOSE = Pattern.compile(
        "<\\s*/li\\s*>",
        Pattern.CASE_INSENSITIVE
    );

    // <ul>, <ol>, <dl> open/close tags are replaced by a line break.
    private static final Pattern LIST_WRAPPER = Pattern.compile(
        "<\\s*/?(ul|ol|dl)\\b[^>]*>",
        Pattern.CASE_INSENSITIVE
    );

    // <dt> starts a new line and its closing tag becomes " - "; <dd> becomes a space.
    private static final Pattern DT_TAG = Pattern.compile(
        "<\\s*dt\\b[^>]*>",
        Pattern.CASE_INSENSITIVE
    );
    private static final Pattern DT_CLOSE = Pattern.compile(
        "<\\s*/dt\\s*>",
        Pattern.CASE_INSENSITIVE
    );
    private static final Pattern DD_TAG = Pattern.compile(
        "<\\s*dd\\b[^>]*>",
        Pattern.CASE_INSENSITIVE
    );
    private static final Pattern DD_CLOSE = Pattern.compile(
        "<\\s*/dd\\s*>",
        Pattern.CASE_INSENSITIVE
    );

    // <code> and <pre> content is wrapped in backticks.
    private static final Pattern CODE_OPEN = Pattern.compile(
        "<\\s*(code|pre)\\b[^>]*>",
        Pattern.CASE_INSENSITIVE
    );
    private static final Pattern CODE_CLOSE = Pattern.compile(
        "<\\s*/(code|pre)\\s*>",
        Pattern.CASE_INSENSITIVE
    );

    // Inline formatting tags (<b>, <strong>, <i>, <em>, ...) are stripped, keeping their content.
    private static final Pattern INLINE_FORMAT = Pattern.compile(
        "<\\s*/?(b|strong|i|em|u|s|strike|del|ins|sub|sup|small|big|span|font|mark|abbr|cite|dfn|kbd|samp|var|wbr)\\b[^>]*>",
        Pattern.CASE_INSENSITIVE
    );

    // Anchor tags: extract the link text, drop the URL.
    private static final Pattern ANCHOR = Pattern.compile(
        "<\\s*a\\b[^>]*>(.*?)<\\s*/a\\s*>",
        Pattern.CASE_INSENSITIVE | Pattern.DOTALL
    );

    // <th> and <td> open/close tags become a single space separating cells.
    private static final Pattern TABLE_CELL = Pattern.compile(
        "<\\s*/?(th|td)\\b[^>]*>",
        Pattern.CASE_INSENSITIVE
    );

    // Any remaining HTML tags.
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>");

    // One character reference: hex (&#x..; / &#X..;), decimal (&#..;) or named (&name;).
    // Group 1 is everything between '&' and ';'.
    private static final Pattern ENTITY = Pattern.compile("&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);");

    // Collapse runs of blank lines into at most two newlines (one blank line).
    private static final Pattern EXCESS_NEWLINES = Pattern.compile("\\n{3,}");
    // Collapse runs of spaces/tabs on a single line.
    private static final Pattern EXCESS_SPACES = Pattern.compile("[ \\t]{2,}");
    // Trailing whitespace on each line.
    private static final Pattern TRAILING_WS = Pattern.compile("[ \\t]+$", Pattern.MULTILINE);

    private DocumentationConverter() {}

    /**
     * Converts an HTML documentation string to plain text suitable for JSDoc.
     *
     * @param html the raw HTML documentation value from a Smithy model
     * @return a plain-text version with HTML tags removed and basic formatting preserved;
     *     {@code null} or the empty string is returned unchanged
     */
    static String htmlToPlainText(String html) {
        if (html == null || html.isEmpty()) {
            return html;
        }

        String s = html;

        // Anchors: keep link text only. Runs first so tags nested in the link text
        // are still handled by the steps below.
        s = ANCHOR.matcher(s).replaceAll("$1");

        // <code>/<pre> become backtick-wrapped.
        s = CODE_OPEN.matcher(s).replaceAll("`");
        s = CODE_CLOSE.matcher(s).replaceAll("`");

        // List items become newline + dash.
        s = LIST_ITEM_OPEN.matcher(s).replaceAll("\n - ");
        s = LIST_ITEM_CLOSE.matcher(s).replaceAll("");

        // Definition list elements.
        s = DT_TAG.matcher(s).replaceAll("\n");
        s = DT_CLOSE.matcher(s).replaceAll(" - ");
        s = DD_TAG.matcher(s).replaceAll(" ");
        s = DD_CLOSE.matcher(s).replaceAll("");

        // List wrappers.
        s = LIST_WRAPPER.matcher(s).replaceAll("\n");

        // Table cells: add spacing.
        s = TABLE_CELL.matcher(s).replaceAll(" ");

        // Block-level elements become a paragraph break.
        s = BLOCK_BREAK.matcher(s).replaceAll("\n\n");

        // Inline formatting tags are removed.
        s = INLINE_FORMAT.matcher(s).replaceAll("");

        // Any remaining tags.
        s = ANY_TAG.matcher(s).replaceAll("");

        // Decode character references only after all tags are gone, in one pass,
        // so decoded text is never re-interpreted as markup or as another entity.
        s = decodeEntities(s);

        // Normalize whitespace.
        s = TRAILING_WS.matcher(s).replaceAll("");
        s = EXCESS_SPACES.matcher(s).replaceAll(" ");
        s = EXCESS_NEWLINES.matcher(s).replaceAll("\n\n");

        return s.trim();
    }

    /**
     * Decodes every character reference in {@code text} in a single left-to-right pass.
     *
     * <p>Decoded text is appended literally and never rescanned, so {@code &#38;amp;}
     * yields {@code &amp;} rather than {@code &}, and a decoded {@code $} or {@code \}
     * is not treated as regex replacement syntax.
     */
    private static String decodeEntities(String text) {
        Matcher matcher = ENTITY.matcher(text);
        StringBuilder out = new StringBuilder(text.length());
        int copiedUpTo = 0;
        while (matcher.find()) {
            out.append(text, copiedUpTo, matcher.start());
            out.append(decodeEntity(matcher.group(), matcher.group(1)));
            copiedUpTo = matcher.end();
        }
        out.append(text, copiedUpTo, text.length());
        return out.toString();
    }

    /**
     * Decodes one character reference.
     *
     * @param entity the full reference including {@code &} and {@code ;}
     * @param body the text between {@code &} and {@code ;}
     * @return the decoded text, or {@code entity} unchanged if it is not recognised
     */
    private static String decodeEntity(String entity, String body) {
        if (body.charAt(0) == '#') {
            char marker = body.charAt(1);
            if (marker == 'x' || marker == 'X') {
                return decodeCodePoint(entity, body.substring(2), 16);
            }
            return decodeCodePoint(entity, body.substring(1), 10);
        }
        if ("amp".equalsIgnoreCase(body)) {
            return "&";
        }
        if ("lt".equalsIgnoreCase(body)) {
            return "<";
        }
        if ("gt".equalsIgnoreCase(body)) {
            return ">";
        }
        if ("quot".equalsIgnoreCase(body)) {
            return "\"";
        }
        if ("apos".equalsIgnoreCase(body)) {
            return "'";
        }
        if ("nbsp".equalsIgnoreCase(body)) {
            // A regular space, so the whitespace normalization can collapse it.
            return " ";
        }
        return entity;
    }

    /**
     * Converts the digits of a numeric character reference to text.
     *
     * @return the character(s) for the code point, or {@code entity} unchanged when the
     *     value overflows an {@code int}, is not a valid code point, or is a lone surrogate
     */
    private static String decodeCodePoint(String entity, String digits, int radix) {
        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, radix);
        } catch (NumberFormatException ignored) {
            // Too many digits to fit in an int: not a real code point, keep the text as written.
            return entity;
        }
        boolean surrogate = codePoint >= Character.MIN_SURROGATE && codePoint <= Character.MAX_SURROGATE;
        if (surrogate || !Character.isValidCodePoint(codePoint)) {
            return entity;
        }
        // toChars yields a surrogate pair for code points above U+FFFF.
        return new String(Character.toChars(codePoint));
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ boolean writeShapeDocs(Shape shape, UnaryOperator<String> preprocessor) {
if (hasDocumentation || hasDeprecation) {
String docs =
hasDocumentation ? shape.getTrait(DocumentationTrait.class).get().getValue() : "";
docs = DocumentationConverter.htmlToPlainText(docs);
docs = docs.replace("{", "\\{").replace("}", "\\}");

if (hasDeprecation) {
Expand Down Expand Up @@ -330,6 +331,7 @@ boolean writeMemberDocs(Model model, MemberShape member) {
if (hasDocumentation || hasDeprecation) {
String docs =
hasDocumentation ? member.getMemberTrait(model, DocumentationTrait.class).get().getValue() : "";
docs = DocumentationConverter.htmlToPlainText(docs);
docs = docs.replace("{", "\\{").replace("}", "\\}");

if (hasDeprecation) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0
*/
package software.amazon.smithy.typescript.codegen;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import org.junit.jupiter.api.Test;

/**
 * Unit tests for {@link DocumentationConverter#htmlToPlainText(String)}: null and empty
 * input, tag stripping, list and definition-list rendering, entity decoding, and
 * whitespace normalization.
 */
public class DocumentationConverterTest {

    /** Shorthand for the conversion under test. */
    private static String plain(String html) {
        return DocumentationConverter.htmlToPlainText(html);
    }

    @Test
    public void returnsNullForNull() {
        assertNull(plain(null));
    }

    @Test
    public void returnsEmptyForEmpty() {
        assertEquals("", plain(""));
    }

    @Test
    public void passesPlainTextThrough() {
        assertEquals("Hello world.", plain("Hello world."));
    }

    @Test
    public void stripsParagraphTags() {
        // Adjacent paragraphs are separated by exactly one blank line.
        assertEquals(
            "First paragraph.\n\nSecond paragraph.",
            plain("<p>First paragraph.</p><p>Second paragraph.</p>")
        );
    }

    @Test
    public void stripsAnchorTagsKeepsText() {
        assertEquals(
            "See the docs for details.",
            plain("See <a href=\"https://example.com\">the docs</a> for details.")
        );
    }

    @Test
    public void convertsCodeTagsToBackticks() {
        assertEquals(
            "Use the `FooClient` class.",
            plain("Use the <code>FooClient</code> class.")
        );
    }

    @Test
    public void convertsUnorderedList() {
        String result = plain("<p>Options:</p><ul><li>Option A</li><li>Option B</li></ul>");
        // Each list item should be rendered with a leading dash.
        assertTrue(result.startsWith("Options:"), "Should start with 'Options:', got: " + result);
        assertTrue(result.contains("- Option A"), "Should contain dash-prefixed Option A, got: " + result);
        assertTrue(result.contains("- Option B"), "Should contain dash-prefixed Option B, got: " + result);
    }

    @Test
    public void stripsInlineFormattingTags() {
        assertEquals(
            "This is bold and italic and strong.",
            plain("This is <b>bold</b> and <i>italic</i> and <strong>strong</strong>.")
        );
    }

    @Test
    public void decodesHtmlEntities() {
        assertEquals(
            "A & B < C > D \"E\" F's",
            plain("A &amp; B &lt; C &gt; D &quot;E&quot; F&#39;s")
        );
    }

    @Test
    public void decodesNumericEntities() {
        assertEquals("\u00A9 2024", plain("&#169; 2024"));
    }

    @Test
    public void decodesHexEntities() {
        assertEquals("\u00A9 2024", plain("&#xA9; 2024"));
    }

    @Test
    public void handlesBrTags() {
        // Both the self-closing and the bare form of <br> produce a paragraph break.
        assertEquals(
            "Line one.\n\nLine two.\n\nLine three.",
            plain("Line one.<br/>Line two.<br>Line three.")
        );
    }

    @Test
    public void collapsesExcessiveWhitespace() {
        assertEquals(
            "First.\n\nSecond.\n\nThird.",
            plain("<p>First.</p>\n\n\n<p>Second.</p>\n\n\n\n<p>Third.</p>")
        );
    }

    @Test
    public void handlesNestedHtml() {
        assertEquals(
            "Use `MyApi` to call the service.",
            plain("<p>Use <a href=\"https://example.com\"><code>MyApi</code></a> to call the service.</p>")
        );
    }

    @Test
    public void handlesDefinitionList() {
        String result = plain("<dl><dt>Term</dt><dd>Definition</dd></dl>");
        assertTrue(result.contains("Term"), "Should contain the term");
        assertTrue(result.contains("Definition"), "Should contain the definition");
        assertTrue(result.contains("-"), "Should contain a separator");
    }

    @Test
    public void handlesNbsp() {
        assertEquals("Hello world", plain("Hello&nbsp;world"));
    }

    @Test
    public void handlesRealWorldEcsExample() {
        // Simplified version of the ECS RegisterTaskDefinition docs from the issue.
        String input = "<p>Registers a new task definition from the supplied <code>family</code> and "
            + "<code>containerDefinitions</code>. Optionally, you can add data volumes to your containers "
            + "with the <code>volumes</code> parameter. For more information about task definition parameters "
            + "and defaults, see <a href=\"https://docs.aws.amazon.com/AmazonECS/latest/developerguide/"
            + "task_defintions.html\">Amazon ECS Task Definitions</a> in the "
            + "<i>Amazon Elastic Container Service Developer Guide</i>.</p>";
        String expected = "Registers a new task definition from the supplied `family` and "
            + "`containerDefinitions`. Optionally, you can add data volumes to your containers "
            + "with the `volumes` parameter. For more information about task definition parameters "
            + "and defaults, see Amazon ECS Task Definitions in the "
            + "Amazon Elastic Container Service Developer Guide.";
        assertEquals(expected, plain(input));
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -190,4 +190,22 @@ public void buildDramaticDeprecationAnnotation() {
String result = TypeScriptWriter.buildDeprecationAnnotation(trait);
assertEquals("@deprecated Noo!!!", result);
}

/**
 * Checks that {@code writeShapeDocs} emits the documentation trait as plain text:
 * the converted sentence is present and no raw paragraph, code or anchor tags remain.
 */
@Test
public void writeShapeDocsStripsHtmlTags() {
    DocumentationTrait htmlDocs = new DocumentationTrait(
        "<p>Use the <code>FooClient</code> to call <a href=\"https://example.com\">the API</a>.</p>");
    StringShape documented = StringShape.builder()
        .id(ShapeId.from("com.example#MyString"))
        .addTrait(htmlDocs)
        .build();

    TypeScriptWriter tsWriter = new TypeScriptWriter("foo");
    tsWriter.writeShapeDocs(documented);
    String output = tsWriter.toString();

    // The readable sentence survives with <code> rendered as backticks.
    assertThat(output, containsString("Use the `FooClient` to call the API."));
    // None of the original markup leaks into the generated comment.
    assertThat(output, not(containsString("<p>")));
    assertThat(output, not(containsString("<code>")));
    assertThat(output, not(containsString("<a ")));
}
}