
Commit 3ef37e1

Authored by chrisradek (Christopher Radek)
[website] generate llms.txt and llms-full.txt files (#8345)
An attempt to generate llms.txt - #8162. Info on llms.txt: https://llmstxt.org/

This PR adds the following:

- `llms.txt` in several locations
- `llms-full.txt` in several locations
- `llms.json` in the docs root - this is for easier consumption of llms.txt from our MCP tool, as we won't have to parse markdown to get links
- generated markdown files for all `docs` (excludes blogs and release notes)
- updates `tspd` to add some llmstxt frontmatter

This PR introduces `llms.txt` at a few different entry points:

- `https://typespec.io/docs/llms.txt`
- `https://typespec.io/docs/language-basics/llms.txt`
- `https://typespec.io/docs/standard-library/llms.txt`
- `https://typespec.io/docs/libraries/[name]/llms.txt`

Each of these files contains links to markdown documents in descendant paths; the root version contains the same links as all the other llms.txt files combined.

`llms-full.txt` is an unofficial but common practice: the concatenation of the docs that `llms.txt` points to. I'm generating these for possible use in our own MCP server (to be served up like typespec-mcp's `learn` tool), but I am concerned with how many tokens these files take up. My hand-written versions for TypeSpec Azure were much smaller (about 1/10th the size), so these will need to be tuned further.

One other potential change in this PR: make `site` configurable. For `llms.txt` the links should be full URLs, which means PR checks/localhost will still cause the embedded links to point to `https://typespec.io`. A new build step could be added to `website` for dev so links point to `localhost`, but I'm not sure how to make that work with the PR checks yet.

Co-authored-by: Christopher Radek <[email protected]>
1 parent 0654f27 commit 3ef37e1
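For context, the `llms.json` file mentioned above is a flat array of topic entries pointing at the per-topic `llms-full.txt` files. A rough illustration of its shape (values below are invented; the shape follows the `LlmsJson` type introduced in this commit):

```ts
// Illustrative llms.json payload (sample values, not actual generated output).
type LlmsJsonEntry = { topic: string; description: string; contentUrl: string };

const example: LlmsJsonEntry[] = [
  {
    topic: "language-basics",
    description: "Core TypeSpec language documentation",
    contentUrl: "https://typespec.io/docs/language-basics/llms-full.txt",
  },
];
```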

File tree

90 files changed (+769, -20 lines)

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
---
changeKind: feature
packages:
  - "@typespec/tspd"
---

Adds `llmstxt` frontmatter to generated reference docs to enable inclusion in llms.txt. Opt-in: specify `--llmstxt` to enable
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
---
changeKind: internal
packages:
  - "@typespec/events"
  - "@typespec/http"
  - "@typespec/json-schema"
  - "@typespec/openapi"
  - "@typespec/openapi3"
  - "@typespec/protobuf"
  - "@typespec/rest"
  - "@typespec/streams"
  - "@typespec/versioning"
---

Updated doc generation to generate `llmstxt` frontmatter

cspell.yaml

Lines changed: 2 additions & 0 deletions
@@ -129,6 +129,8 @@ words:
   - LINUXOS
   - LINUXVMIMAGE
   - ljust
+  - llms
+  - llmstxt
   - lmazuel
   - lropaging
   - lstrip

packages/astro-utils/package.json

Lines changed: 3 additions & 1 deletion
@@ -10,7 +10,9 @@
     "./components/*": "./src/components/*",
     "./utils/*": "./src/utils/*.ts",
     "./css/*": "./src/css/*",
-    "./expressive-code/*": "./dist/expressive-code/*.js"
+    "./expressive-code/*": "./dist/expressive-code/*.js",
+    "./llmstxt": "./src/llmstxt/index.ts",
+    "./llmstxt/schema": "./src/llmstxt/schema.ts"
   },
   "files": [
     "src",
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
import { mergeSiteWithPath, type DocEntry, type LlmsTxtAsJson } from "./index";

/**
 * Generates the Markdown path following the llms.txt specification.
 * @param docId The document ID from Astro content collections.
 */
export function generateMarkdownPath(docId: string): string {
  // If the final path fragment does not include a file extension, use `index.html.md`
  if (docId.endsWith("/")) {
    return `${docId}index.html.md`;
  }

  const finalPathFragment = docId.split("/").pop() ?? "";
  if (!finalPathFragment.includes(".")) {
    return `${docId}/index.html.md`;
  }

  return `${docId}.md`;
}

/**
 * Generates the LLMs text following the llms.txt specification.
 * @param llmsData The pre-processed LLMs JSON data.
 * @see `processDocsForLlmsTxt`
 */
export function generateLlmstxt(llmsData: LlmsTxtAsJson): string {
  const contents: string[] = [];
  contents.push(`# ${llmsData.title}`);
  contents.push(`> ${llmsData.description}`);

  for (const [name, topic] of Object.entries(llmsData.topics)) {
    if (!topic.length) continue;
    const section: string[] = [];
    section.push(`## ${name}\n`);
    for (const { title, url, description } of topic) {
      section.push(`- [${title}](${url}): ${description}`);
    }
    contents.push(section.join("\n"));
  }
  return contents.join("\n\n");
}

/**
 * Generates the full LLMs text - the combined markdown documentation referenced from an llms.txt.
 * @param title The title for the LLMs text file.
 * @param docs The collection of documentation entries to include.
 */
export function generateLlmstxtFull(title: string, docs: DocEntry[]): string {
  const contents: string[] = [];
  contents.push(`# ${title}`);

  for (const doc of docs) {
    if (!doc.body) continue;

    const docTitle = doc.data.title;
    const docDescription = doc.data.description ?? "";
    contents.push(`# ${docTitle}`);
    if (docDescription) contents.push(docDescription);
    contents.push(doc.body);
  }

  return contents.join("\n\n");
}

export type GenerateLlmsJsonTopicDetails = {
  id: string;
  description: string;
  pathPrefix: string;
};

export type LlmsJson = {
  topic: string;
  description: string;
  contentUrl: string;
}[];

/**
 * Generates the `llms.json` version of `llms.txt`.
 * This is meant for easier consumption by our tools.
 */
export function generateLlmsJson(
  topicDetails: GenerateLlmsJsonTopicDetails[],
  siteHref: string,
): LlmsJson {
  return topicDetails.map(({ id, description, pathPrefix }) => ({
    topic: id,
    description,
    contentUrl: mergeSiteWithPath(siteHref, pathPrefix, "llms-full.txt"),
  }));
}
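To make the generators concrete, here is a small usage sketch with invented sample data, showing the markdown path rewriting and the llms.txt layout the functions produce:

```ts
import { generateMarkdownPath, generateLlmstxt } from "./generators";

// Extension-less doc IDs become directory-style paths ending in index.html.md.
generateMarkdownPath("docs/language-basics/models");
// => "docs/language-basics/models/index.html.md"

const llmstxt = generateLlmstxt({
  title: "TypeSpec",
  description: "Sample description (invented for illustration)",
  topics: {
    Docs: [
      {
        title: "Models",
        description: "Defining models",
        url: "https://typespec.io/docs/language-basics/models/index.html.md",
      },
    ],
  },
});
// llmstxt is roughly:
// # TypeSpec
//
// > Sample description (invented for illustration)
//
// ## Docs
//
// - [Models](https://typespec.io/docs/language-basics/models/index.html.md): Defining models
```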
Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
import type { z } from "astro:content";
import { generateMarkdownPath } from "./generators";
import type { llmstxtSchema } from "./schema";

export * from "./generators";
export * from "./routes";
export * from "./topics";

export interface DocEntry {
  id: string;
  data: {
    title: string;
    description?: string;
    llmstxt?: z.infer<typeof llmstxtSchema>;
  };
  body?: string;
}

export interface LlmsTxtAsJson {
  title: string;
  description: string;
  topics: Record<
    string,
    {
      title: string;
      description?: string;
      url: string;
    }[]
  >;
}

export interface ProcessDocsProps {
  /**
   * Title for the LLMs text file.
   */
  title: string;
  /**
   * Description for the LLMs text file.
   */
  description: string;
  /**
   * The site URL, used to generate full URLs for documentation entries.
   */
  site?: URL;
  /**
   * The collection of documentation entries to process.
   * Each entry must include a valid LLMs text schema.
   * See `import("astro:content").getCollection`
   */
  docs: DocEntry[];
  /**
   * Name of the llmstxt section and the pathPrefix to match against doc IDs.
   * If a doc matches multiple pathPrefixes, it will be assigned to the first matching section.
   */
  llmsSections: { name: string; pathPrefix: string }[];
}

/**
 * Processes astro content collection docs and metadata for easy `llms.txt` generation.
 */
export async function processDocsForLlmsTxt({
  title,
  description,
  site,
  docs,
  llmsSections,
}: ProcessDocsProps) {
  const sections = organizeDocsIntoSections(docs, llmsSections);
  const result: LlmsTxtAsJson = { title, description, topics: {} };

  const siteHref = site?.href ?? "";
  for (const [sectionName, sectionDocs] of Object.entries(sections)) {
    if (sectionDocs.length === 0) continue;

    const topic = sectionName;
    const topics = sectionDocs.map((doc) => {
      const title = doc.data.title;
      const desc = doc.data.description ?? "";
      const path = generateMarkdownPath(doc.id);
      const url = mergeSiteWithPath(siteHref, path);
      return { title, description: desc, url };
    });

    result.topics[topic] = topics;
  }

  return result;
}

function organizeDocsIntoSections(
  docs: DocEntry[],
  llmsSections: ProcessDocsProps["llmsSections"],
) {
  docs.sort((a, b) => (a.id > b.id ? 1 : -1));
  const seenDocs = new Set<DocEntry>();
  const sections: Record<string, DocEntry[]> = {};

  for (const { name, pathPrefix } of llmsSections) {
    sections[name] = docs.filter((doc) => {
      if (seenDocs.has(doc)) return false;
      if (doc.id.startsWith(pathPrefix)) {
        seenDocs.add(doc);
        return true;
      }
      return false;
    });
  }

  return sections;
}

/**
 * Merges a site URL with path parts.
 * Used when needing to create full URLs when working with astro content collections.
 * @param siteHref The base URL of the site.
 * @param pathParts The path parts to merge with the site URL.
 * @returns The merged URL.
 */
export function mergeSiteWithPath(siteHref: string, ...pathParts: string[]): string {
  let result = siteHref;

  for (const part of pathParts) {
    if (!part) continue; // Skip empty parts

    const resultTrailingSlash = result.endsWith("/");
    const partLeadingSlash = part.startsWith("/");

    if (resultTrailingSlash && partLeadingSlash) {
      result = `${result}${part.slice(1)}`;
    } else if (!resultTrailingSlash && !partLeadingSlash) {
      result = `${result}/${part}`;
    } else {
      result = `${result}${part}`;
    }
  }

  return result;
}
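A short sketch of how these pieces fit together, using invented `DocEntry` values; note how `mergeSiteWithPath` normalizes the slashes between segments:

```ts
import { mergeSiteWithPath, processDocsForLlmsTxt } from "./index";

mergeSiteWithPath("https://typespec.io/", "/docs", "llms.txt");
// => "https://typespec.io/docs/llms.txt"

const llmsData = await processDocsForLlmsTxt({
  title: "TypeSpec",
  description: "Sample description (invented)",
  site: new URL("https://typespec.io"),
  docs: [
    { id: "docs/language-basics/models", data: { title: "Models", description: "Defining models" }, body: "..." },
    { id: "docs/standard-library/built-in-types", data: { title: "Built-in types" }, body: "..." },
  ],
  llmsSections: [
    { name: "Language Basics", pathPrefix: "docs/language-basics" },
    { name: "Standard Library", pathPrefix: "docs/standard-library" },
  ],
});
// llmsData.topics["Language Basics"][0].url
// => "https://typespec.io/docs/language-basics/models/index.html.md"
```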
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
import type { APIRoute } from "astro";
import {
  generateLlmstxt,
  generateLlmstxtFull,
  processDocsForLlmsTxt,
  type DocEntry,
  type TopicProps,
} from "./index";

export type RouteParams = { path: string; llms_type: "llms" | "llms-full" };
export type RouteProps = Pick<Required<TopicProps>, "title" | "description" | "docs">;

export const spreadLlmsTxtRoute: APIRoute<RouteProps, RouteParams> = async ({
  props,
  params,
  site,
}) => {
  const { title, docs, description } = props;
  const { llms_type } = params;

  if (llms_type === "llms") {
    const llmsData = await processDocsForLlmsTxt({
      title,
      description,
      docs,
      // Use blank pathPrefix to include all docs in the llms.txt
      llmsSections: [{ name: "Docs", pathPrefix: "" }],
      site,
    });

    const llmstxt = generateLlmstxt(llmsData);
    return new Response(llmstxt, {
      headers: {
        "Content-Type": "text/markdown; charset=utf-8",
      },
    });
  } else {
    const llmstxt = generateLlmstxtFull(description, docs);
    return new Response(llmstxt, {
      headers: {
        "Content-Type": "text/markdown; charset=utf-8",
      },
    });
  }
};

export const markdownRoute: APIRoute<{ doc: DocEntry }> = async ({ props }) => {
  const { doc } = props;
  return new Response(doc.body ?? "", {
    headers: {
      "Content-Type": "text/markdown; charset=utf-8",
    },
  });
};
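These handlers are meant to be re-exported from Astro file-based endpoints. A minimal sketch, assuming a hypothetical endpoint file such as `src/pages/[path]/[llms_type].txt.ts` and a content collection named `docs` (the actual page wiring is not shown in this diff):

```ts
// Hypothetical Astro endpoint re-exporting the shared handler (sketch only; names are assumptions).
import { getCollection } from "astro:content";
import { spreadLlmsTxtRoute } from "@typespec/astro-utils/llmstxt";

export const GET = spreadLlmsTxtRoute;

// getStaticPaths supplies the params/props the handler expects:
// /docs/llms.txt and /docs/llms-full.txt.
export async function getStaticPaths() {
  const docs = await getCollection("docs");
  return (["llms", "llms-full"] as const).map((llms_type) => ({
    params: { path: "docs", llms_type },
    props: { title: "TypeSpec", description: "TypeSpec documentation", docs },
  }));
}
```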
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
import { z } from "astro:content";

export const llmstxtSchema = z.boolean().optional();
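The schema is just an optional boolean, so pages opt in via frontmatter. A minimal sketch of folding it into a content collection definition (this collection wiring is an assumption, not part of the diff):

```ts
// Hypothetical content config extending a docs collection with the shared llmstxt field.
import { defineCollection, z } from "astro:content";
import { llmstxtSchema } from "@typespec/astro-utils/llmstxt/schema";

const docs = defineCollection({
  schema: z.object({
    title: z.string(),
    description: z.string().optional(),
    // Set `llmstxt: true` in frontmatter (or generate it with `tspd doc --llmstxt`) to opt a page in.
    llmstxt: llmstxtSchema,
  }),
});

export const collections = { docs };
```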
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
import type { DocEntry } from ".";

export interface TopicProps {
  title: string;
  description: string;
  pathPrefix: string;
  docs: DocEntry[];
  id: string;
}

/**
 * Assigns docs to topics by matching doc IDs against each topic's `pathPrefix`.
 * A doc that matches multiple prefixes is only assigned to the first matching topic.
 */
export function populateTopicDocs(
  topics: Omit<TopicProps, "docs">[],
  docs: DocEntry[],
): TopicProps[] {
  docs.sort((a, b) => (a.id > b.id ? 1 : -1));
  const seenDocs = new Set<DocEntry>();

  return topics.map((topic) => {
    return {
      ...topic,
      docs: docs.filter((doc) => {
        if (seenDocs.has(doc)) return false;
        if (doc.id.startsWith(topic.pathPrefix)) {
          seenDocs.add(doc);
          return true;
        }
        return false;
      }),
    };
  });
}
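A brief sketch of the first-match behavior, with invented topics and docs: a doc whose ID matches several prefixes only lands in the first topic listed.

```ts
import { populateTopicDocs, type TopicProps } from "./topics";

const topics: Omit<TopicProps, "docs">[] = [
  { id: "language-basics", title: "Language Basics", description: "Core language docs", pathPrefix: "docs/language-basics" },
  { id: "docs", title: "Docs", description: "Everything else", pathPrefix: "docs" },
];

const populated = populateTopicDocs(topics, [
  { id: "docs/language-basics/models", data: { title: "Models" } },
  { id: "docs/handbook/style-guide", data: { title: "Style guide" } },
]);
// "docs/language-basics/models" is claimed by "Language Basics" and not repeated under "Docs";
// "docs/handbook/style-guide" only matches the broader "docs" prefix.
```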

packages/events/package.json

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@
     "test-official": "vitest run --coverage --reporter=junit --reporter=default --no-file-parallelism",
     "lint": "eslint . --ext .ts --max-warnings=0",
     "lint:fix": "eslint . --fix --ext .ts",
-    "regen-docs": "tspd doc . --enable-experimental --output-dir ../../website/src/content/docs/docs/libraries/events/reference"
+    "regen-docs": "tspd doc . --enable-experimental --llmstxt --output-dir ../../website/src/content/docs/docs/libraries/events/reference"
   },
   "files": [
     "lib/*.tsp",
