PolicyEngine · SakshiKekre · Apr 2, 2026
diff --git a/website/src/app/llms-full.txt/route.ts b/website/src/app/llms-full.txt/route.ts
@@ -0,0 +1,7 @@
+import { generateArticleFile } from "@/lib/llmsTxt";
+
+export function GET() {
+  // No region and no limit: generateArticleFile emits the complete archive.
+  const headers = { "Content-Type": "text/plain; charset=utf-8" };
+  return new Response(generateArticleFile(), { headers });
+}
diff --git a/website/src/app/llms-recent.txt/route.ts b/website/src/app/llms-recent.txt/route.ts
@@ -0,0 +1,7 @@
+import { generateArticleFile } from "@/lib/llmsTxt";
+
+export function GET() {
+  // All regions, capped at the first 50 sorted posts (the "recent" subset).
+  const headers = { "Content-Type": "text/plain; charset=utf-8" };
+  return new Response(generateArticleFile(undefined, 50), { headers });
+}
diff --git a/website/src/app/llms-research-uk.txt/route.ts b/website/src/app/llms-research-uk.txt/route.ts
@@ -0,0 +1,7 @@
+import { generateArticleFile } from "@/lib/llmsTxt";
+
+export function GET() {
+  // Only posts tagged "uk", with no cap on the number of articles.
+  const headers = { "Content-Type": "text/plain; charset=utf-8" };
+  return new Response(generateArticleFile("uk"), { headers });
+}
diff --git a/website/src/app/llms-research-us.txt/route.ts b/website/src/app/llms-research-us.txt/route.ts
@@ -0,0 +1,7 @@
+import { generateArticleFile } from "@/lib/llmsTxt";
+
+export function GET() {
+  // Only posts tagged "us", with no cap on the number of articles.
+  const headers = { "Content-Type": "text/plain; charset=utf-8" };
+  return new Response(generateArticleFile("us"), { headers });
+}
diff --git a/website/src/app/llms.txt/route.ts b/website/src/app/llms.txt/route.ts
@@ -0,0 +1,7 @@
+import { generateIndex } from "@/lib/llmsTxt";
+
+export function GET() {
+  // The index page that links to the per-region, recent and full text files.
+  const headers = { "Content-Type": "text/plain; charset=utf-8" };
+  return new Response(generateIndex(), { headers });
+}
diff --git a/website/src/lib/llmsTxt.ts b/website/src/lib/llmsTxt.ts
@@ -0,0 +1,247 @@
+/**
+ * Shared logic for generating llms.txt content.
+ *
+ * Ported from app/scripts/generate-llms-txt.ts with minimal changes:
+ * - Replaced standalone fs/path scaffolding with existing website modules
+ * - Core transformation logic (charts, iframes, notebooks) is unchanged
+ */
+
+import { getPostsSorted, type BlogPost } from "@/data/posts/postTransformers";
+import { getArticleContent } from "@/lib/articles";
+import authorsData from "@/data/posts/authors.json";
+import type { AuthorsCollection } from "@/types/blog";
+
+const authors = authorsData as AuthorsCollection;
+
+/**
+ * Summarize a Plotly figure as a short bracketed placeholder for LLM output.
+ *
+ * A non-empty caption wins outright and the JSON is never parsed. Otherwise the
+ * summary lists whichever of the x-axis title, y-axis title and first trace
+ * type are present. Axis titles may be `{ text }` objects or bare strings (both
+ * forms occur in Plotly JSON). Unparseable or uninformative JSON produces a
+ * generic pointer back to the original article.
+ */
+function extractChartSummary(plotlyJson: string, caption?: string): string {
+  if (caption) {
+    return `[Chart: ${caption}]`;
+  }
+
+  // A Plotly title is either a bare string or an object carrying `text`.
+  const titleText = (title: unknown): unknown =>
+    typeof title === "string"
+      ? title
+      : (title as { text?: unknown } | null | undefined)?.text;
+
+  try {
+    const chart = JSON.parse(plotlyJson);
+    const xTitle = titleText(chart.layout?.xaxis?.title);
+    const yTitle = titleText(chart.layout?.yaxis?.title);
+    const traceType = chart.data?.[0]?.type;
+    const parts: string[] = [];
+    if (xTitle) parts.push(`x-axis: ${xTitle}`);
+    if (yTitle) parts.push(`y-axis: ${yTitle}`);
+    if (traceType) parts.push(`type: ${traceType}`);
+    if (parts.length > 0) return `[Chart: ${parts.join(", ")}]`;
+  } catch {
+    // Malformed JSON (or a null document): fall through to the placeholder.
+  }
+  return "[Chart: see original article]";
+}
+
+/**
+ * Rewrite article markdown for LLM consumption.
+ *
+ * - A bold "Figure N:" / "Table N:" caption directly followed by a plotly
+ *   block becomes the bold caption text plus a `[Chart: …]` summary.
+ * - Any remaining plotly block becomes a summary derived from its JSON.
+ * - An empty `<iframe …></iframe>` with a double-quoted `src` becomes
+ *   `[Interactive: …]`, labelled by its `title` when non-empty and otherwise
+ *   by the last path segment of `src`.
+ * Everything else, tables included, passes through untouched.
+ */
+function transformArticleContent(content: string): string {
+  // Captioned figures first, so the generic plotly pass cannot claim them.
+  const captioned = content.replace(
+    /\*\*(?:Figure|Table)\s*\d*:?\s*([^*]+)\*\*\s*\n+```plotly\n([\s\S]*?)```/g,
+    (_match, caption: string, json: string) => {
+      const label = caption.trim();
+      return `**${label}**\n\n${extractChartSummary(json.trim(), label)}`;
+    },
+  );
+
+  const withoutPlotly = captioned.replace(
+    /```plotly\n([\s\S]*?)```/g,
+    (_match, json: string) => extractChartSummary(json.trim()),
+  );
+
+  // Read src and title with separate patterns: in a single combined pattern a
+  // greedy [^>]* swallows the title, so an optional title group never captures.
+  return withoutPlotly.replace(/<iframe\b[^>]*><\/iframe>/g, (tag) => {
+    const src = /\ssrc="([^"]*)"/.exec(tag)?.[1];
+    if (src === undefined) return tag; // no quoted src: leave the markup alone
+    const title = /\stitle="([^"]*)"/.exec(tag)?.[1];
+    if (title) return `[Interactive: ${title}]`;
+
+    // No usable title: derive a label from the last path segment of the URL.
+    const segments = src.split("/").filter(Boolean);
+    const lastPart = segments[segments.length - 1]?.replace(".html", "") || "";
+    return `[Interactive: ${lastPart.replace(/-/g, " ") || "see original article"}]`;
+  });
+}
+
+/**
+ * Convert raw Jupyter notebook JSON into markdown-style text.
+ *
+ * Markdown cells are emitted verbatim; code cells become fenced python blocks
+ * followed by one plain fence per textual output (stream `text`, otherwise the
+ * `text/plain` entry of `data`). Other cell types are dropped. nbformat stores
+ * multi-line text as either one string or a list of strings, so both forms are
+ * accepted. If the input cannot be parsed or walked, it is returned unchanged.
+ */
+function extractNotebookContent(raw: string): string {
+  type MultilineText = string | string[];
+  type Output = { text?: MultilineText; data?: Record<string, MultilineText> };
+  type Cell = { cell_type: string; source: MultilineText; outputs?: Output[] };
+
+  // Collapse either nbformat multi-line representation into a single string.
+  const asText = (value: MultilineText): string =>
+    typeof value === "string" ? value : value.join("");
+  const fence = (body: string, lang = ""): string =>
+    `\`\`\`${lang}\n${body}\n\`\`\``;
+
+  try {
+    const nb = JSON.parse(raw) as { cells: Cell[] };
+    const parts: string[] = [];
+
+    for (const cell of nb.cells) {
+      const source = asText(cell.source);
+      if (cell.cell_type === "markdown") {
+        parts.push(source);
+      } else if (cell.cell_type === "code") {
+        parts.push(fence(source, "python"));
+        for (const output of cell.outputs || []) {
+          // Stream text takes precedence over the text/plain display payload.
+          const text = output.text || output.data?.["text/plain"];
+          if (text) parts.push(fence(asText(text)));
+        }
+      }
+    }
+
+    return parts.join("\n\n");
+  } catch {
+    // Best effort: not a walkable notebook, so hand the input back untouched.
+    return raw;
+  }
+}
+
+/**
+ * Render one post as a metadata header followed by its LLM-ready body text.
+ * Notebook posts (`.ipynb`) are flattened to text before transformation.
+ */
+function formatArticle(post: BlogPost): string {
+  const { filename } = post;
+  const source = getArticleContent(filename);
+  const isNotebook = filename.endsWith(".ipynb");
+  const body = transformArticleContent(
+    isNotebook ? extractNotebookContent(source) : source,
+  );
+  // Author ids without a named entry in authors.json are printed as-is.
+  const byline = post.authors.map((id) => authors[id]?.name || id).join(", ");
+
+  return `---
+# ${post.title}
+Slug: ${post.slug}
+Date: ${post.date}
+Authors: ${byline}
+Tags: ${post.tags.join(", ")}
+Description: ${post.description}
+---
+
+${body}
+`;
+}
+
+/**
+ * Assemble the llms.txt index: site blurb, first ten sorted posts, file links.
+ */
+export function generateIndex(): string {
+  const all = getPostsSorted();
+  const countTagged = (tag: "us" | "uk"): number =>
+    all.filter((post) => post.tags.includes(tag)).length;
+  const toBullet = (p: BlogPost): string =>
+    `- [${p.title}](/research/${p.slug}): ${p.description}`;
+  return `# PolicyEngine Research
+
+> PolicyEngine is a free, open-source tool for analyzing tax and benefit policy impacts through microsimulation modeling. We provide household calculators and society-wide impact analysis for the US and UK.
+
+## About PolicyEngine
+
+PolicyEngine enables users to:
+- Calculate how policy changes affect individual households
+- Estimate society-wide impacts on revenue, poverty, and inequality
+- Compare reform proposals across different scenarios
+- Access programmatic policy analysis via API
+
+## Recent Research
+
+${all.slice(0, 10).map(toBullet).join("\n")}
+
+## Research by Region
+
+- [US Research](/llms-research-us.txt): ${countTagged("us")} articles on US federal and state policy
+- [UK Research](/llms-research-uk.txt): ${countTagged("uk")} articles on UK tax and benefit policy
+
+## Recent Research (Full Text)
+
+- [Recent Articles](/llms-recent.txt): Last 50 articles with full text
+
+## Full Archive
+
+- [All Research](/llms-full.txt): Complete archive of all PolicyEngine research articles
+
+## Documentation
+
+- [API Documentation](https://policyengine.org/us/api): Programmatic access to PolicyEngine
+- [Python Package](https://policyengine.github.io/policyengine-us/): policyengine-us documentation
+
+## Contact
+
+- Website: https://policyengine.org
+- GitHub: https://github.com/PolicyEngine
+- Email: hello@policyengine.org
+`;
+}
+
+/**
+ * Generate combined article file for a region, recent subset, or full archive.
+ * Posts that fail to format are skipped with a warning; the "recent" header
+ * counts only the articles actually included.
+ */
+export function generateArticleFile(
+  region?: "us" | "uk",
+  limit?: number,
+): string {
+  const sorted = getPostsSorted();
+  const inRegion = region ? sorted.filter((p) => p.tags.includes(region)) : sorted;
+  const selected = limit ? inRegion.slice(0, limit) : inRegion;
+  const articles: string[] = [];
+  for (const post of selected) {
+    try {
+      articles.push(formatArticle(post));
+    } catch (error: unknown) {
+      console.warn(`llms.txt: skipping "${post.slug}":`, error);
+    }
+  }
+
+  let header: string;
+  if (limit) {
+    header = `# PolicyEngine Recent Research\n\n> The ${articles.length} most recent PolicyEngine research articles.\n\n`;
+  } else if (region) {
+    const regionLabel = region === "us" ? "US federal and state" : "UK";
+    header = `# PolicyEngine ${region.toUpperCase()} Research\n\n> ${regionLabel} tax and benefit policy analysis.\n\n`;
+  } else {
+    header = `# PolicyEngine Research Archive\n\n> Complete archive of PolicyEngine research articles.\n\n`;
+  }
+  return header + articles.join("\n\n---\n\n");
+}