From 8ed5a365ae2f16ae7cb628ef8a5e4932d4ca2d2f Mon Sep 17 00:00:00 2001 From: "bluecloud-gilfoyle[bot]" <262642412+bluecloud-gilfoyle[bot]@users.noreply.github.com> Date: Sat, 30 May 2026 12:53:12 +0000 Subject: [PATCH] agent: ingestion pin CPython source SHAs --- .../agent-context/cpython-source-sha-pin.md | 25 ++++++++---- src/mcp_server_python_docs/__main__.py | 36 ++++++++++++++++- .../ingestion/cpython_versions.py | 35 +++++++++++++---- tests/test_ingestion.py | 39 +++++++++++++++++++ 4 files changed, 120 insertions(+), 15 deletions(-) diff --git a/.planning/agent-context/cpython-source-sha-pin.md b/.planning/agent-context/cpython-source-sha-pin.md index 8b22546..ca177f9 100644 --- a/.planning/agent-context/cpython-source-sha-pin.md +++ b/.planning/agent-context/cpython-source-sha-pin.md @@ -57,11 +57,22 @@ ## 5. Decision log - Resolved SHAs (tag → 40-hex commit), one line each: - - 3.10 / v3.10.20 → - - 3.11 / v3.11.15 → - - 3.12 / v3.12.13 → - - 3.13 / v3.13.13 → - - 3.14 / v3.14.4 → -- Where/how the verification aborts on mismatch: + - 3.10 / v3.10.20 → 842e987df856a5d4db37933c62a3456930a19092 + - 3.11 / v3.11.15 → 2340a037f7450e70fccfe411e6531afb4d57a312 + - 3.12 / v3.12.13 → 3bb231a6a5dc02b95658877318bf61501a7209e9 + - 3.13 / v3.13.13 → 01104ce1beb3135c2e0c01ec835b994c1f55a1c0 + - 3.14 / v3.14.4 → 23116f998f6789d8c2fbe5ed5b8146854c8c2a4f +- Where/how the verification aborts on mismatch: after the shallow + tag-based clone in `build-index`, `git -C CLONE_DIR rev-parse HEAD` is + compared to the authoritative config SHA. A mismatch logs the version, tag, + actual SHA, and expected SHA, then raises `SystemExit(1)` before Sphinx setup + or content ingestion can proceed. - **Draft SECURITY.md threat-model paragraph (for Vision to apply):** - > + > The largest build-time supply-chain input is the `build-index` clone of the + > upstream CPython repository, which provides the source tree used to generate + > canonical documentation content. 
def _verify_cpython_source_sha(
    clone_dir: str,
    *,
    version: str,
    tag: str,
    expected_sha: str,
) -> None:
    """Stop the docs build when the clone's HEAD is not the expected commit.

    Asks git for the commit that HEAD points to inside ``clone_dir`` and
    compares it, after stripping surrounding whitespace, with ``expected_sha``.

    Args:
        clone_dir: Directory handed to ``git -C``.
        version: Version label; used only in the error log message.
        tag: Tag name; used only in the error log message.
        expected_sha: Commit SHA that HEAD has to equal exactly.

    Raises:
        SystemExit: With exit code 1 when the two SHAs differ.
        subprocess.CalledProcessError: When git exits non-zero, because the
            command is run with ``check=True``.
    """
    head_cmd = ["git", "-C", clone_dir, "rev-parse", "HEAD"]
    completed = subprocess.run(
        head_cmd,
        check=True,
        capture_output=True,
        text=True,
    )

    # Captured stdout ends with a newline; drop it before comparing.
    head_sha = completed.stdout.strip()
    if head_sha == expected_sha:
        return

    # Log every value needed to diagnose the mismatch, then exit non-zero.
    logger.error(
        "CPython %s source integrity check failed: tag %s "
        "resolved to %s, expected %s. Aborting build.",
        version,
        tag,
        head_sha,
        expected_sha,
    )
    raise SystemExit(1)
def test_cpython_source_sha_verification_aborts_on_mismatch(
    self,
    monkeypatch,
    caplog,
):
    """A HEAD SHA that differs from the expected one must exit with code 1."""
    from mcp_server_python_docs import __main__ as cli_main

    clone_dir = "/tmp/cpython-3.14"
    recorded_cmds: list[list[str]] = []

    def fake_run(
        cmd: list[str],
        *,
        check: bool,
        capture_output: bool,
        text: bool,
    ) -> subprocess.CompletedProcess[str]:
        # Stand-in for subprocess.run: remember the argv and report a HEAD of
        # forty "b" characters, which differs from the expected SHA below.
        recorded_cmds.append(cmd)
        for flag in (check, capture_output, text):
            assert flag is True
        return subprocess.CompletedProcess(cmd, 0, stdout="b" * 40 + "\n")

    monkeypatch.setattr(cli_main.subprocess, "run", fake_run)

    with pytest.raises(SystemExit) as exc_info:
        cli_main._verify_cpython_source_sha(
            clone_dir,
            version="3.14",
            tag="v3.14.4",
            expected_sha="a" * 40,
        )

    # Exit status, the single git invocation, and the logged error.
    assert exc_info.value.code == 1
    assert recorded_cmds == [["git", "-C", clone_dir, "rev-parse", "HEAD"]]
    assert "source integrity check failed" in caplog.text