Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docs/architecture/YAML-TRUST-BOUNDARY.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# YAML Trust Boundary

`src/mcp_server_python_docs/data/synonyms.yaml` is the project's only packaged
YAML data input. It is shipped inside the wheel and read through
`importlib.resources`; users do not provide YAML at runtime.

The file is parsed only with `yaml.safe_load` in these call sites:

- `src/mcp_server_python_docs/server.py` when the MCP server starts.
- `src/mcp_server_python_docs/ingestion/sphinx_json.py` when ingestion populates
the synonym table.

There are no `yaml.load` or `yaml.unsafe_load` parser call sites in `src/` or
`tests/`. The regression test
`tests/test_synonyms.py::test_yaml_loaded_only_via_safe_load` scans source files
and tests for unsafe YAML loaders, confirms both expected source `safe_load`
call sites, and asserts that `synonyms.yaml` is the only YAML file under
`src/mcp_server_python_docs/`.

Recommended future `SECURITY.md` wording for human review:

> The server parses only one packaged YAML input, `synonyms.yaml`, using
> `yaml.safe_load`; user-supplied YAML is not accepted.
44 changes: 44 additions & 0 deletions tests/test_synonyms.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
- Key concepts are present
"""
import importlib.resources
import re
from pathlib import Path

import yaml

Expand Down Expand Up @@ -76,3 +78,45 @@ def test_importlib_resources_path(self):
assert path.exists(), f"synonyms.yaml not found at {path}"
content = path.read_text()
assert len(content) > 0, "synonyms.yaml is empty"


def test_yaml_loaded_only_via_safe_load():
    """Lock in the packaged-YAML trust boundary for synonyms.yaml.

    Scans every ``*.py`` file under ``src/`` and ``tests/`` line by line and
    fails when any of the following holds:

    - a line references one of the yaml module's non-safe loaders
      (``load`` / ``unsafe_load``, including their ``*_all`` variants);
    - one of the expected ``safe_load`` call sites is missing from ``src/``;
    - ``src/`` contains a YAML file other than the packaged synonyms file.
    """
    repo_root = Path(__file__).resolve().parents[1]
    src_root = repo_root / "src"
    scan_roots = (src_root, repo_root / "tests")
    expected_yaml_input = "src/mcp_server_python_docs/data/synonyms.yaml"
    expected_safe_load_sites = {
        "src/mcp_server_python_docs/server.py",
        "src/mcp_server_python_docs/ingestion/sphinx_json.py",
    }

    # Writing the dot as `[.]` keeps these pattern literals from matching
    # themselves when the tests directory (and so this file) is scanned.
    # The optional `_all` suffix also covers the multi-document variants,
    # which a trailing `\b` directly after `load` would let through.
    unsafe_load_call = re.compile(r"\byaml[.]load(?:_all)?\s*[(]")
    unsafe_loader_name = re.compile(r"\byaml[.]unsafe_load(?:_all)?\b")
    safe_load_call = re.compile(r"\byaml[.]safe_load\s*[(]")

    violations: list[str] = []
    safe_load_sites: set[str] = set()

    for scan_root in scan_roots:
        for source_path in sorted(scan_root.rglob("*.py")):
            relative_path = source_path.relative_to(repo_root).as_posix()
            # Only call sites inside src/ count towards the expected set.
            in_src = source_path.is_relative_to(src_root)
            for line_number, line in enumerate(
                source_path.read_text(encoding="utf-8").splitlines(), 1
            ):
                if unsafe_load_call.search(line) or unsafe_loader_name.search(line):
                    violations.append(f"{relative_path}:{line_number}: unsafe YAML load")
                if in_src and safe_load_call.search(line):
                    safe_load_sites.add(relative_path)

    # Compare suffixes case-insensitively so `.YAML` / `.YML` files cannot
    # slip past the inventory of packaged YAML inputs.
    yaml_inputs = sorted(
        path.relative_to(repo_root).as_posix()
        for path in src_root.rglob("*")
        if path.suffix.lower() in {".yaml", ".yml"}
    )

    assert violations == []
    missing_sites = expected_safe_load_sites - safe_load_sites
    assert not missing_sites, f"expected safe_load call sites not found: {sorted(missing_sites)}"
    assert yaml_inputs == [expected_yaml_input]