spackle0 · spackle0 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/.DS_Store b/.DS_Store
diff --git a/.env.template b/.env.template
@@ -0,0 +1,5 @@
+# Local port forwarded
+PROMETHEUS_URL="http://localhost:9090"
+MCP_SERVER_URL="http://localhost:8000/mcp"
+# Default
+#OLLAMA_HOST="http://localhost:11434"
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -16,15 +16,28 @@ Project context for Claude Code. Updated as the project evolves.
 services/
   mcp_k8s_server/
     app/
-      server.py       # FastMCP server — defines @mcp.tool() functions
-      k8s_client.py   # Thin wrapper around the kubernetes Python client
+      server.py             # FastMCP server — defines @mcp.tool() functions
+      k8s_client.py         # Thin wrapper around the kubernetes Python client
+      prometheus_client.py  # Prometheus HTTP API client (PromQL queries)
+    tests/
+      test_smoke.py   # Smoke tests with FakeK8sClient
   agent_chatbot/
     app/
       agent.py        # Interactive LLM chatbot with agentic tool loop
+    tests/
+      .gitkeep        # Placeholder so git tracks the empty directory
 deploy/
   rbac-readonly.yaml  # K8s RBAC for in-cluster service account
 docker/
   mcp-server.Dockerfile
+  agent.Dockerfile
+scripts/
+  run_tests.sh        # pytest runner; treats exit codes 4/5 as success
+.github/
+  workflows/
+    ci.yaml           # CI: runs tests on push/PR to main
+cluster.yaml          # k3d cluster definition (name: k8s-agent, 1 server, 2 agents)
+docker-compose.yaml
 pyproject.toml        # uv-managed dependencies
 ```
 
@@ -71,9 +84,9 @@ Defined in `server.py`, implemented in `k8s_client.py`:
 | Tool | Signature | Returns |
 |---|---|---|
 | `list_namespaces` | `() -> list[str]` | Namespace name strings |
-| `list_pods` | `(namespace: str) -> list[str]` | Pod name strings in that namespace |
-
-**Planned**: Enrich `list_pods` to return status dicts (phase, ready, restart_count, reason) so the LLM can identify CrashLoopBackOff pods. Add `read_pod_log` tool.
+| `list_pods` | `(namespace: str) -> list[dict]` | Pod status dicts (name, phase, ready, restart_count, reason) |
+| `read_pod_log` | `(namespace: str, pod: str, container: str \| None, tail_lines: int) -> str` | Last N lines of pod logs |
+| `query_prometheus` | `(query: str) -> list[dict]` | Instant PromQL query results (metric labels, value, timestamp) |
 
 ---
 
@@ -100,15 +113,14 @@ def list_pods(namespace: str) -> list[dict]:
 ```
 
 ### Tool Return Types
-MCP tools must return JSON-serializable types. The kubernetes Python client returns `V1Pod` and similar objects that **cannot** be serialized — always extract fields explicitly in `server.py`:
+MCP tools must return JSON-serializable types. The kubernetes Python client returns `V1Pod` and similar objects that **cannot** be serialized — always extract fields explicitly into plain dicts or strings. In this project, `k8s_client.py` handles extraction so `server.py` tools can return its output directly:
 
 ```python
-# Wrong — V1Pod is not serializable
+# k8s_client.py extracts fields into a plain dict — safe to return from MCP tool
 return k8s_client.list_pods(namespace)
 
-# Correct — extract what you need
-pods = k8s_client.list_pods(namespace)
-return [p.metadata.name for p in pods]
+# Never return raw kubernetes client objects from a tool
+return core_api.list_namespaced_pod(namespace=namespace)  # Wrong — not serializable
 ```
 
 ### Tool Design Philosophy
@@ -123,8 +135,8 @@ return [p.metadata.name for p in pods]
 ## Workflow Preferences
 
 - Do not commit by default. Make code changes and stop. The user reviews `git diff` before deciding to commit. Only commit when explicitly asked.
-- Active branch: `mcp_enhancement`. If the branch does not exist, check for the current branch and switch to it, then update this file.
-- The worktree `claude/gallant-turing` should be kept in sync with `mcp_enhancement` when resuming sessions (`git reset --hard <sha>`).
+- Active branch: `mcp_even_more_tools`. If the branch does not exist, check for the current branch and switch to it, then update this file.
+- The worktree `claude/gallant-turing` should be kept in sync with `mcp_even_more_tools` when resuming sessions (`git reset --hard <sha>`).
 - When committing, always use the `Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>` trailer.
 - The user uses PyCharm (`.idea/` present) and ruff for linting.
 - Avoid the use of emojis and em-dashes in any verbiage or documentation created
@@ -134,27 +146,64 @@ return [p.metadata.name for p in pods]
 
 ## Planned Features
 
-1. **Enrich `list_pods`** with status fields (phase, ready, restart_count, reason)
-2. **`read_pod_log` tool** — already in `k8s_client.py`, needs MCP tool wrapper
-3. **`get_events` tool** — Kubernetes events are often the first place to look when troubleshooting
-4. **FastAPI alerting webhook** — stateless endpoint that accepts alert payloads (Prometheus/Alertmanager format), runs the agent, returns structured diagnosis. Persistent MCP client via FastAPI lifespan, asyncio.Lock for concurrent request safety.
-5. **Agentic loop safety** — consider a max iterations guard on the `while True` loop in `run_turn()`
+1. **`get_events` tool** — Kubernetes events are often the first place to look when troubleshooting
+2. **FastAPI alerting webhook** — stateless endpoint that accepts alert payloads (Prometheus/Alertmanager format), runs the agent, returns structured diagnosis. Persistent MCP client via FastAPI lifespan, asyncio.Lock for concurrent request safety.
+3. **Agentic loop safety** — consider a max iterations guard on the `while True` loop in `run_turn()`
+
+---
+
+## Environment Variables
+
+Configuration is managed via a `.env` file that is not committed to version control.
+
+- `.env.template` — committed, contains all variables with safe defaults and masked secrets
+- `.env` — local only, listed in `.gitignore`, created by copying the template
+
+**Convention**: whenever a new env var is added, update `.env.template` with a safe default or masked placeholder (e.g. `API_KEY="your-api-key-here"`). Never put real credentials in `.env.template`.
+
+| Variable | Default | Used by |
+|---|---|---|
+| `PROMETHEUS_URL` | `http://localhost:9090` | `prometheus_client.py` |
+| `MCP_SERVER_URL` | `http://localhost:8000/mcp` | `agent.py` |
+| `OLLAMA_HOST` | `http://localhost:11434` | ollama client (auto-detected) |
 
 ---
 
 ## Local Development
 
-Start the k3d cluster before running either service:
+Copy the env template before first run:
 ```bash
-k3d cluster start
+cp .env.template .env
+```
+
+Create and start the k3d cluster (required for K8s tools):
+```bash
+make cluster-create   # first time only
+make cluster-start
+```
+
+Start Ollama and pull the model (required for the agent):
+```bash
+ollama serve          # starts the Ollama server on localhost:11434; runs in the foreground, so run the next command from a second terminal
+ollama pull llama3.1:8b
+```
+
+Or use the Makefile target which does both:
+```bash
+make ollama
 ```
 
 Run the MCP server:
 ```bash
 uv run python -m services.mcp_k8s_server.app.server
 ```
 
-Run the agent chatbot:
+Run the agent chatbot (requires MCP server already running):
 ```bash
 uv run python -m services.agent_chatbot.app.agent
 ```
+
+Or start both together with:
+```bash
+make start
+```
diff --git a/Makefile b/Makefile
@@ -1,11 +1,14 @@
 VERSION := 0.1.0
 
-.PHONY: help ollama server agent start lint format test compose-agent compose-up pre-commit-enable pre-commit-disable
+.PHONY: help ollama server agent start lint format test compose-agent compose-up pre-commit-enable pre-commit-disable cluster-create cluster-start cluster-stop blackbox-install blackbox-uninstall
 
 help:
 	@printf "k8s-agent-mcp Makefile help\n\n"
 	@printf "Usage: make <target>\n\n"
 	@printf "Common targets (run 'make <target>'):\n"
+	@printf "  cluster-create - Create the k3d cluster from cluster.yaml (first time only).\n"
+	@printf "  cluster-start  - Start the k3d cluster.\n"
+	@printf "  cluster-stop   - Stop the k3d cluster.\n"
 	@printf "  ollama        - Pulls the LLM model (ollama) and starts the Ollama server.\n"
 	@printf "  server        - Starts the MCP k8s server (FastMCP) in the foreground.\n"
 	@printf "  agent         - Runs the interactive agent chatbot locally (requires server).\n"
@@ -16,7 +19,9 @@ help:
 	@printf "  format             - Run ruff to autoformat code.\n"
 	@printf "  test               - Run the project's pytest test suite for services.\n"
 	@printf "  pre-commit-enable  - Install pre-commit hooks into .git/hooks.\n"
-	@printf "  pre-commit-disable - Remove pre-commit hooks from .git/hooks.\n\n"
+	@printf "  pre-commit-disable - Remove pre-commit hooks from .git/hooks.\n"
+	@printf "  blackbox-install   - Install blackbox exporter and Probe CR for example-web latency monitoring.\n"
+	@printf "  blackbox-uninstall - Remove blackbox exporter and Probe CR.\n\n"
 	@printf "Notes:\n"
 	@printf "  - 'make' with no args shows this help (default).\n"
 	@printf "  - Use 'make compose-agent' to run the agent interactively inside Docker.\n"
@@ -25,6 +30,18 @@ help:
 
 ## ── Local development ────────────────────────────────────────────────────────
 
+# Create the k3d cluster from cluster.yaml (first time only).
+cluster-create:
+	k3d cluster create --config cluster.yaml
+
+# Start the k3d cluster.
+cluster-start:
+	k3d cluster start k8s-agent
+
+# Stop the k3d cluster.
+cluster-stop:
+	k3d cluster stop k8s-agent
+
 # Pull the LLM model if not already cached, then start the Ollama server.
 # Run this in a separate terminal before starting the agent.
 ollama:
@@ -46,6 +63,8 @@ start:
 	@uv run python -m services.mcp_k8s_server.app.server & \
 	SERVER_PID=$$!; \
 	trap "kill $$SERVER_PID" EXIT; \
+	echo "Waiting for MCP server..."; \
+	until curl -so /dev/null -H "Accept: text/event-stream" http://localhost:8000/mcp 2>/dev/null; do sleep 0.3; done; \
 	uv run python -m services.agent_chatbot.app.agent
 
 # Convenience: run the agent via docker-compose interactively
@@ -57,6 +76,22 @@ compose-agent:
 compose-up:
 	docker compose up -d mcp-server agent
 
+## ── Blackbox exporter ────────────────────────────────────────────────────────
+
+# Install prometheus-blackbox-exporter into the default namespace.
+# fullnameOverride keeps the service name short: blackbox-exporter.default.svc
+blackbox-install:
+	helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+	helm repo update
+	helm upgrade --install blackbox-exporter prometheus-community/prometheus-blackbox-exporter \
+		--namespace default \
+		--set fullnameOverride=blackbox-exporter
+	kubectl apply -f deploy/workloads/blackbox-probe.yaml
+
+blackbox-uninstall:
+	kubectl delete -f deploy/workloads/blackbox-probe.yaml --ignore-not-found
+	helm uninstall blackbox-exporter --namespace default
+
 ## ── Pre-commit ───────────────────────────────────────────────────────────────
 
 # Install pre-commit hooks so they run automatically on every commit.

diff --git a/README.md b/README.md
@@ -6,13 +6,29 @@
 
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
 
-[![CI](https://github.com/spackle0/k8s-agent-mcp/actions/workflows/docker-build-test.yaml/badge.svg)](https://github.com/spackle0/k8s-agent-mcp/actions/workflows/ci.yml)
+[![CI](https://github.com/spackle0/k8s-agent-mcp/actions/workflows/ci.yaml/badge.svg)](https://github.com/spackle0/k8s-agent-mcp/actions/workflows/ci.yaml)
 
 [![codecov](https://codecov.io/gh/spackle0/k8s-agent-mcp/graph/badge.svg?token=YJVD7W9Q37)](https://codecov.io/gh/spackle0/k8s-agent-mcp)
 
 
 An experiment in Agentic AI with a Kubernetes slant
 
+## Setup
+
+Copy the environment template before running anything locally:
+
+```bash
+cp .env.template .env
+```
+
+Edit `.env` to match your local environment. The file is listed in `.gitignore` and will not be committed. See `.env.template` for all available variables and their defaults.
+
+Create the k3d cluster (first time only):
+
+```bash
+k3d cluster create --config cluster.yaml
+```
+
 ## Docker Compose (interactive agent)
 
 The `agent` service is interactive — the container keeps STDIN open and allocates a TTY so you can type directly into the running Python process.

diff --git a/cluster.yaml b/cluster.yaml
@@ -0,0 +1,7 @@
+apiVersion: k3d.io/v1alpha5
+kind: Simple
+metadata:
+  name: k8s-agent
+servers: 1
+agents: 2
+image: rancher/k3s:v1.33.3-k3s1
diff --git a/deploy/.DS_Store b/deploy/.DS_Store
diff --git a/deploy/chaosmesh/chaosmesg-pod-failure-5m.yaml b/deploy/chaosmesh/chaosmesg-pod-failure-5m.yaml
@@ -0,0 +1,14 @@
+kind: PodChaos
+apiVersion: chaos-mesh.org/v1alpha1
+metadata:
+  namespace: default
+  name: pod-test-1
+spec:
+  selector:
+    namespaces:
+      - default
+    labelSelectors:
+      app: example-web
+  mode: all
+  action: pod-failure
+  duration: 5m
diff --git a/deploy/chaosmesh/chaosmesh-network-delay.yaml b/deploy/chaosmesh/chaosmesh-network-delay.yaml
@@ -0,0 +1,15 @@
+apiVersion: chaos-mesh.org/v1alpha1
+kind: NetworkChaos
+metadata:
+  name: network-delay-example
+  namespace: default
+spec:
+  action: delay
+  mode: all
+  selector:
+    labelSelectors:
+      app: example-web
+  delay:
+    latency: "200ms"
+    jitter: "50ms"
+  duration: "15m"
diff --git a/deploy/chaosmesh/chaosmesh-pod-cpu-stress.yaml b/deploy/chaosmesh/chaosmesh-pod-cpu-stress.yaml
@@ -0,0 +1,15 @@
+apiVersion: chaos-mesh.org/v1alpha1
+kind: StressChaos
+metadata:
+  name: cpu-stress-example
+  namespace: default
+spec:
+  mode: one
+  selector:
+    labelSelectors:
+      app: example-web
+  stressors:
+    cpu:
+      workers: 2
+      load: 80
+  duration: "30s"
diff --git a/deploy/chaosmesh/chaosmesh-pod-kill-cron.yaml b/deploy/chaosmesh/chaosmesh-pod-kill-cron.yaml
@@ -0,0 +1,14 @@
+apiVersion: chaos-mesh.org/v1alpha1
+kind: Schedule
+metadata:
+  name: pod-kill-example
+  namespace: default
+spec:
+  schedule: "*/2 * * * *"
+  type: PodChaos
+  podChaos:
+    action: pod-kill
+    mode: one
+    selector:
+      labelSelectors:
+        app: example-web
diff --git a/deploy/chaosmesh/chaosmesh-rbac.yaml b/deploy/chaosmesh/chaosmesh-rbac.yaml
@@ -0,0 +1,32 @@
+kind: ServiceAccount
+apiVersion: v1
+metadata:
+  namespace: default
+  name: chaos-mesh-dashboard
+
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: chaos-mesh-dashboard
+rules:
+- apiGroups: [""]
+  resources: ["pods", "namespaces", "nodes", "events"]
+  verbs: ["get", "watch", "list"]
+- apiGroups: ["chaos-mesh.org"]
+  resources: ["*"]
+  verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: chaos-mesh-dashboard
+subjects:
+- kind: ServiceAccount
+  name: chaos-mesh-dashboard
+  namespace: default
+roleRef:
+  kind: ClusterRole
+  name: chaos-mesh-dashboard
+  apiGroup: rbac.authorization.k8s.io