diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..d040911 Binary files /dev/null and b/.DS_Store differ diff --git a/.env.template b/.env.template new file mode 100644 index 0000000..6991075 --- /dev/null +++ b/.env.template @@ -0,0 +1,5 @@ +# Local port forwarded +PROMETHEUS_URL="http://localhost:9090" +MCP_SERVER_URL="http://localhost:8000/mcp" +# Default +#OLLAMA_HOST="http://localhost:11434" diff --git a/CLAUDE.md b/CLAUDE.md index 57fdb15..c8dd90c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -16,15 +16,28 @@ Project context for Claude Code. Updated as the project evolves. services/ mcp_k8s_server/ app/ - server.py # FastMCP server — defines @mcp.tool() functions - k8s_client.py # Thin wrapper around the kubernetes Python client + server.py # FastMCP server — defines @mcp.tool() functions + k8s_client.py # Thin wrapper around the kubernetes Python client + prometheus_client.py # Prometheus HTTP API client (PromQL queries) + tests/ + test_smoke.py # Smoke tests with FakeK8sClient agent_chatbot/ app/ agent.py # Interactive LLM chatbot with agentic tool loop + tests/ + .gitkeep # Placeholder so git tracks the empty directory deploy/ rbac-readonly.yaml # K8s RBAC for in-cluster service account docker/ mcp-server.Dockerfile + agent.Dockerfile +scripts/ + run_tests.sh # pytest runner; treats exit codes 4/5 as success +.github/ + workflows/ + ci.yaml # CI: runs tests on push/PR to main +cluster.yaml # k3d cluster definition (name: k8s-agent, 1 server, 2 agents) +docker-compose.yaml pyproject.toml # uv-managed dependencies ``` @@ -71,9 +84,9 @@ Defined in `server.py`, implemented in `k8s_client.py`: | Tool | Signature | Returns | |---|---|---| | `list_namespaces` | `() -> list[str]` | Namespace name strings | -| `list_pods` | `(namespace: str) -> list[str]` | Pod name strings in that namespace | - -**Planned**: Enrich `list_pods` to return status dicts (phase, ready, restart_count, reason) so the LLM can identify CrashLoopBackOff pods. 
Add `read_pod_log` tool. +| `list_pods` | `(namespace: str) -> list[dict]` | Pod status dicts (name, phase, ready, restart_count, reason) | +| `read_pod_log` | `(namespace: str, pod: str, container: str \| None, tail_lines: int) -> str` | Last N lines of pod logs | +| `query_prometheus` | `(query: str) -> list[dict]` | Instant PromQL query results (metric labels, value, timestamp) | --- @@ -100,15 +113,14 @@ def list_pods(namespace: str) -> list[dict]: ``` ### Tool Return Types -MCP tools must return JSON-serializable types. The kubernetes Python client returns `V1Pod` and similar objects that **cannot** be serialized — always extract fields explicitly in `server.py`: +MCP tools must return JSON-serializable types. The kubernetes Python client returns `V1Pod` and similar objects that **cannot** be serialized — always extract fields explicitly into plain dicts or strings. In this project, `k8s_client.py` handles extraction so `server.py` tools can return its output directly: ```python -# Wrong — V1Pod is not serializable +# k8s_client.py extracts fields into a plain dict — safe to return from MCP tool return k8s_client.list_pods(namespace) -# Correct — extract what you need -pods = k8s_client.list_pods(namespace) -return [p.metadata.name for p in pods] +# Never return raw kubernetes client objects from a tool +return core_api.list_namespaced_pod(namespace=namespace) # Wrong — not serializable ``` ### Tool Design Philosophy @@ -123,8 +135,8 @@ return [p.metadata.name for p in pods] ## Workflow Preferences - Do not commit by default. Make code changes and stop. The user reviews `git diff` before deciding to commit. Only commit when explicitly asked. -- Active branch: `mcp_enhancement`. If the branch does not exist, check for the current branch and switch to it, then update this file. -- The worktree `claude/gallant-turing` should be kept in sync with `mcp_enhancement` when resuming sessions (`git reset --hard `). +- Active branch: `mcp_even_more_tools`. 
If the branch does not exist, check for the current branch and switch to it, then update this file. +- The worktree `claude/gallant-turing` should be kept in sync with `mcp_even_more_tools` when resuming sessions (`git reset --hard `). - When committing, always use the `Co-Authored-By: Claude Sonnet 4.6 ` trailer. - The user uses PyCharm (`.idea/` present) and ruff for linting. - Avoid the use of emojis and em-dashes in any verbiage or documentation created @@ -134,19 +146,51 @@ return [p.metadata.name for p in pods] ## Planned Features -1. **Enrich `list_pods`** with status fields (phase, ready, restart_count, reason) -2. **`read_pod_log` tool** — already in `k8s_client.py`, needs MCP tool wrapper -3. **`get_events` tool** — Kubernetes events are often the first place to look when troubleshooting -4. **FastAPI alerting webhook** — stateless endpoint that accepts alert payloads (Prometheus/Alertmanager format), runs the agent, returns structured diagnosis. Persistent MCP client via FastAPI lifespan, asyncio.Lock for concurrent request safety. -5. **Agentic loop safety** — consider a max iterations guard on the `while True` loop in `run_turn()` +1. **`get_events` tool** — Kubernetes events are often the first place to look when troubleshooting +2. **FastAPI alerting webhook** — stateless endpoint that accepts alert payloads (Prometheus/Alertmanager format), runs the agent, returns structured diagnosis. Persistent MCP client via FastAPI lifespan, asyncio.Lock for concurrent request safety. +3. **Agentic loop safety** — consider a max iterations guard on the `while True` loop in `run_turn()` + +--- + +## Environment Variables + +Configuration is managed via a `.env` file that is not committed to version control. 
+ +- `.env.template` — committed, contains all variables with safe defaults and masked secrets +- `.env` — local only, listed in `.gitignore`, created by copying the template + +**Convention**: whenever a new env var is added, update `.env.template` with a safe default or masked placeholder (e.g. `API_KEY="your-api-key-here"`). Never put real credentials in `.env.template`. + +| Variable | Default | Used by | +|---|---|---| +| `PROMETHEUS_URL` | `http://localhost:9090` | `prometheus_client.py` | +| `MCP_SERVER_URL` | `http://localhost:8000/mcp` | `agent.py` | +| `OLLAMA_HOST` | `http://localhost:11434` | ollama client (auto-detected) | --- ## Local Development -Start the k3d cluster before running either service: +Copy the env template before first run: ```bash -k3d cluster start +cp .env.template .env +``` + +Create and start the k3d cluster (required for K8s tools): +```bash +make cluster-create # first time only +make cluster-start +``` + +Start Ollama and pull the model (required for the agent): +```bash +ollama serve # starts the Ollama server on localhost:11434 +ollama pull llama3.1:8b +``` + +Or use the Makefile target which does both: +```bash +make ollama ``` Run the MCP server: @@ -154,7 +198,12 @@ Run the MCP server: uv run python -m services.mcp_k8s_server.app.server ``` -Run the agent chatbot: +Run the agent chatbot (requires MCP server already running): ```bash uv run python -m services.agent_chatbot.app.agent ``` + +Or start both together with: +```bash +make start +``` diff --git a/Makefile b/Makefile index b26e362..a7df61e 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,14 @@ VERSION := 0.1.0 -.PHONY: help ollama server agent start lint format test compose-agent compose-up pre-commit-enable pre-commit-disable +.PHONY: help ollama server agent start lint format test compose-agent compose-up pre-commit-enable pre-commit-disable cluster-create cluster-start cluster-stop blackbox-install blackbox-uninstall help: @printf "k8s-agent-mcp Makefile 
help\n\n" @printf "Usage: make \n\n" @printf "Common targets (run 'make '):\n" + @printf " cluster-create - Create the k3d cluster from cluster.yaml (first time only).\n" + @printf " cluster-start - Start the k3d cluster.\n" + @printf " cluster-stop - Stop the k3d cluster.\n" @printf " ollama - Pulls the LLM model (ollama) and starts the Ollama server.\n" @printf " server - Starts the MCP k8s server (FastMCP) in the foreground.\n" @printf " agent - Runs the interactive agent chatbot locally (requires server).\n" @@ -16,7 +19,9 @@ help: @printf " format - Run ruff to autoformat code.\n" @printf " test - Run the project's pytest test suite for services.\n" @printf " pre-commit-enable - Install pre-commit hooks into .git/hooks.\n" - @printf " pre-commit-disable - Remove pre-commit hooks from .git/hooks.\n\n" + @printf " pre-commit-disable - Remove pre-commit hooks from .git/hooks.\n" + @printf " blackbox-install - Install blackbox exporter and Probe CR for example-web latency monitoring.\n" + @printf " blackbox-uninstall - Remove blackbox exporter and Probe CR.\n\n" @printf "Notes:\n" @printf " - 'make' with no args shows this help (default).\n" @printf " - Use 'make compose-agent' to run the agent interactively inside Docker.\n" @@ -25,6 +30,18 @@ help: ## ── Local development ──────────────────────────────────────────────────────── +# Create the k3d cluster from cluster.yaml (first time only). +cluster-create: + k3d cluster create --config cluster.yaml + +# Start the k3d cluster. +cluster-start: + k3d cluster start k8s-agent + +# Stop the k3d cluster. +cluster-stop: + k3d cluster stop k8s-agent + # Pull the LLM model if not already cached, then start the Ollama server. # Run this in a separate terminal before starting the agent. 
ollama: @@ -46,6 +63,8 @@ start: @uv run python -m services.mcp_k8s_server.app.server & \ SERVER_PID=$$!; \ trap "kill $$SERVER_PID" EXIT; \ + echo "Waiting for MCP server..."; \ + until curl -so /dev/null -H "Accept: text/event-stream" http://localhost:8000/mcp 2>/dev/null; do sleep 0.3; done; \ uv run python -m services.agent_chatbot.app.agent # Convenience: run the agent via docker-compose interactively @@ -57,6 +76,22 @@ compose-agent: compose-up: docker compose up -d mcp-server agent +## ── Blackbox exporter ──────────────────────────────────────────────────────── + +# Install prometheus-blackbox-exporter into the default namespace. +# fullnameOverride keeps the service name short: blackbox-exporter.default.svc +blackbox-install: + helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + helm repo update + helm upgrade --install blackbox-exporter prometheus-community/prometheus-blackbox-exporter \ + --namespace default \ + --set fullnameOverride=blackbox-exporter + kubectl apply -f deploy/workloads/blackbox-probe.yaml + +blackbox-uninstall: + kubectl delete -f deploy/workloads/blackbox-probe.yaml --ignore-not-found + helm uninstall blackbox-exporter --namespace default + ## ── Pre-commit ─────────────────────────────────────────────────────────────── # Install pre-commit hooks so they run automatically on every commit. 
diff --git a/README.md b/README.md index e32a688..c203e17 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,29 @@ [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) -[![CI](https://github.com/spackle0/k8s-agent-mcp/actions/workflows/docker-build-test.yaml/badge.svg)](https://github.com/spackle0/k8s-agent-mcp/actions/workflows/ci.yml) +[![CI](https://github.com/spackle0/k8s-agent-mcp/actions/workflows/ci.yaml/badge.svg)](https://github.com/spackle0/k8s-agent-mcp/actions/workflows/ci.yaml) [![codecov](https://codecov.io/gh/spackle0/k8s-agent-mcp/graph/badge.svg?token=YJVD7W9Q37)](https://codecov.io/gh/spackle0/k8s-agent-mcp) An experiment in Agentic AI with a Kubernetes slant +## Setup + +Copy the environment template before running anything locally: + +```bash +cp .env.template .env +``` + +Edit `.env` to match your local environment. The file is listed in `.gitignore` and will not be committed. See `.env.template` for all available variables and their defaults. + +Create the k3d cluster (first time only): + +```bash +k3d cluster create --config cluster.yaml +``` + ## Docker Compose (interactive agent) The `agent` service is interactive — the container keeps STDIN open and allocates a TTY so you can type directly into the running Python process. 
diff --git a/cluster.yaml b/cluster.yaml new file mode 100644 index 0000000..fdda4fd --- /dev/null +++ b/cluster.yaml @@ -0,0 +1,7 @@ +apiVersion: k3d.io/v1alpha5 +kind: Simple +metadata: + name: k8s-agent +servers: 1 +agents: 2 +image: rancher/k3s:v1.33.3-k3s1 \ No newline at end of file diff --git a/deploy/.DS_Store b/deploy/.DS_Store new file mode 100644 index 0000000..596e7c2 Binary files /dev/null and b/deploy/.DS_Store differ diff --git a/deploy/chaosmesh/chaosmesg-pod-failure-5m.yaml b/deploy/chaosmesh/chaosmesg-pod-failure-5m.yaml new file mode 100644 index 0000000..7892cf2 --- /dev/null +++ b/deploy/chaosmesh/chaosmesg-pod-failure-5m.yaml @@ -0,0 +1,14 @@ +kind: PodChaos +apiVersion: chaos-mesh.org/v1alpha1 +metadata: + namespace: default + name: pod-test-1 +spec: + selector: + namespaces: + - default + labelSelectors: + app: example-web + mode: all + action: pod-failure + duration: 5m diff --git a/deploy/chaosmesh/chaosmesh-network-delay.yaml b/deploy/chaosmesh/chaosmesh-network-delay.yaml new file mode 100644 index 0000000..74f4296 --- /dev/null +++ b/deploy/chaosmesh/chaosmesh-network-delay.yaml @@ -0,0 +1,15 @@ +apiVersion: chaos-mesh.org/v1alpha1 +kind: NetworkChaos +metadata: + name: network-delay-example + namespace: default +spec: + action: delay + mode: all + selector: + labelSelectors: + app: example-web + delay: + latency: "200ms" + jitter: "50ms" + duration: "15m" \ No newline at end of file diff --git a/deploy/chaosmesh/chaosmesh-pod-cpu-stress.yaml b/deploy/chaosmesh/chaosmesh-pod-cpu-stress.yaml new file mode 100644 index 0000000..4495667 --- /dev/null +++ b/deploy/chaosmesh/chaosmesh-pod-cpu-stress.yaml @@ -0,0 +1,15 @@ +apiVersion: chaos-mesh.org/v1alpha1 +kind: StressChaos +metadata: + name: cpu-stress-example + namespace: default +spec: + mode: one + selector: + labelSelectors: + app: example-web + stressors: + cpu: + workers: 2 + load: 80 + duration: "30s" \ No newline at end of file diff --git 
a/deploy/chaosmesh/chaosmesh-pod-kill-cron.yaml b/deploy/chaosmesh/chaosmesh-pod-kill-cron.yaml new file mode 100644 index 0000000..3394c07 --- /dev/null +++ b/deploy/chaosmesh/chaosmesh-pod-kill-cron.yaml @@ -0,0 +1,14 @@ +apiVersion: chaos-mesh.org/v1alpha1 +kind: Schedule +metadata: + name: pod-kill-example + namespace: default +spec: + schedule: "*/2 * * * *" + type: PodChaos + podChaos: + action: pod-kill + mode: one + selector: + labelSelectors: + app: example-web \ No newline at end of file diff --git a/deploy/chaosmesh/chaosmesh-rbac.yaml b/deploy/chaosmesh/chaosmesh-rbac.yaml new file mode 100644 index 0000000..0732889 --- /dev/null +++ b/deploy/chaosmesh/chaosmesh-rbac.yaml @@ -0,0 +1,32 @@ +kind: ServiceAccount +apiVersion: v1 +metadata: + namespace: default + name: chaos-mesh-dashboard + +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: chaos-mesh-dashboard +rules: +- apiGroups: [""] + resources: ["pods", "namespaces", "nodes", "events"] + verbs: ["get", "watch", "list"] +- apiGroups: ["chaos-mesh.org"] + resources: ["*"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: chaos-mesh-dashboard +subjects: +- kind: ServiceAccount + name: chaos-mesh-dashboard + namespace: default +roleRef: + kind: ClusterRole + name: chaos-mesh-dashboard + apiGroup: rbac.authorization.k8s.io diff --git a/deploy/workloads/blackbox-probe.yaml b/deploy/workloads/blackbox-probe.yaml new file mode 100644 index 0000000..e621536 --- /dev/null +++ b/deploy/workloads/blackbox-probe.yaml @@ -0,0 +1,35 @@ +# blackbox-probe.yaml +# Probe CR tells Prometheus Operator to scrape blackbox exporter results for example-web. +# Requires blackbox exporter installed via Helm (see Makefile: make blackbox-install). 
+# +# Key metrics exposed after this is applied: +# probe_success - 1 if HTTP 2xx, 0 if not +# probe_duration_seconds - full round-trip time including any injected latency +# probe_http_status_code - HTTP response code +# +# To watch latency during a Chaos Mesh NetworkChaos experiment: +# kubectl -n default get probe example-web +# PromQL: probe_duration_seconds{job="example-web-blackbox"} +apiVersion: monitoring.coreos.com/v1 +kind: Probe +metadata: + name: example-web-blackbox + namespace: default + labels: + app: example-web + release: prometheus +spec: + jobName: example-web-blackbox + interval: 5s + module: http_2xx + prober: + url: blackbox-exporter.default.svc.cluster.local:9115 + scheme: http + path: /probe + targets: + staticConfig: + static: + - http://example-web.default.svc.cluster.local:8080 + labels: + app: example-web + namespace: default diff --git a/deploy/workloads/test-workload.yaml b/deploy/workloads/test-workload.yaml new file mode 100644 index 0000000..4adf9aa --- /dev/null +++ b/deploy/workloads/test-workload.yaml @@ -0,0 +1,69 @@ +# test-workload.yaml +# Sets up a workload to be monitored by Prometheus +apiVersion: apps/v1 +kind: Deployment +metadata: + name: example-web + namespace: default + labels: + app: example-web +spec: + replicas: 2 + selector: + matchLabels: + app: example-web + template: + metadata: + labels: + app: example-web + spec: + containers: + - name: web + image: quay.io/brancz/prometheus-example-app:v0.5.0 + ports: + - containerPort: 8080 + name: http + resources: + requests: + cpu: 50m + memory: 32Mi + limits: + cpu: 100m + memory: 64Mi +--- +apiVersion: v1 +kind: Service +metadata: + name: example-web + namespace: default + labels: + app: example-web +spec: + selector: + app: example-web + ports: + - port: 8080 + targetPort: http + name: http +--- +# kubectl api-resources --api-group=monitoring.coreos.com +# Installed by prometheus helm chart +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: 
example-web + namespace: default + labels: + app: example-web + release: prometheus # matches the helm release name "helm list -A" +spec: + selector: + matchLabels: + app: example-web + endpoints: + - port: http + # Careful how short you make this to avoid network saturation + # And for any rates in prometheus, be sure to have at least 2x this value + # since rate needs at least two data points + interval: 5s + path: /metrics diff --git a/pyproject.toml b/pyproject.toml index 4a34372..28e1c0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ test = [ # ruff configuration [tool.ruff] show-fixes = true -select = [ +lint.select = [ "E", # pycodestyle errors "W", # pycodestyle warnings "F", # pyflakes diff --git a/services/agent_chatbot/app/agent.py b/services/agent_chatbot/app/agent.py index 8d188e9..e861cf6 100644 --- a/services/agent_chatbot/app/agent.py +++ b/services/agent_chatbot/app/agent.py @@ -19,6 +19,7 @@ import os from fastmcp import Client +from fastmcp.exceptions import ToolError from typing import Any @@ -67,8 +68,7 @@ def format_tools_for_log(tools: dict) -> str: for name, tool in tools.items(): # Summarize each parameter as "name: type (required)" or "name: type". params = ", ".join( - f"{k}: {v.get('type', '?')}" - + (" (required)" if k in tool.inputSchema.get("required", []) else "") + f"{k}: {v.get('type', '?')}" + (" (required)" if k in tool.inputSchema.get("required", []) else "") for k, v in tool.inputSchema.get("properties", {}).items() ) lines.append(f" {name}({params})") @@ -88,8 +88,11 @@ async def call_tool(client: Client, tool_name: str, tool_arguments: dict[str, An connection. Extracts the plain text from the first content item so the LLM receives a clean string rather than a raw result object. 
""" - result = await client.call_tool(tool_name, tool_arguments) - return result.content[0].text if result.content else "" + try: + result = await client.call_tool(tool_name, tool_arguments) + return result.content[0].text if result.content else "" + except ToolError as e: + return f"Tool error: {e}" async def get_tools(client: Client) -> dict: @@ -174,8 +177,11 @@ async def main(): print("Goodbye.") break - answer = await run_turn(client, available_tools, ollama_tools, messages, user_input) - print(f"Assistant: {answer}\n") + try: + answer = await run_turn(client, available_tools, ollama_tools, messages, user_input) + print(f"Assistant: {answer}\n") + except Exception as e: + print(f"Error: {e}\n") if __name__ == "__main__": diff --git a/services/mcp_k8s_server/app/k8s_client.py b/services/mcp_k8s_server/app/k8s_client.py index feefff0..5e427b0 100644 --- a/services/mcp_k8s_server/app/k8s_client.py +++ b/services/mcp_k8s_server/app/k8s_client.py @@ -4,21 +4,13 @@ """ from functools import cache - from kubernetes import client, config class K8sClient: - """Connection manager for Kubernetes API clients. - - Responsible for creating, refreshing, and providing access to the - underlying Kubernetes API client objects (CoreV1Api and AppsV1Api). - """ + """Holds initialized Kubernetes API clients.""" def __init__(self) -> None: - self._init_clients() - - def _init_clients(self) -> None: try: config.load_incluster_config() except config.ConfigException: @@ -35,29 +27,6 @@ def apps(self) -> client.AppsV1Api: """Return the AppsV1Api instance.""" return self._apps_api - def refresh(self) -> None: - """Reload configuration and recreate API clients.""" - self._init_clients() - - def close(self) -> None: - """Attempt to close underlying ApiClient connections if supported. - - This is best-effort; the kubernetes ApiClient exposes a `close()` on - its `api_client` attribute which we call when available. 
- """ - try: - if hasattr(self._core_api, "api_client") and hasattr(self._core_api.api_client, "close"): - self._core_api.api_client.close() - except Exception: - # Best-effort close; swallow exceptions to avoid noisy shutdown - pass - - try: - if hasattr(self._apps_api, "api_client") and hasattr(self._apps_api.api_client, "close"): - self._apps_api.api_client.close() - except Exception: - pass - @cache def get_client() -> K8sClient: @@ -65,26 +34,9 @@ def get_client() -> K8sClient: return K8sClient() -def refresh_client() -> None: - """Refresh the cached client's credentials/configuration.""" - get_client().refresh() - - -def close_client() -> None: - """Close the cached client and clear the cache so a new instance will be created. - - Use this when credentials or network resources change and you want a fresh - client object on next access. - """ - try: - get_client().close() - finally: - # Clear the cached instance so get_client() will construct a new one. - get_client.cache_clear() - - -# Module-level convenience functions (preserve previous public API) def list_namespaces() -> list[str]: + """Return a list of namespaces in a kubernetes cluster. 
+ """ core_api = get_client().core() ns = core_api.list_namespace() return [n.metadata.name for n in ns.items] @@ -106,20 +58,20 @@ def list_pods(namespace: str) -> list[dict]: pods = core_api.list_namespaced_pod(namespace=namespace) result: list[dict] = [] - for p in pods.items: - name = getattr(p.metadata, "name", "") - phase = getattr(p.status, "phase", "Unknown") + for pod in pods.items: + name = getattr(pod.metadata, "name", "") + phase = getattr(pod.status, "phase", "Unknown") # Determine readiness: all container statuses must be ready ready = True restart_count = 0 reason = None - container_statuses = getattr(p.status, "container_statuses", None) or [] - for cs in container_statuses: - ready = ready and bool(getattr(cs, "ready", False)) - restart_count += int(getattr(cs, "restart_count", 0)) + container_statuses = getattr(pod.status, "container_statuses", None) or [] + for status in container_statuses: + ready = ready and bool(getattr(status, "ready", False)) + restart_count += int(getattr(status, "restart_count", 0)) # If a waiting state is present, prefer that reason - state = getattr(cs, "state", None) + state = getattr(status, "state", None) if state: waiting = getattr(state, "waiting", None) if waiting and getattr(waiting, "reason", None): @@ -127,20 +79,27 @@ def list_pods(namespace: str) -> list[dict]: # Fall back to pod-level reason if none found if not reason: - reason = getattr(p.status, "reason", None) - - result.append({ - "name": name, - "phase": phase, - "ready": ready, - "restart_count": restart_count, - "reason": reason, - }) + reason = getattr(pod.status, "reason", None) + + result.append( + { + "name": name, + "phase": phase, + "ready": ready, + "restart_count": restart_count, + "reason": reason, + } + ) return result def read_pod_log(namespace: str, pod: str, container: str | None = None, tail_lines: int = 20) -> str: + """Return the last tail_lines lines of logs for a pod's container. 
+ + If container is None, the pod's default container is used. Returns a plain + string. Returns an empty string if no logs are available. + """ core_api = get_client().core() return core_api.read_namespaced_pod_log( name=pod, diff --git a/services/mcp_k8s_server/app/prometheus_client.py b/services/mcp_k8s_server/app/prometheus_client.py new file mode 100644 index 0000000..33bdf6f --- /dev/null +++ b/services/mcp_k8s_server/app/prometheus_client.py @@ -0,0 +1,50 @@ +""" +Prometheus HTTP API client for instant PromQL queries. +""" + +import os + +import httpx + +PROMETHEUS_URL = os.getenv("PROMETHEUS_URL", "http://localhost:9090") + + +def query(promql: str) -> list[dict]: + """Execute an instant PromQL query and return the result vector. + + Sends a GET request to the Prometheus HTTP API and returns a list of + dicts. Each dict contains the metric labels and the current sample value. + Returns an empty list if no time series match the query. + + Raises httpx.HTTPStatusError on non-2xx responses. + Raises RuntimeError if Prometheus reports a query error. 
+ """ + response = httpx.get( + f"{PROMETHEUS_URL}/api/v1/query", + params={"query": promql}, + timeout=10, + ) + response.raise_for_status() + body = response.json() + + if body["status"] != "success": + raise RuntimeError(f"Prometheus query error: {body.get('error', 'unknown')}") + + result_type = body["data"]["resultType"] + results = body["data"]["result"] + + if result_type == "vector": + return [ + { + "metric": item["metric"], + "value": item["value"][1], + "timestamp": item["value"][0], + } + for item in results + ] + + if result_type == "scalar": + return [{"value": results[1], "timestamp": results[0]}] + + # matrix or string — return raw result and let the LLM interpret it + return results diff --git a/services/mcp_k8s_server/app/server.py b/services/mcp_k8s_server/app/server.py index d9d08f9..7c3ffac 100644 --- a/services/mcp_k8s_server/app/server.py +++ b/services/mcp_k8s_server/app/server.py @@ -5,11 +5,14 @@ # streamable-HTTP and call these tools by name. # # Tools exposed: -# - list_namespaces() → all namespace names in the cluster +# - list_namespaces() → all namespace names in the cluster +# - list_pods() → pod status dicts for a given namespace +# - read_pod_log() → last N lines of a pod's container logs +# - query_prometheus() → instant PromQL query against Prometheus from fastmcp import FastMCP -from services.mcp_k8s_server.app import k8s_client +from services.mcp_k8s_server.app import k8s_client, prometheus_client # Create the FastMCP server instance. The name is metadata clients can read # but does not affect routing or tool resolution. @@ -52,6 +55,27 @@ def read_pod_log(namespace: str, pod: str, container: str | None = None, tail_li return logs or "" +@mcp.tool() +def query_prometheus(query: str) -> list[dict]: + """Execute an instant PromQL query against Prometheus and return matching time series. 
+ + Takes a PromQL query string and returns a list of dicts, each with: + - metric: dict of label key/value pairs identifying the time series + (e.g. {"pod": "my-pod", "namespace": "default", "job": "kubelet"}) + - value: the current sample value as a string (e.g. "1", "0.042") + - timestamp: Unix timestamp of the sample as a float + + Returns an empty list if no time series match the query. + + Example queries: + - up + - container_cpu_usage_seconds_total{namespace="default"} + - kube_pod_container_status_restarts_total > 0 + - rate(http_requests_total[5m]) + """ + return prometheus_client.query(query) + + def main(): # Start the FastMCP server using the streamable-HTTP transport. # By default, this listens on port 8000 at /mcp, matching MCP_SERVER_URL diff --git a/services/mcp_k8s_server/tests/test_smoke.py b/services/mcp_k8s_server/tests/test_smoke.py index a51ba95..bfa38d7 100644 --- a/services/mcp_k8s_server/tests/test_smoke.py +++ b/services/mcp_k8s_server/tests/test_smoke.py @@ -20,13 +20,15 @@ def list_namespaces(): @staticmethod def list_pods(namespace: str): - return [{ - "name": "mypod", - "phase": "Running", - "ready": True, - "restart_count": 0, - "reason": None, - }] + return [ + { + "name": "mypod", + "phase": "Running", + "ready": True, + "restart_count": 0, + "reason": None, + } + ] @staticmethod def read_pod_log(namespace: str, pod: str, container=None, tail_lines=20): @@ -35,7 +37,13 @@ def read_pod_log(namespace: str, pod: str, container=None, tail_lines=20): monkeypatch.setattr(server_app.k8s_client, "get_client", lambda: FakeK8sClient()) monkeypatch.setattr(server_app.k8s_client, "list_namespaces", lambda: FakeK8sClient().list_namespaces()) monkeypatch.setattr(server_app.k8s_client, "list_pods", lambda ns: FakeK8sClient().list_pods(ns)) - monkeypatch.setattr(server_app.k8s_client, "read_pod_log", lambda ns, p, container=None, tail_lines=20: FakeK8sClient().read_pod_log(ns, p, container=container, tail_lines=tail_lines)) + monkeypatch.setattr( 
+ server_app.k8s_client, + "read_pod_log", + lambda ns, p, container=None, tail_lines=20: FakeK8sClient().read_pod_log( + ns, p, container=container, tail_lines=tail_lines + ), + ) def test_list_namespaces():