diff --git a/commands/run/triage.md b/commands/run/triage.md index a95ecc9..6af7030 100644 --- a/commands/run/triage.md +++ b/commands/run/triage.md @@ -46,6 +46,140 @@ migrateProjectState(); " 2>/dev/null || true ``` +**Checkpoint detection — check for resumable progress before stage routing:** + +After loading state and running migration, detect whether a prior pipeline run left +a checkpoint with meaningful progress (beyond triage). If found, present the user +with Resume/Fresh/Skip options before proceeding. + +```bash +# Detect checkpoint with progress beyond triage +CHECKPOINT_DATA=$(node -e " +const { detectCheckpoint, resumeFromCheckpoint } = require('./lib/state.cjs'); +const cp = detectCheckpoint(${ISSUE_NUMBER}); +if (!cp) { + console.log('none'); +} else { + const resume = resumeFromCheckpoint(${ISSUE_NUMBER}); + console.log(JSON.stringify(resume)); +} +" 2>/dev/null || echo "none") +``` + +If checkpoint is found (`CHECKPOINT_DATA !== "none"`): + +Parse the checkpoint data and display to the user: +```bash +CHECKPOINT_STEP=$(echo "$CHECKPOINT_DATA" | node -e " +const d=JSON.parse(require('fs').readFileSync('/dev/stdin','utf-8')); +console.log(d.checkpoint.pipeline_step); +") +RESUME_ACTION=$(echo "$CHECKPOINT_DATA" | node -e " +const d=JSON.parse(require('fs').readFileSync('/dev/stdin','utf-8')); +console.log(d.resumeAction); +") +RESUME_STAGE=$(echo "$CHECKPOINT_DATA" | node -e " +const d=JSON.parse(require('fs').readFileSync('/dev/stdin','utf-8')); +console.log(d.resumeStage); +") +COMPLETED_STEPS=$(echo "$CHECKPOINT_DATA" | node -e " +const d=JSON.parse(require('fs').readFileSync('/dev/stdin','utf-8')); +console.log(d.completedSteps.join(', ')); +") +ARTIFACTS_COUNT=$(echo "$CHECKPOINT_DATA" | node -e " +const d=JSON.parse(require('fs').readFileSync('/dev/stdin','utf-8')); +console.log(d.checkpoint.artifacts.length); +") +STARTED_AT=$(echo "$CHECKPOINT_DATA" | node -e " +const d=JSON.parse(require('fs').readFileSync('/dev/stdin','utf-8')); 
+console.log(d.checkpoint.started_at || 'unknown'); +") +UPDATED_AT=$(echo "$CHECKPOINT_DATA" | node -e " +const d=JSON.parse(require('fs').readFileSync('/dev/stdin','utf-8')); +console.log(d.checkpoint.updated_at || 'unknown'); +") +``` + +Display checkpoint state and prompt user: +``` +AskUserQuestion( + header: "Checkpoint Detected for #${ISSUE_NUMBER}", + question: "A prior pipeline run left progress at step '${CHECKPOINT_STEP}'. + +| | | +|---|---| +| **Last step** | ${CHECKPOINT_STEP} | +| **Completed steps** | ${COMPLETED_STEPS} | +| **Artifacts** | ${ARTIFACTS_COUNT} file(s) | +| **Resume action** | ${RESUME_ACTION} → stage: ${RESUME_STAGE} | +| **Started** | ${STARTED_AT} | +| **Last updated** | ${UPDATED_AT} | + +How would you like to proceed?", + options: [ + { label: "Resume", description: "Resume from checkpoint — skip completed steps (${COMPLETED_STEPS}), jump to ${RESUME_STAGE}" }, + { label: "Fresh", description: "Discard checkpoint and re-run pipeline from scratch" }, + { label: "Skip", description: "Skip this issue entirely" } + ] +) +``` + +Handle user choice: + +| Choice | Action | +|--------|--------| +| **Resume** | Load checkpoint context. Set `pipeline_stage` in state to `${RESUME_STAGE}`. Log: "MGW: Resuming #${ISSUE_NUMBER} from checkpoint (step: ${CHECKPOINT_STEP}, action: ${RESUME_ACTION})." Skip triage/worktree stages that already completed and jump directly to the resume stage in the pipeline. The `resume.context` object carries step-specific data (e.g., `quick_dir`, `plan_num`, `phase_number`) needed by the target stage. | +| **Fresh** | Clear checkpoint via `clearCheckpoint()`. Reset `pipeline_stage` to `"triaged"`. Log: "MGW: Checkpoint cleared for #${ISSUE_NUMBER}. Starting fresh." Continue with normal pipeline flow. | +| **Skip** | Log: "MGW: Skipping #${ISSUE_NUMBER} per user request." STOP pipeline. 
| + +```bash +case "$USER_CHOICE" in + Resume) + # Load resume context and jump to the appropriate stage + node -e " + const fs = require('fs'), path = require('path'); + const activeDir = path.join(process.cwd(), '.mgw', 'active'); + const files = fs.readdirSync(activeDir); + const file = files.find(f => f.startsWith('${ISSUE_NUMBER}-') && f.endsWith('.json')); + const filePath = path.join(activeDir, file); + const state = JSON.parse(fs.readFileSync(filePath, 'utf-8')); + // The pipeline_stage already reflects prior progress — do not overwrite + // unless the resume target is more advanced than current stage + console.log('Resuming from checkpoint: ' + JSON.stringify(state.checkpoint.resume)); + " 2>/dev/null || true + # Set RESUME_MODE=true — downstream stages check this flag to skip completed work + RESUME_MODE=true + RESUME_CONTEXT="${CHECKPOINT_DATA}" + ;; + Fresh) + node -e " + const { clearCheckpoint } = require('./lib/state.cjs'); + clearCheckpoint(${ISSUE_NUMBER}); + console.log('Checkpoint cleared for #${ISSUE_NUMBER}'); + " 2>/dev/null || true + # Reset pipeline_stage to triaged for fresh start + node -e " + const fs = require('fs'), path = require('path'); + const activeDir = path.join(process.cwd(), '.mgw', 'active'); + const files = fs.readdirSync(activeDir); + const file = files.find(f => f.startsWith('${ISSUE_NUMBER}-') && f.endsWith('.json')); + const filePath = path.join(activeDir, file); + const state = JSON.parse(fs.readFileSync(filePath, 'utf-8')); + state.pipeline_stage = 'triaged'; + fs.writeFileSync(filePath, JSON.stringify(state, null, 2)); + " 2>/dev/null || true + RESUME_MODE=false + ;; + Skip) + echo "MGW: Skipping #${ISSUE_NUMBER} per user request." + exit 0 + ;; +esac +``` + +If no checkpoint found (or checkpoint is at triage step only), continue with +normal pipeline stage routing below. 
+ **Initialize checkpoint** when pipeline first transitions past triage: ```bash # Checkpoint initialization — called once when pipeline execution begins. @@ -67,7 +201,6 @@ updateCheckpoint(${ISSUE_NUMBER}, { }); " 2>/dev/null || true ``` - Check pipeline_stage: - "triaged" → proceed to GSD execution - "planning" / "executing" → resume from where we left off diff --git a/commands/workflows/state.md b/commands/workflows/state.md index 652ac43..f5568c8 100644 --- a/commands/workflows/state.md +++ b/commands/workflows/state.md @@ -647,6 +647,91 @@ GSD phase directory (`.planning/phases/{NN}-{slug}/`) to operate in. Issues created outside of `/mgw:project` (e.g., manually filed bugs) will not have a `phase_number`. In this case, `/mgw:run` falls back to the quick pipeline. +## Checkpoint Resume Detection + +When `mgw:run` starts for an issue, the validate_and_load step checks whether a prior +pipeline run left a checkpoint with progress beyond the initial triage step. This enables +resuming interrupted sessions without re-doing completed work. + +### Resume Detection Functions (lib/state.cjs) + +| Function | Signature | Returns | Description | +|----------|-----------|---------|-------------| +| `detectCheckpoint` | `(issueNumber)` | `object\|null` | Checks if active state file has a non-null checkpoint with `pipeline_step` beyond `"triage"`. Returns the checkpoint data if resumable, `null` otherwise. | +| `resumeFromCheckpoint` | `(issueNumber)` | `object\|null` | Returns checkpoint data plus computed `resumeStage`, `resumeAction`, and `completedSteps`. Maps `resume.action` to the pipeline stage to jump to. | +| `clearCheckpoint` | `(issueNumber)` | `{ cleared: boolean }` | Resets the checkpoint field to `null` in the active state file. Used for "Fresh start" option. 
| + +### Resume Action to Stage Mapping + +The `resume.action` field in the checkpoint tells `resumeFromCheckpoint()` which pipeline +stage to jump to: + +| resume.action | resumeStage | Meaning | +|---------------|-------------|---------| +| `run-plan-checker` | `planning` | Plan exists, needs quality check | +| `spawn-executor` | `executing` | Plan complete, execute next | +| `continue-execution` | `executing` | Mid-execution resume | +| `spawn-verifier` | `verifying` | Execution done, verify next | +| `create-pr` | `pr-pending` | Verification done, create PR | +| `begin-execution` | `planning` | Triage done, begin planning | +| `null` / unknown | `planning` | Safe default | + +### Resume Detection Flow + +``` +mgw:run #N starts + | + v +Load state file → migrateProjectState() + | + v +detectCheckpoint(N) + | + +---> null (no checkpoint or triage-only) → proceed with normal stage routing + | + +---> checkpoint found → display state to user + | + v + AskUserQuestion: Resume / Fresh / Skip + | + +---> Resume: load checkpoint context, set RESUME_MODE=true, + | jump to resume.action stage (skip completed steps) + | + +---> Fresh: clearCheckpoint(N), reset pipeline_stage to "triaged", + | continue normal pipeline + | + +---> Skip: exit pipeline for this issue +``` + +### Pipeline Step Order + +The `CHECKPOINT_STEP_ORDER` constant defines the ordered progression of checkpoint steps: + +``` +triage → plan → execute → verify → pr +``` + +Only checkpoints with `pipeline_step` at index > 0 (beyond `"triage"`) are considered +resumable. A checkpoint at `"triage"` means nothing meaningful has been completed yet. + +### Resume Context + +When resuming, the `resume.context` object carries step-specific data needed by the +target stage. 
The context shape varies by `resume.action`: + +| resume.action | Context fields | +|---------------|----------------| +| `spawn-executor` | `{ quick_dir, plan_num }` | +| `run-plan-checker` | `{ quick_dir, plan_num }` | +| `spawn-verifier` | `{ quick_dir, plan_num }` | +| `create-pr` | `{ quick_dir, plan_num }` | +| `continue-execution` | `{ phase_number }` | +| `begin-execution` | `{ gsd_route, branch }` | + +Downstream pipeline stages read `resume.context` to pick up where the prior run left +off. For example, the executor stage uses `quick_dir` and `plan_num` to locate the +existing plan files rather than re-creating them. + ## Consumers | Pattern | Referenced By | @@ -661,5 +746,6 @@ a `phase_number`. In this case, `/mgw:run` falls back to the quick pipeline. | Project state | milestone.md, next.md, ask.md | | Gate result schema | issue.md (populate), run.md (validate) | | Board status sync | board-sync.md (utility), issue.md (triage transitions), run.md (pipeline transitions) | +| Checkpoint resume | run.md (detect + prompt), milestone.md (detect resume point for failed issues) | | Checkpoint writes | triage.md (init), execute.md (plan/execute/verify), pr-create.md (pr) | | Atomic writes | lib/state.cjs (`atomicWriteJson`, `updateCheckpoint`) | diff --git a/lib/state.cjs b/lib/state.cjs index 351434e..6487c6b 100644 --- a/lib/state.cjs +++ b/lib/state.cjs @@ -271,10 +271,14 @@ function resolveActiveMilestoneIndex(state) { const CHECKPOINT_SCHEMA_VERSION = 1; /** - * Create a new checkpoint object with default values. -* Called when pipeline execution begins (triage -> executing transition). + * Pipeline steps ordered by progression. Steps beyond 'triage' indicate + * meaningful work has been completed and a resume is possible. + */ +const CHECKPOINT_STEP_ORDER = ['triage', 'plan', 'execute', 'verify', 'pr']; -* Called when pipeline execution begins (triage → executing transition). +/** + * Create a new checkpoint object with default values. 
/**
 * Detect whether an active issue has a checkpoint with progress beyond triage.
 *
 * The issue state is obtained from loadActiveIssue() and its `checkpoint`
 * field is inspected. The checkpoint counts as resumable only when its
 * `pipeline_step` is listed in CHECKPOINT_STEP_ORDER at an index greater
 * than 0, i.e. after 'triage'. Null is returned when there is no issue state,
 * when the checkpoint is missing or not an object, when the step is 'triage'
 * (or absent), or when the step is not a known step.
 *
 * The returned object is normalised so that consumers can rely on its shape
 * even if the stored checkpoint is partially written or hand-edited:
 * `artifacts` and `step_history` are always arrays, `step_progress` and
 * `resume.context` are always plain objects, and `resume.action` is never
 * undefined.
 *
 * @param {number|string} issueNumber - Issue number to check
 * @returns {{ pipeline_step: string, step_progress: object, artifacts: Array, resume: object, started_at: (string|null), updated_at: (string|null), step_history: Array }|null}
 *   Checkpoint data if resumable, null otherwise
 */
function detectCheckpoint(issueNumber) {
  const issueState = loadActiveIssue(issueNumber);
  if (!issueState) return null;

  const cp = issueState.checkpoint;
  if (!cp || typeof cp !== 'object') return null;

  // indexOf() yields -1 for an unknown step and 0 for 'triage'; neither one
  // represents progress that is worth resuming.
  const step = cp.pipeline_step || 'triage';
  if (CHECKPOINT_STEP_ORDER.indexOf(step) <= 0) return null;

  const isPlainObject = (value) =>
    value !== null && typeof value === 'object' && !Array.isArray(value);
  const asArray = (value) => (Array.isArray(value) ? value : []);
  const asObject = (value) => (isPlainObject(value) ? value : {});

  // Fill in the documented resume defaults without dropping any extra keys
  // the stored resume object may carry.
  const storedResume = asObject(cp.resume);
  const resume = Object.assign({}, storedResume, {
    action: storedResume.action === undefined ? null : storedResume.action,
    context: asObject(storedResume.context),
  });

  return {
    pipeline_step: cp.pipeline_step,
    step_progress: asObject(cp.step_progress),
    artifacts: asArray(cp.artifacts),
    resume,
    started_at: cp.started_at || null,
    updated_at: cp.updated_at || null,
    step_history: asArray(cp.step_history),
  };
}
/**
 * Load checkpoint data needed to resume a pipeline from where it left off.
 *
 * Calls detectCheckpoint() and, when it reports a resumable checkpoint,
 * returns that checkpoint together with the pipeline stage to jump to. The
 * stage is derived from the checkpoint's `resume.action`:
 *
 *   - "run-plan-checker"   -> planning   (plan exists, needs checking)
 *   - "spawn-executor"     -> executing  (plan complete, execute next)
 *   - "continue-execution" -> executing  (mid-execution resume)
 *   - "spawn-verifier"     -> verifying  (execution done, verify next)
 *   - "create-pr"          -> pr-pending (verification done, create PR)
 *   - "begin-execution"    -> planning   (triage done, begin planning)
 *   - null / unknown       -> planning   (safe default)
 *
 * @param {number|string} issueNumber - Issue number to resume
 * @returns {{ checkpoint: object, resumeStage: string, resumeAction: string, completedSteps: string[] }|null}
 *   Resume data if a resumable checkpoint exists, null otherwise
 */
function resumeFromCheckpoint(issueNumber) {
  const cp = detectCheckpoint(issueNumber);
  if (!cp) return null;

  const action = (cp.resume && cp.resume.action) || null;

  const actionToStage = {
    'run-plan-checker': 'planning',
    'spawn-executor': 'executing',
    'continue-execution': 'executing',
    'spawn-verifier': 'verifying',
    'create-pr': 'pr-pending',
    'begin-execution': 'planning',
  };

  // Only own keys count. A bare actionToStage[action] would also match
  // inherited members such as "constructor" or "toString" and hand back a
  // function instead of a stage name.
  const isKnownAction =
    typeof action === 'string' &&
    Object.prototype.hasOwnProperty.call(actionToStage, action);
  const resumeStage = isKnownAction ? actionToStage[action] : 'planning';

  // Derive completed steps from step_history, tolerating a missing or
  // non-array history and null / step-less entries.
  const history = Array.isArray(cp.step_history) ? cp.step_history : [];
  const completedSteps = history
    .map((entry) => entry && entry.step)
    .filter(Boolean);

  return {
    checkpoint: cp,
    resumeStage,
    resumeAction: action || 'unknown',
    completedSteps,
  };
}
/**
 * Clear the checkpoint for an issue, resetting it to null.
 *
 * Used when a user chooses "Fresh start" to discard checkpoint progress and
 * re-run the pipeline from scratch. The issue's state file is located in the
 * directory returned by getActiveDir() as the first entry named
 * `<issueNumber>-*.json`; its `checkpoint` field is set to null and the
 * result is persisted through atomicWriteJson(). All other fields of the
 * state file are left as they were.
 *
 * @param {number|string} issueNumber - Issue number to clear checkpoint for
 * @returns {{ cleared: boolean }} `cleared` is true when a non-null checkpoint
 *   was present before the call, false when there was nothing to clear
 * @throws {Error} If the active directory is missing or unreadable, if no
 *   state file exists for the issue, or if the state file does not contain a
 *   JSON object. When an underlying fs/JSON error triggered the failure it is
 *   attached as `cause`.
 */
function clearCheckpoint(issueNumber) {
  const activeDir = getActiveDir();
  if (!fs.existsSync(activeDir)) {
    throw new Error(`No active directory found. Cannot clear checkpoint for #${issueNumber}.`);
  }

  let entries;
  try {
    entries = fs.readdirSync(activeDir);
  } catch (err) {
    throw new Error(`Cannot read active directory: ${err.message}`, { cause: err });
  }

  // The trailing '-' keeps issue 42 from matching files that belong to 421.
  const prefix = `${issueNumber}-`;
  const match = entries.find((name) => name.startsWith(prefix) && name.endsWith('.json'));
  if (!match) {
    throw new Error(`No state file found for issue #${issueNumber}.`);
  }

  const filePath = path.join(activeDir, match);
  let issueState;
  try {
    issueState = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
  } catch (err) {
    throw new Error(`Cannot parse state file for #${issueNumber}: ${err.message}`, { cause: err });
  }

  // Valid JSON is not necessarily a state object: `null`, arrays and
  // primitives cannot carry a checkpoint field, so refuse to rewrite them.
  if (issueState === null || typeof issueState !== 'object' || Array.isArray(issueState)) {
    throw new Error(`Cannot parse state file for #${issueNumber}: expected a JSON object.`);
  }

  const hadCheckpoint = issueState.checkpoint != null;
  issueState.checkpoint = null;
  atomicWriteJson(filePath, issueState);

  return { cleared: hadCheckpoint };
}
* -* Writes are atomic by default: data is written to a .tmp file first, then + * Writes are atomic by default: data is written to a .tmp file first, then * renamed to the target path. This prevents corruption from interrupts. * * @param {number|string} issueNumber - Issue number to update @@ -673,8 +802,12 @@ module.exports = { migrateProjectState, resolveActiveMilestoneIndex, CHECKPOINT_SCHEMA_VERSION, + CHECKPOINT_STEP_ORDER, initCheckpoint, atomicWriteJson, + detectCheckpoint, + resumeFromCheckpoint, + clearCheckpoint, updateCheckpoint, loadCrossRefs, VALID_LINK_TYPES,