fix: detect service unavailability and fail fast with clear error (#118)

nicknisi · web-flow · commit 524c709ee864 · 2026-04-03T10:38:49.000-05:00
* fix: detect service unavailability and fail fast with clear error

When the Claude API returns persistent 500s, the SDK exhausts retries
and returns a result with subtype 'success' but is_error: true. Our
code only checked subtype, so it treated the error as success and
proceeded with validation retries — burning ~9 minutes on 30 hopeless
API calls before showing a raw JSON error.

Now:
- handleSDKMessage checks is_error on result messages
- API 500–509 status codes and server_error/internal_error/overloaded responses are classified as SERVICE_UNAVAILABLE
- abortRetries flag skips validation retries on fatal SDK errors
- CLI adapter shows "AI service temporarily unavailable" instead of raw JSON
- Headless adapter emits service_unavailable error code

* chore: formatting

* fix: handle rate limit, network, and process exit errors with clear messages

Extend error classification to cover additional failure modes:
- 429/rate limit: "AI service is currently rate-limited"
- ECONNREFUSED/ETIMEDOUT/ENOTFOUND: "Could not connect to the AI service"
- Process exit: "AI agent process exited unexpectedly"

Rate limits also abort validation retries (same as 500s).

* fix: correct service-error regex and separate rate-limit handling

P1: The adapter regex /service.unavailable/ only matched a single char
between "service" and "unavailable", so it missed our own friendly
message "The AI service is temporarily unavailable". Fixed to
/service.*unavailable/. Also removed the "Agent SDK error:" prefix
from all framework integrations so user-friendly messages pass through
cleanly.

P2: 429 rate limits were folded into SERVICE_UNAVAILABLE_PREFIX, which
rewrote them to "temporarily unavailable" before adapters could see the
rate-limit signal. Now 429s get a separate RATE_LIMITED_PREFIX with
distinct messaging ("currently rate-limited"), while still aborting
validation retries.
diff --git a/src/integrations/dotnet/index.ts b/src/integrations/dotnet/index.ts
@@ -144,7 +144,7 @@ Begin integration now.`;
   if (agentResult.error) {
     await analytics.shutdown('error');
     const message = agentResult.errorMessage || agentResult.error;
-    throw new Error(`Agent SDK error: ${message}`);
+    throw new Error(message);
   }
 
   // Post-installation validation
diff --git a/src/integrations/elixir/index.ts b/src/integrations/elixir/index.ts
@@ -124,7 +124,7 @@ export async function run(options: InstallerOptions): Promise<string> {
   if (agentResult.error) {
     await analytics.shutdown('error');
     const message = agentResult.errorMessage || agentResult.error;
-    throw new Error(`Agent SDK error: ${message}`);
+    throw new Error(message);
   }
 
   // Build summary
diff --git a/src/integrations/go/index.ts b/src/integrations/go/index.ts
@@ -213,7 +213,7 @@ Begin integration now.`;
   if (agentResult.error) {
     await analytics.shutdown('error');
     const message = agentResult.errorMessage || agentResult.error;
-    throw new Error(`Agent SDK error: ${message}`);
+    throw new Error(message);
   }
 
   // Post-installation validation (gracefully skips — no rules file for Go)
diff --git a/src/integrations/ruby/index.ts b/src/integrations/ruby/index.ts
@@ -143,7 +143,7 @@ Begin integration now.`;
   if (agentResult.error) {
     await analytics.shutdown('error');
     const message = agentResult.errorMessage || agentResult.error;
-    throw new Error(`Agent SDK error: ${message}`);
+    throw new Error(message);
   }
 
   // Build completion summary
diff --git a/src/lib/adapters/cli-adapter.ts b/src/lib/adapters/cli-adapter.ts
@@ -402,7 +402,28 @@ export class CLIAdapter implements InstallerAdapter {
     this.stopSpinner('Error');
     this.stopAgentUpdates();
 
-    clack.log.error(message);
+    // Rewrite raw API/SDK errors into user-friendly messages
+    const isServiceError =
+      /\b50[0-9]\b/.test(message) || /server_error|internal_error|overloaded|service.*unavailable/i.test(message);
+    const isRateLimit = /\b429\b/.test(message) || /rate.limit/i.test(message);
+    const isNetworkError = /ECONNREFUSED|ETIMEDOUT|ENOTFOUND|fetch failed/i.test(message);
+    const isProcessExit = /process exited with code/i.test(message);
+
+    if (isServiceError) {
+      clack.log.error('The AI service is temporarily unavailable.');
+      clack.log.info('This is usually resolved within a few minutes. Please try again shortly.');
+    } else if (isRateLimit) {
+      clack.log.error('The AI service is currently rate-limited.');
+      clack.log.info('Please wait a minute and try again.');
+    } else if (isNetworkError) {
+      clack.log.error('Could not connect to the AI service.');
+      clack.log.info('Check your internet connection and try again.');
+    } else if (isProcessExit) {
+      clack.log.error('The AI agent process exited unexpectedly.');
+      clack.log.info('Try running again. If this persists, run with --debug for details.');
+    } else {
+      clack.log.error(message);
+    }
 
     // Add actionable hints for common errors
     if (message.includes('authentication') || message.includes('auth')) {
diff --git a/src/lib/adapters/headless-adapter.ts b/src/lib/adapters/headless-adapter.ts
@@ -336,7 +336,30 @@ export class HeadlessAdapter implements InstallerAdapter {
   };
 
   private handleError = ({ message, stack }: InstallerEvents['error']): void => {
-    writeNDJSON({ type: 'error', code: 'installer_error', message });
+    const isServiceError =
+      /\b50[0-9]\b/.test(message) || /server_error|internal_error|overloaded|service.*unavailable/i.test(message);
+    const isRateLimit = /\b429\b/.test(message) || /rate.limit/i.test(message);
+    const isNetworkError = /ECONNREFUSED|ETIMEDOUT|ENOTFOUND|fetch failed/i.test(message);
+    const isProcessExit = /process exited with code/i.test(message);
+
+    let code = 'installer_error';
+    let displayMessage = message;
+
+    if (isServiceError) {
+      code = 'service_unavailable';
+      displayMessage = 'The AI service is temporarily unavailable. Please try again in a few minutes.';
+    } else if (isRateLimit) {
+      code = 'rate_limited';
+      displayMessage = 'The AI service is currently rate-limited. Please wait a minute and try again.';
+    } else if (isNetworkError) {
+      code = 'network_error';
+      displayMessage = 'Could not connect to the AI service. Check your internet connection and try again.';
+    } else if (isProcessExit) {
+      code = 'process_error';
+      displayMessage = 'The AI agent process exited unexpectedly. Try running again with --debug for details.';
+    }
+
+    writeNDJSON({ type: 'error', code, message: displayMessage });
     this.debugLog(stack ?? '');
   };
 }
diff --git a/src/lib/agent-interface.spec.ts b/src/lib/agent-interface.spec.ts
@@ -67,16 +67,21 @@ vi.mock('../utils/urls.js', () => ({
   getLlmGatewayUrlFromHost: vi.fn(() => 'http://localhost:8000'),
 }));
 
-import { runAgent } from './agent-interface.js';
+import { runAgent, AgentErrorType } from './agent-interface.js';
 import { InstallerEventEmitter } from './events.js';
 import type { InstallerOptions } from '../utils/types.js';
 
 /**
  * Create a mock SDK response that consumes the prompt stream and yields
  * responses for each prompt message. This models the real SDK behavior:
  * the response generator stays alive as long as prompts keep coming.
+ *
+ * Turn options:
+ * - text: assistant text to yield
+ * - error: result subtype is 'error' with errors array
+ * - is_error: result has subtype 'success' but is_error: true (SDK exhausted retries)
  */
-function createMockSDKResponse(turns: Array<{ text?: string; error?: boolean }>) {
+function createMockSDKResponse(turns: Array<{ text?: string; error?: boolean; is_error?: boolean }>) {
   return function mockQueryImpl({ prompt }: { prompt: AsyncIterable<unknown>; options: unknown }) {
     let turnIndex = 0;
 
@@ -102,6 +107,7 @@ function createMockSDKResponse(turns: Array<{ text?: string; error?: boolean }>)
         yield {
           type: 'result',
           subtype: turn.error ? 'error' : 'success',
+          is_error: turn.is_error ?? false,
           result: turn.text ?? '',
           ...(turn.error ? { errors: ['Test error'] } : {}),
         };
@@ -265,3 +271,96 @@ describe('runAgent retry loop', () => {
     expect(validateAndFormat).toHaveBeenCalledTimes(1);
   });
 });
+
+describe('service unavailability handling', () => {
+  let emitter: InstallerEventEmitter;
+  let emittedEvents: Array<{ event: string; payload: unknown }>;
+
+  beforeEach(() => {
+    mockQuery.mockReset();
+    emitter = new InstallerEventEmitter();
+    emittedEvents = [];
+
+    const originalEmit = emitter.emit.bind(emitter);
+    emitter.emit = ((event: string, payload: unknown) => {
+      emittedEvents.push({ event, payload });
+      return originalEmit(event, payload);
+    }) as typeof emitter.emit;
+  });
+
+  it('detects is_error result with API 500 as SERVICE_UNAVAILABLE', async () => {
+    const apiErrorText = 'API Error: 500 {"error":{"type":"internal_error","message":"An unexpected error occurred"}}';
+    mockQuery.mockImplementation(createMockSDKResponse([{ text: apiErrorText, is_error: true }]));
+
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter);
+
+    expect(result.error).toBe(AgentErrorType.SERVICE_UNAVAILABLE);
+    expect(result.errorMessage).toMatch(/temporarily unavailable/);
+  });
+
+  it('detects is_error result with server_error as SERVICE_UNAVAILABLE', async () => {
+    mockQuery.mockImplementation(createMockSDKResponse([{ text: 'server_error: service overloaded', is_error: true }]));
+
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter);
+
+    expect(result.error).toBe(AgentErrorType.SERVICE_UNAVAILABLE);
+  });
+
+  it('detects is_error result without service pattern as EXECUTION_ERROR', async () => {
+    mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Some other failure', is_error: true }]));
+
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter);
+
+    expect(result.error).toBe(AgentErrorType.EXECUTION_ERROR);
+    expect(result.errorMessage).toBe('Some other failure');
+  });
+
+  it('skips validation retries when service is unavailable', async () => {
+    const apiErrorText = 'API Error: 500 {"error":{"type":"internal_error","message":"An unexpected error occurred"}}';
+    mockQuery.mockImplementation(createMockSDKResponse([{ text: apiErrorText, is_error: true }]));
+
+    const validateAndFormat = vi.fn().mockResolvedValue('Still broken');
+
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, {
+      maxRetries: 2,
+      validateAndFormat,
+    });
+
+    expect(result.error).toBe(AgentErrorType.SERVICE_UNAVAILABLE);
+    // validateAndFormat should never be called because retries are aborted
+    expect(validateAndFormat).not.toHaveBeenCalled();
+
+    // No retry events should be emitted
+    const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry');
+    expect(retryEvents).toHaveLength(0);
+  });
+
+  it('detects 429 rate limit as distinct from service unavailability', async () => {
+    mockQuery.mockImplementation(
+      createMockSDKResponse([{ text: 'API Error: 429 rate_limit_exceeded', is_error: true }]),
+    );
+
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter);
+
+    expect(result.error).toBe(AgentErrorType.SERVICE_UNAVAILABLE);
+    expect(result.errorMessage).toMatch(/rate-limited/);
+    expect(result.errorMessage).not.toMatch(/temporarily unavailable/);
+  });
+
+  it('skips validation retries when rate-limited', async () => {
+    mockQuery.mockImplementation(
+      createMockSDKResponse([{ text: 'API Error: 429 rate_limit_exceeded', is_error: true }]),
+    );
+
+    const validateAndFormat = vi.fn().mockResolvedValue('Still broken');
+
+    const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, {
+      maxRetries: 2,
+      validateAndFormat,
+    });
+
+    expect(result.error).toBe(AgentErrorType.SERVICE_UNAVAILABLE);
+    expect(result.errorMessage).toMatch(/rate-limited/);
+    expect(validateAndFormat).not.toHaveBeenCalled();
+  });
+});
diff --git a/src/lib/agent-interface.ts b/src/lib/agent-interface.ts
@@ -56,6 +56,12 @@ export const AgentSignals = {
 
 export type AgentSignal = (typeof AgentSignals)[keyof typeof AgentSignals];
 
+/** Internal prefix used to tag service-unavailability errors from handleSDKMessage */
+const SERVICE_UNAVAILABLE_PREFIX = '__SERVICE_UNAVAILABLE__';
+
+/** Internal prefix used to tag rate-limit errors from handleSDKMessage */
+const RATE_LIMITED_PREFIX = '__RATE_LIMITED__';
+
 /**
  * Error types that can be returned from agent execution.
  * These correspond to the error signals that the agent emits.
@@ -67,6 +73,8 @@ export enum AgentErrorType {
   RESOURCE_MISSING = 'INSTALLER_RESOURCE_MISSING',
   /** Agent execution failed (API error, auth error, etc.) */
   EXECUTION_ERROR = 'INSTALLER_EXECUTION_ERROR',
+  /** AI service is unavailable or rate-limited (API 50x error, 429 rate limit, outage, etc.) */
+  SERVICE_UNAVAILABLE = 'INSTALLER_SERVICE_UNAVAILABLE',
 }
 
 export type AgentConfig = {
@@ -536,6 +544,11 @@ export async function runAgent(
     let resolveCurrentTurn!: () => void;
     let currentTurnDone!: Promise<void>;
 
+    // Set by the message loop whenever handleSDKMessage reports an SDK error
+    // (service outage, rate limit, or any other failed result).  The prompt stream checks
+    // this before yielding retry prompts so we fail fast instead of burning minutes on hopeless retries.
+    let abortRetries = false;
+
     function resetTurnSignal() {
       currentTurnDone = new Promise<void>((resolve) => {
         resolveCurrentTurn = resolve;
@@ -555,6 +568,12 @@ export async function runAgent(
         while (retryCount < maxRetries) {
           await currentTurnDone;
 
+          // Don't send correction prompts after an SDK error (e.g. the service is down or rate-limited)
+          if (abortRetries) {
+            logInfo('Skipping validation retries due to service error');
+            break;
+          }
+
           emitter?.emit('validation:retry:start', { attempt: retryCount + 1 });
 
           let validationPrompt: string | null;
@@ -628,6 +647,8 @@ export async function runAgent(
       const messageError = handleSDKMessage(message, options, collectedText, emitter);
       if (messageError) {
         sdkError = messageError;
+        // Signal the prompt stream to stop yielding retry prompts
+        abortRetries = true;
       }
       if (message.type === 'result') {
         resolveCurrentTurn();
@@ -645,6 +666,22 @@ export async function runAgent(
     // Check for SDK errors first (e.g., API errors, auth failures)
     // Return error type + message - caller decides whether to throw or emit events
     if (sdkError) {
+      if (sdkError.startsWith(SERVICE_UNAVAILABLE_PREFIX)) {
+        const detail = sdkError.slice(SERVICE_UNAVAILABLE_PREFIX.length);
+        logError('AI service unavailable:', detail);
+        return {
+          error: AgentErrorType.SERVICE_UNAVAILABLE,
+          errorMessage: 'The AI service is temporarily unavailable. Please try again in a few minutes.',
+        };
+      }
+      if (sdkError.startsWith(RATE_LIMITED_PREFIX)) {
+        const detail = sdkError.slice(RATE_LIMITED_PREFIX.length);
+        logError('AI service rate-limited:', detail);
+        return {
+          error: AgentErrorType.SERVICE_UNAVAILABLE,
+          errorMessage: 'The AI service is currently rate-limited. Please wait a minute and try again.',
+        };
+      }
       logError('Agent SDK error:', sdkError);
       return { error: AgentErrorType.EXECUTION_ERROR, errorMessage: sdkError };
     }
@@ -837,6 +874,26 @@ function handleSDKMessage(
     }
 
     case 'result': {
+      // The SDK may return subtype 'success' with is_error: true when API
+      // retries are exhausted (e.g., persistent 500s). Check is_error first.
+      const isResultError = (message as Record<string, unknown>).is_error === true;
+
+      if (isResultError) {
+        const resultText = typeof message.result === 'string' ? message.result : '';
+        logError('Agent result marked as error:', resultText);
+
+        // Detect rate limiting (429) — check before 5xx so it gets distinct messaging
+        if (/\b429\b/.test(resultText) || /rate.limit/i.test(resultText)) {
+          return `${RATE_LIMITED_PREFIX}${resultText}`;
+        }
+
+        // Detect service unavailability (API 500, upstream outage)
+        if (/\b50[0-9]\b/.test(resultText) || /server_error|internal_error|overloaded/.test(resultText)) {
+          return `${SERVICE_UNAVAILABLE_PREFIX}${resultText}`;
+        }
+        return resultText || 'Agent execution failed';
+      }
+
       if (message.subtype === 'success') {
         logInfo('Agent completed successfully');
         if (typeof message.result === 'string') {
diff --git a/src/lib/agent-runner.ts b/src/lib/agent-runner.ts
@@ -139,7 +139,9 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal
   if (agentResult.error) {
     await analytics.shutdown('error');
     const message = agentResult.errorMessage || agentResult.error;
-    throw new Error(`Agent SDK error: ${message}`);
+    // Pass user-friendly messages through without wrapping them in
+    // "Agent SDK error:" — that prefix obscures the actionable text.
+    throw new Error(message);
   }
 
   // Track retry metrics

Original file line number	Diff line number	Diff line change
@@ -144,7 +144,7 @@ Begin integration now.`;
`144`	`144`	`if (agentResult.error) {`
`145`	`145`	`await analytics.shutdown('error');`
`146`	`146`	`const message = agentResult.errorMessage \|\| agentResult.error;`
`147`		- throw new Error(`Agent SDK error: ${message}`);
	`147`	`+ throw new Error(message);`
`148`	`148`	`}`
`149`	`149`
`150`	`150`	`// Post-installation validation`
Original file line number	Diff line number	Diff line change
`@@ -124,7 +124,7 @@ export async function run(options: InstallerOptions): Promise<string> {`
`124`	`124`	`if (agentResult.error) {`
`125`	`125`	`await analytics.shutdown('error');`
`126`	`126`	`const message = agentResult.errorMessage \|\| agentResult.error;`
`127`		- throw new Error(`Agent SDK error: ${message}`);
	`127`	`+ throw new Error(message);`
`128`	`128`	`}`
`129`	`129`
`130`	`130`	`// Build summary`
Original file line number	Diff line number	Diff line change
@@ -213,7 +213,7 @@ Begin integration now.`;
`213`	`213`	`if (agentResult.error) {`
`214`	`214`	`await analytics.shutdown('error');`
`215`	`215`	`const message = agentResult.errorMessage \|\| agentResult.error;`
`216`		- throw new Error(`Agent SDK error: ${message}`);
	`216`	`+ throw new Error(message);`
`217`	`217`	`}`
`218`	`218`
`219`	`219`	`// Post-installation validation (gracefully skips — no rules file for Go)`
Original file line number	Diff line number	Diff line change
@@ -143,7 +143,7 @@ Begin integration now.`;
`143`	`143`	`if (agentResult.error) {`
`144`	`144`	`await analytics.shutdown('error');`
`145`	`145`	`const message = agentResult.errorMessage \|\| agentResult.error;`
`146`		- throw new Error(`Agent SDK error: ${message}`);
	`146`	`+ throw new Error(message);`
`147`	`147`	`}`
`148`	`148`
`149`	`149`	`// Build completion summary`