Skip to content

Commit 524c709

Browse files
authored
fix: detect service unavailability and fail fast with clear error (#118)
* fix: detect service unavailability and fail fast with clear error

  When the Claude API returns persistent 500s, the SDK exhausts retries and returns a result with subtype 'success' but is_error: true. Our code only checked subtype, so it treated the error as success and proceeded with validation retries — burning ~9 minutes on 30 hopeless API calls before showing a raw JSON error.

  Now:
  - handleSDKMessage checks is_error on result messages
  - 500/server_error/internal_error classified as SERVICE_UNAVAILABLE
  - abortRetries flag skips validation retries on fatal SDK errors
  - CLI adapter shows "AI service temporarily unavailable" instead of raw JSON
  - Headless adapter emits service_unavailable error code

* chore: formatting

* fix: handle rate limit, network, and process exit errors with clear messages

  Extend error classification to cover additional failure modes:
  - 429/rate limit: "AI service is currently rate-limited"
  - ECONNREFUSED/ETIMEDOUT/ENOTFOUND: "Could not connect to the AI service"
  - Process exit: "AI agent process exited unexpectedly"

  Rate limits also abort validation retries (same as 500s).

* fix: correct service-error regex and separate rate-limit handling

  P1: The adapter regex /service.unavailable/ only matched a single char between "service" and "unavailable", so it missed our own friendly message "The AI service is temporarily unavailable". Fixed to /service.*unavailable/. Also removed the "Agent SDK error:" prefix from all framework integrations so user-friendly messages pass through cleanly.

  P2: 429 rate limits were folded into SERVICE_UNAVAILABLE_PREFIX, which rewrote them to "temporarily unavailable" before adapters could see the rate-limit signal. Now 429s get a separate RATE_LIMITED_PREFIX with distinct messaging ("currently rate-limited"), while still aborting validation retries.
1 parent a81e71a commit 524c709

9 files changed

Lines changed: 211 additions & 9 deletions

File tree

src/integrations/dotnet/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ Begin integration now.`;
144144
if (agentResult.error) {
145145
await analytics.shutdown('error');
146146
const message = agentResult.errorMessage || agentResult.error;
147-
throw new Error(`Agent SDK error: ${message}`);
147+
throw new Error(message);
148148
}
149149

150150
// Post-installation validation

src/integrations/elixir/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ export async function run(options: InstallerOptions): Promise<string> {
124124
if (agentResult.error) {
125125
await analytics.shutdown('error');
126126
const message = agentResult.errorMessage || agentResult.error;
127-
throw new Error(`Agent SDK error: ${message}`);
127+
throw new Error(message);
128128
}
129129

130130
// Build summary

src/integrations/go/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ Begin integration now.`;
213213
if (agentResult.error) {
214214
await analytics.shutdown('error');
215215
const message = agentResult.errorMessage || agentResult.error;
216-
throw new Error(`Agent SDK error: ${message}`);
216+
throw new Error(message);
217217
}
218218

219219
// Post-installation validation (gracefully skips — no rules file for Go)

src/integrations/ruby/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ Begin integration now.`;
143143
if (agentResult.error) {
144144
await analytics.shutdown('error');
145145
const message = agentResult.errorMessage || agentResult.error;
146-
throw new Error(`Agent SDK error: ${message}`);
146+
throw new Error(message);
147147
}
148148

149149
// Build completion summary

src/lib/adapters/cli-adapter.ts

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,28 @@ export class CLIAdapter implements InstallerAdapter {
402402
this.stopSpinner('Error');
403403
this.stopAgentUpdates();
404404

405-
clack.log.error(message);
405+
// Rewrite raw API/SDK errors into user-friendly messages
406+
const isServiceError =
407+
/\b50[0-9]\b/.test(message) || /server_error|internal_error|overloaded|service.*unavailable/i.test(message);
408+
const isRateLimit = /\b429\b/.test(message) || /rate.limit/i.test(message);
409+
const isNetworkError = /ECONNREFUSED|ETIMEDOUT|ENOTFOUND|fetch failed/i.test(message);
410+
const isProcessExit = /process exited with code/i.test(message);
411+
412+
if (isServiceError) {
413+
clack.log.error('The AI service is temporarily unavailable.');
414+
clack.log.info('This is usually resolved within a few minutes. Please try again shortly.');
415+
} else if (isRateLimit) {
416+
clack.log.error('The AI service is currently rate-limited.');
417+
clack.log.info('Please wait a minute and try again.');
418+
} else if (isNetworkError) {
419+
clack.log.error('Could not connect to the AI service.');
420+
clack.log.info('Check your internet connection and try again.');
421+
} else if (isProcessExit) {
422+
clack.log.error('The AI agent process exited unexpectedly.');
423+
clack.log.info('Try running again. If this persists, run with --debug for details.');
424+
} else {
425+
clack.log.error(message);
426+
}
406427

407428
// Add actionable hints for common errors
408429
if (message.includes('authentication') || message.includes('auth')) {

src/lib/adapters/headless-adapter.ts

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,30 @@ export class HeadlessAdapter implements InstallerAdapter {
336336
};
337337

338338
private handleError = ({ message, stack }: InstallerEvents['error']): void => {
339-
writeNDJSON({ type: 'error', code: 'installer_error', message });
339+
const isServiceError =
340+
/\b50[0-9]\b/.test(message) || /server_error|internal_error|overloaded|service.*unavailable/i.test(message);
341+
const isRateLimit = /\b429\b/.test(message) || /rate.limit/i.test(message);
342+
const isNetworkError = /ECONNREFUSED|ETIMEDOUT|ENOTFOUND|fetch failed/i.test(message);
343+
const isProcessExit = /process exited with code/i.test(message);
344+
345+
let code = 'installer_error';
346+
let displayMessage = message;
347+
348+
if (isServiceError) {
349+
code = 'service_unavailable';
350+
displayMessage = 'The AI service is temporarily unavailable. Please try again in a few minutes.';
351+
} else if (isRateLimit) {
352+
code = 'rate_limited';
353+
displayMessage = 'The AI service is currently rate-limited. Please wait a minute and try again.';
354+
} else if (isNetworkError) {
355+
code = 'network_error';
356+
displayMessage = 'Could not connect to the AI service. Check your internet connection and try again.';
357+
} else if (isProcessExit) {
358+
code = 'process_error';
359+
displayMessage = 'The AI agent process exited unexpectedly. Try running again with --debug for details.';
360+
}
361+
362+
writeNDJSON({ type: 'error', code, message: displayMessage });
340363
this.debugLog(stack ?? '');
341364
};
342365
}

src/lib/agent-interface.spec.ts

Lines changed: 101 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,21 @@ vi.mock('../utils/urls.js', () => ({
6767
getLlmGatewayUrlFromHost: vi.fn(() => 'http://localhost:8000'),
6868
}));
6969

70-
import { runAgent } from './agent-interface.js';
70+
import { runAgent, AgentErrorType } from './agent-interface.js';
7171
import { InstallerEventEmitter } from './events.js';
7272
import type { InstallerOptions } from '../utils/types.js';
7373

7474
/**
7575
* Create a mock SDK response that consumes the prompt stream and yields
7676
* responses for each prompt message. This models the real SDK behavior:
7777
* the response generator stays alive as long as prompts keep coming.
78+
*
79+
* Turn options:
80+
* - text: assistant text to yield
81+
* - error: result subtype is 'error' with errors array
82+
* - is_error: result has subtype 'success' but is_error: true (SDK exhausted retries)
7883
*/
79-
function createMockSDKResponse(turns: Array<{ text?: string; error?: boolean }>) {
84+
function createMockSDKResponse(turns: Array<{ text?: string; error?: boolean; is_error?: boolean }>) {
8085
return function mockQueryImpl({ prompt }: { prompt: AsyncIterable<unknown>; options: unknown }) {
8186
let turnIndex = 0;
8287

@@ -102,6 +107,7 @@ function createMockSDKResponse(turns: Array<{ text?: string; error?: boolean }>)
102107
yield {
103108
type: 'result',
104109
subtype: turn.error ? 'error' : 'success',
110+
is_error: turn.is_error ?? false,
105111
result: turn.text ?? '',
106112
...(turn.error ? { errors: ['Test error'] } : {}),
107113
};
@@ -265,3 +271,96 @@ describe('runAgent retry loop', () => {
265271
expect(validateAndFormat).toHaveBeenCalledTimes(1);
266272
});
267273
});
274+
275+
describe('service unavailability handling', () => {
276+
let emitter: InstallerEventEmitter;
277+
let emittedEvents: Array<{ event: string; payload: unknown }>;
278+
279+
beforeEach(() => {
280+
mockQuery.mockReset();
281+
emitter = new InstallerEventEmitter();
282+
emittedEvents = [];
283+
284+
const originalEmit = emitter.emit.bind(emitter);
285+
emitter.emit = ((event: string, payload: unknown) => {
286+
emittedEvents.push({ event, payload });
287+
return originalEmit(event, payload);
288+
}) as typeof emitter.emit;
289+
});
290+
291+
it('detects is_error result with API 500 as SERVICE_UNAVAILABLE', async () => {
292+
const apiErrorText = 'API Error: 500 {"error":{"type":"internal_error","message":"An unexpected error occurred"}}';
293+
mockQuery.mockImplementation(createMockSDKResponse([{ text: apiErrorText, is_error: true }]));
294+
295+
const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter);
296+
297+
expect(result.error).toBe(AgentErrorType.SERVICE_UNAVAILABLE);
298+
expect(result.errorMessage).toMatch(/temporarily unavailable/);
299+
});
300+
301+
it('detects is_error result with server_error as SERVICE_UNAVAILABLE', async () => {
302+
mockQuery.mockImplementation(createMockSDKResponse([{ text: 'server_error: service overloaded', is_error: true }]));
303+
304+
const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter);
305+
306+
expect(result.error).toBe(AgentErrorType.SERVICE_UNAVAILABLE);
307+
});
308+
309+
it('detects is_error result without service pattern as EXECUTION_ERROR', async () => {
310+
mockQuery.mockImplementation(createMockSDKResponse([{ text: 'Some other failure', is_error: true }]));
311+
312+
const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter);
313+
314+
expect(result.error).toBe(AgentErrorType.EXECUTION_ERROR);
315+
expect(result.errorMessage).toBe('Some other failure');
316+
});
317+
318+
it('skips validation retries when service is unavailable', async () => {
319+
const apiErrorText = 'API Error: 500 {"error":{"type":"internal_error","message":"An unexpected error occurred"}}';
320+
mockQuery.mockImplementation(createMockSDKResponse([{ text: apiErrorText, is_error: true }]));
321+
322+
const validateAndFormat = vi.fn().mockResolvedValue('Still broken');
323+
324+
const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, {
325+
maxRetries: 2,
326+
validateAndFormat,
327+
});
328+
329+
expect(result.error).toBe(AgentErrorType.SERVICE_UNAVAILABLE);
330+
// validateAndFormat should never be called because retries are aborted
331+
expect(validateAndFormat).not.toHaveBeenCalled();
332+
333+
// No retry events should be emitted
334+
const retryEvents = emittedEvents.filter((e) => e.event === 'agent:retry');
335+
expect(retryEvents).toHaveLength(0);
336+
});
337+
338+
it('detects 429 rate limit as distinct from service unavailability', async () => {
339+
mockQuery.mockImplementation(
340+
createMockSDKResponse([{ text: 'API Error: 429 rate_limit_exceeded', is_error: true }]),
341+
);
342+
343+
const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter);
344+
345+
expect(result.error).toBe(AgentErrorType.SERVICE_UNAVAILABLE);
346+
expect(result.errorMessage).toMatch(/rate-limited/);
347+
expect(result.errorMessage).not.toMatch(/temporarily unavailable/);
348+
});
349+
350+
it('skips validation retries when rate-limited', async () => {
351+
mockQuery.mockImplementation(
352+
createMockSDKResponse([{ text: 'API Error: 429 rate_limit_exceeded', is_error: true }]),
353+
);
354+
355+
const validateAndFormat = vi.fn().mockResolvedValue('Still broken');
356+
357+
const result = await runAgent(makeAgentConfig(), 'Test prompt', makeOptions(), undefined, emitter, {
358+
maxRetries: 2,
359+
validateAndFormat,
360+
});
361+
362+
expect(result.error).toBe(AgentErrorType.SERVICE_UNAVAILABLE);
363+
expect(result.errorMessage).toMatch(/rate-limited/);
364+
expect(validateAndFormat).not.toHaveBeenCalled();
365+
});
366+
});

src/lib/agent-interface.ts

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@ export const AgentSignals = {
5656

5757
export type AgentSignal = (typeof AgentSignals)[keyof typeof AgentSignals];
5858

59+
/** Internal prefix used to tag service-unavailability errors from handleSDKMessage */
60+
const SERVICE_UNAVAILABLE_PREFIX = '__SERVICE_UNAVAILABLE__';
61+
62+
/** Internal prefix used to tag rate-limit errors from handleSDKMessage */
63+
const RATE_LIMITED_PREFIX = '__RATE_LIMITED__';
64+
5965
/**
6066
* Error types that can be returned from agent execution.
6167
* These correspond to the error signals that the agent emits.
@@ -67,6 +73,8 @@ export enum AgentErrorType {
6773
RESOURCE_MISSING = 'INSTALLER_RESOURCE_MISSING',
6874
/** Agent execution failed (API error, auth error, etc.) */
6975
EXECUTION_ERROR = 'INSTALLER_EXECUTION_ERROR',
76+
/** AI service is unavailable or rate-limited (API 50x errors, overload/outage, or 429 rate limiting) */
77+
SERVICE_UNAVAILABLE = 'INSTALLER_SERVICE_UNAVAILABLE',
7078
}
7179

7280
export type AgentConfig = {
@@ -536,6 +544,11 @@ export async function runAgent(
536544
let resolveCurrentTurn!: () => void;
537545
let currentTurnDone!: Promise<void>;
538546

547+
// Set by the message loop when a fatal SDK error is detected (e.g. service
548+
// unavailability). The prompt stream checks this before yielding retry
549+
// prompts so we fail fast instead of burning minutes on hopeless retries.
550+
let abortRetries = false;
551+
539552
function resetTurnSignal() {
540553
currentTurnDone = new Promise<void>((resolve) => {
541554
resolveCurrentTurn = resolve;
@@ -555,6 +568,12 @@ export async function runAgent(
555568
while (retryCount < maxRetries) {
556569
await currentTurnDone;
557570

571+
// Don't send correction prompts once the message loop has recorded an SDK error (service outage, rate limit, or any other result error)
572+
if (abortRetries) {
573+
logInfo('Skipping validation retries due to service error');
574+
break;
575+
}
576+
558577
emitter?.emit('validation:retry:start', { attempt: retryCount + 1 });
559578

560579
let validationPrompt: string | null;
@@ -628,6 +647,8 @@ export async function runAgent(
628647
const messageError = handleSDKMessage(message, options, collectedText, emitter);
629648
if (messageError) {
630649
sdkError = messageError;
650+
// Signal the prompt stream to stop yielding retry prompts
651+
abortRetries = true;
631652
}
632653
if (message.type === 'result') {
633654
resolveCurrentTurn();
@@ -645,6 +666,22 @@ export async function runAgent(
645666
// Check for SDK errors first (e.g., API errors, auth failures)
646667
// Return error type + message - caller decides whether to throw or emit events
647668
if (sdkError) {
669+
if (sdkError.startsWith(SERVICE_UNAVAILABLE_PREFIX)) {
670+
const detail = sdkError.slice(SERVICE_UNAVAILABLE_PREFIX.length);
671+
logError('AI service unavailable:', detail);
672+
return {
673+
error: AgentErrorType.SERVICE_UNAVAILABLE,
674+
errorMessage: 'The AI service is temporarily unavailable. Please try again in a few minutes.',
675+
};
676+
}
677+
if (sdkError.startsWith(RATE_LIMITED_PREFIX)) {
678+
const detail = sdkError.slice(RATE_LIMITED_PREFIX.length);
679+
logError('AI service rate-limited:', detail);
680+
return {
681+
error: AgentErrorType.SERVICE_UNAVAILABLE,
682+
errorMessage: 'The AI service is currently rate-limited. Please wait a minute and try again.',
683+
};
684+
}
648685
logError('Agent SDK error:', sdkError);
649686
return { error: AgentErrorType.EXECUTION_ERROR, errorMessage: sdkError };
650687
}
@@ -837,6 +874,26 @@ function handleSDKMessage(
837874
}
838875

839876
case 'result': {
877+
// The SDK may return subtype 'success' with is_error: true when API
878+
// retries are exhausted (e.g., persistent 500s). Check is_error first.
879+
const isResultError = (message as Record<string, unknown>).is_error === true;
880+
881+
if (isResultError) {
882+
const resultText = typeof message.result === 'string' ? message.result : '';
883+
logError('Agent result marked as error:', resultText);
884+
885+
// Detect rate limiting (429) — check before 5xx so it gets distinct messaging
886+
if (/\b429\b/.test(resultText) || /rate.limit/i.test(resultText)) {
887+
return `${RATE_LIMITED_PREFIX}${resultText}`;
888+
}
889+
890+
// Detect service unavailability (API 500, upstream outage)
891+
if (/\b50[0-9]\b/.test(resultText) || /server_error|internal_error|overloaded/.test(resultText)) {
892+
return `${SERVICE_UNAVAILABLE_PREFIX}${resultText}`;
893+
}
894+
return resultText || 'Agent execution failed';
895+
}
896+
840897
if (message.subtype === 'success') {
841898
logInfo('Agent completed successfully');
842899
if (typeof message.result === 'string') {

src/lib/agent-runner.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,9 @@ export async function runAgentInstaller(config: FrameworkConfig, options: Instal
139139
if (agentResult.error) {
140140
await analytics.shutdown('error');
141141
const message = agentResult.errorMessage || agentResult.error;
142-
throw new Error(`Agent SDK error: ${message}`);
142+
// Pass user-friendly messages through without wrapping them in
143+
// "Agent SDK error:" — that prefix obscures the actionable text.
144+
throw new Error(message);
143145
}
144146

145147
// Track retry metrics

0 commit comments

Comments
 (0)