node: redesign readiness check system & improve event loop lag check

This commit is contained in:
Elijah Duffy
2025-12-24 18:47:03 -08:00
parent a6d9d72322
commit ea3b90aca2
2 changed files with 89 additions and 27 deletions

View File

@@ -1,26 +1,40 @@
import { IntervalHistogram, monitorEventLoopDelay } from 'node:perf_hooks'; import { monitorEventLoopDelay } from 'node:perf_hooks';
import { ReadinessDetail, ReadinessFunction, ReadinessStatus } from './readiness'; import { ReadinessCheck, ReadinessFunctionReturn, ReadinessStatus } from './readiness';
let hist: IntervalHistogram | null = null;
/**
 * Builds a readiness check that monitors event loop lag.
 *
 * Each call starts its own event-loop-delay histogram (10 ms resolution) and an
 * interval timer that resets that histogram every `histResetMs`, so a reading only
 * reflects the samples collected since the most recent reset.
 *
 * @param options - Configuration options for the event loop lag check. May be omitted
 *   entirely, in which case every default below applies.
 * @param options.degradedMs - Lag in milliseconds at or above which the status is 'degraded' (default: 200).
 * @param options.failMs - Lag in milliseconds at or above which the status is 'error' (default: 1000).
 * @param options.histResetMs - Interval in milliseconds between histogram resets (default: 60000).
 * @param options.percentile - Percentile of the recorded delays to report (default: 50).
 * @returns A ReadinessCheck named 'event-loop-lag' with a 500 ms timeout.
 */
export const buildEventLoopLagCheck = (
  options: {
    degradedMs?: number;
    failMs?: number;
    histResetMs?: number;
    percentile?: number;
  } = {},
): ReadinessCheck => {
  const { degradedMs = 200, failMs = 1000, histResetMs = 60000, percentile = 50 } = options;

  const hist = monitorEventLoopDelay({ resolution: 10 });
  hist.enable();

  // unref() so the periodic reset never keeps the process alive on its own.
  setInterval(() => {
    hist.reset();
  }, histResetMs).unref();

  return {
    name: 'event-loop-lag',
    fn: async (): Promise<ReadinessFunctionReturn> => {
      const lag = hist.percentile(percentile) / 1e6; // Convert from nanoseconds to milliseconds
      const status: ReadinessStatus =
        lag < degradedMs ? 'ok' : lag < failMs ? 'degraded' : 'error';

      return {
        status,
        message: `Event loop lag is ${lag.toFixed(2)} ms`,
      };
    },
    timeout: 500,
  };
};

View File

@@ -1,5 +1,13 @@
/**
 * Return type of a readiness check function.
 *
 * Describes only the outcome of one check. The `readiness` runner copies these
 * fields into the per-check detail it reports, together with the check's name
 * and the measured duration.
 */
export type ReadinessFunctionReturn = {
/** Status of the readiness check */
status: ReadinessStatus;
/** Optional message providing additional information about the readiness check */
message?: string;
};
/** Function that performs a readiness check */ /** Function that performs a readiness check */
export type ReadinessFunction = () => Promise<ReadinessDetail> | ReadinessDetail; export type ReadinessFunction = (check: ReadinessCheck) => Promise<ReadinessFunctionReturn>;
/** Status of a readiness check */ /** Status of a readiness check */
export type ReadinessStatus = 'ok' | 'error' | 'degraded'; export type ReadinessStatus = 'ok' | 'error' | 'degraded';
@@ -10,6 +18,16 @@ const aggregateStatus = (statuses: ReadinessStatus[]): ReadinessStatus => {
return 'ok'; return 'ok';
}; };
/**
 * Represents a readiness check with an optional timeout.
 *
 * The `readiness` runner calls `fn` with this same object as its argument and
 * records the check as 'error' if `fn` rejects or does not settle within the timeout.
 */
export type ReadinessCheck = {
/** Name of the readiness check; used as the `name` of the reported detail */
name: string;
/** Function that performs the readiness check */
fn: ReadinessFunction;
/** Timeout in milliseconds for the readiness check (default: 5000) */
timeout?: number;
};
/** Result of a system readiness check */ /** Result of a system readiness check */
export type ReadinessResult = { export type ReadinessResult = {
/** /**
@@ -32,7 +50,9 @@ export type ReadinessDetail = {
name: string; name: string;
/** Status of the readiness check */ /** Status of the readiness check */
status: ReadinessStatus; status: ReadinessStatus;
/** Optional message providing additional information about the readiness check */ /** Duration of the readiness check in milliseconds */
duration: number;
/** Message providing additional information about the readiness check */
message?: string; message?: string;
}; };
@@ -41,20 +61,31 @@ export type ReadinessDetail = {
* @param checks - An array of readiness functions to execute. * @param checks - An array of readiness functions to execute.
* @returns A Promise that resolves to a ReadinessResult object. * @returns A Promise that resolves to a ReadinessResult object.
*/ */
export const readiness = async (checks: ReadinessFunction[]): Promise<ReadinessResult> => { export const readiness = async (checks: ReadinessCheck[]): Promise<ReadinessResult> => {
const start = Date.now(); const start = Date.now();
const t0 = performance.now(); const t0 = performance.now();
const details: ReadinessDetail[] = []; const details: ReadinessDetail[] = [];
for (const check of checks) { for (const check of checks) {
const checkt0 = performance.now();
try { try {
const result = await Promise.resolve(check()); const result = await withTimeout(
details.push(result); check.fn(check),
check.timeout ?? 5000,
`Readiness check '${check.name}' timed out after ${check.timeout ?? 5000} ms`,
);
details.push({
name: check.name,
status: result.status,
message: result.message,
duration: performance.now() - checkt0,
});
} catch (err) { } catch (err) {
details.push({ details.push({
name: 'unknown', name: check.name,
status: 'error', status: 'error',
message: err instanceof Error ? err.message : String(err), message: err instanceof Error ? err.message : String(err),
duration: performance.now() - checkt0,
}); });
} }
} }
@@ -75,7 +106,7 @@ export const readiness = async (checks: ReadinessFunction[]): Promise<ReadinessR
* @param checks - An array of readiness functions to execute. * @param checks - An array of readiness functions to execute.
* @returns A function that returns a Response object with ReadinessResult in JSON format. * @returns A function that returns a Response object with ReadinessResult in JSON format.
*/ */
export const createReadinessHandler = (checks: ReadinessFunction[]): (() => Promise<Response>) => { export const createReadinessHandler = (checks: ReadinessCheck[]): (() => Promise<Response>) => {
return async () => { return async () => {
const result = await readiness(checks); const result = await readiness(checks);
return respondWithResult(result); return respondWithResult(result);
@@ -86,7 +117,7 @@ export const createReadinessHandler = (checks: ReadinessFunction[]): (() => Prom
* Class that schedules periodic readiness checks. * Class that schedules periodic readiness checks.
*/ */
export class ScheduledReadiness { export class ScheduledReadiness {
private checks: ReadinessFunction[]; private checks: ReadinessCheck[];
private interval: number; private interval: number;
private started: boolean = false; private started: boolean = false;
private timer: NodeJS.Timeout | null = null; private timer: NodeJS.Timeout | null = null;
@@ -98,7 +129,7 @@ export class ScheduledReadiness {
* @param checks - An array of readiness functions to execute. * @param checks - An array of readiness functions to execute.
* @param interval - Interval in milliseconds between readiness checks. * @param interval - Interval in milliseconds between readiness checks.
*/ */
constructor(checks: ReadinessFunction[], interval: number) { constructor(checks: ReadinessCheck[], interval: number) {
this.checks = checks; this.checks = checks;
this.interval = interval; this.interval = interval;
} }
@@ -201,3 +232,20 @@ const httpStatusFromReadiness = (status: ReadinessStatus | 'unknown'): number =>
if (status === 'error') return 503; if (status === 'error') return 503;
return 200; // unknown, treat as ok to avoid false alarms return 200; // unknown, treat as ok to avoid false alarms
}; };
/**
 * Races a promise against a timer.
 *
 * @param promise - The operation to wait for.
 * @param ms - How long to wait, in milliseconds, before giving up.
 * @param timeoutMessage - Message of the Error used to reject when the timer fires first.
 * @returns The settled value of `promise`; rejects with `promise`'s own reason if it
 *   rejects first, or with `new Error(timeoutMessage)` if the timer fires first.
 */
const withTimeout = async <T>(
  promise: Promise<T>,
  ms: number,
  timeoutMessage: string,
): Promise<T> => {
  let timer: NodeJS.Timeout | undefined;

  // Never resolves; it can only reject once the deadline passes.
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => reject(new Error(timeoutMessage)), ms);
  });

  try {
    return await Promise.race([promise, deadline]);
  } finally {
    // Always cancel the timer so a settled race leaves no pending timeout behind.
    clearTimeout(timer);
  }
};