From ea3b90aca2814b8f7e7bbdfef2235804c45f2338 Mon Sep 17 00:00:00 2001 From: Elijah Duffy Date: Wed, 24 Dec 2025 18:47:03 -0800 Subject: [PATCH] node: redesign readiness check system & improve event loop lag check --- node-health/src/checks.ts | 50 +++++++++++++++++---------- node-health/src/readiness.ts | 66 +++++++++++++++++++++++++++++++----- 2 files changed, 89 insertions(+), 27 deletions(-) diff --git a/node-health/src/checks.ts b/node-health/src/checks.ts index 084b853..998c237 100644 --- a/node-health/src/checks.ts +++ b/node-health/src/checks.ts @@ -1,26 +1,40 @@ -import { IntervalHistogram, monitorEventLoopDelay } from 'node:perf_hooks'; -import { ReadinessDetail, ReadinessFunction, ReadinessStatus } from './readiness'; - -let hist: IntervalHistogram | null = null; +import { monitorEventLoopDelay } from 'node:perf_hooks'; +import { ReadinessCheck, ReadinessFunctionReturn, ReadinessStatus } from './readiness'; /** * Builds a readiness check function that monitors event loop lag. - * @param degradedMs - The threshold in milliseconds above which the status is 'degraded' (default 200 ms). + * @param options - Configuration options for the event loop lag check. + * @param options.degradedMs - Threshold in milliseconds for degraded status (default: 200). + * @param options.failMs - Threshold in milliseconds for error status (default: 1000). + * @param options.histResetMs - Interval in milliseconds to reset the histogram (default: 60000). + * @param options.percentile - Percentile to monitor (default: 50). * @returns A ReadinessFunction that checks event loop lag. */ -export const buildEventLoopLagCheck = (degradedMs: number = 200): ReadinessFunction => { - if (!hist) { - hist = monitorEventLoopDelay({ resolution: 10 }); - hist.enable(); - } +export const buildEventLoopLagCheck = (options: { + degradedMs?: number; + failMs?: number; + histResetMs?: number; + percentile?: number; +}): ReadinessCheck => { + const { degradedMs = 200, failMs = 1000, histResetMs = 60000, percentile = 50 } = options; - return (): ReadinessDetail => { - const lag = hist!.mean / 1e6; // Convert from nanoseconds to milliseconds - const status: ReadinessStatus = lag < degradedMs ? 'ok' : 'degraded'; - return { - name: 'event-loop-lag', - status, - message: `Event loop lag is ${lag.toFixed(2)} ms`, - }; + const hist = monitorEventLoopDelay({ resolution: 10 }); + hist.enable(); + + setInterval(() => { + hist.reset(); + }, histResetMs).unref(); + + return { + name: 'event-loop-lag', + fn: async (): Promise => { + const lag = hist.percentile(percentile) / 1e6; // Convert from nanoseconds to milliseconds + const status: ReadinessStatus = lag < degradedMs ? 'ok' : lag < failMs ? 'degraded' : 'error'; + return { + status, + message: `Event loop lag is ${lag.toFixed(2)} ms`, + }; + }, + timeout: 500, }; }; diff --git a/node-health/src/readiness.ts b/node-health/src/readiness.ts index 9f55399..fe222ce 100644 --- a/node-health/src/readiness.ts +++ b/node-health/src/readiness.ts @@ -1,5 +1,13 @@ +/** Return type of a readiness check function */ +export type ReadinessFunctionReturn = { + /** Status of the readiness check */ + status: ReadinessStatus; + /** Optional message providing additional information about the readiness check */ + message?: string; +}; + /** Function that performs a readiness check */ -export type ReadinessFunction = () => Promise | ReadinessDetail; +export type ReadinessFunction = (check: ReadinessCheck) => Promise; /** Status of a readiness check */ export type ReadinessStatus = 'ok' | 'error' | 'degraded'; @@ -10,6 +18,16 @@ const aggregateStatus = (statuses: ReadinessStatus[]): ReadinessStatus => { return 'ok'; }; +/** Represents a readiness check with an optional timeout */ +export type ReadinessCheck = { + /** Name of the readiness check */ + name: string; + /** Function that performs the readiness check */ + fn: ReadinessFunction; + /** Timeout in milliseconds for the readiness check (default: 5000) */ + timeout?: number; +}; + /** Result of a system readiness check */ export type ReadinessResult = { /** @@ -32,7 +50,9 @@ export type ReadinessDetail = { name: string; /** Status of the readiness check */ status: ReadinessStatus; - /** Optional message providing additional information about the readiness check */ + /** Duration of the readiness check in milliseconds */ + duration: number; + /** Message providing additional information about the readiness check */ message?: string; }; @@ -41,20 +61,31 @@ export type ReadinessDetail = { * @param checks - An array of readiness functions to execute. * @returns A Promise that resolves to a ReadinessResult object. */ -export const readiness = async (checks: ReadinessFunction[]): Promise => { +export const readiness = async (checks: ReadinessCheck[]): Promise => { const start = Date.now(); const t0 = performance.now(); const details: ReadinessDetail[] = []; for (const check of checks) { + const checkt0 = performance.now(); try { - const result = await Promise.resolve(check()); - details.push(result); + const result = await withTimeout( + check.fn(check), + check.timeout ?? 5000, + `Readiness check '${check.name}' timed out after ${check.timeout ?? 5000} ms`, + ); + details.push({ + name: check.name, + status: result.status, + message: result.message, + duration: performance.now() - checkt0, + }); } catch (err) { details.push({ - name: 'unknown', + name: check.name, status: 'error', message: err instanceof Error ? err.message : String(err), + duration: performance.now() - checkt0, }); } } @@ -75,7 +106,7 @@ export const readiness = async (checks: ReadinessFunction[]): Promise Promise) => { +export const createReadinessHandler = (checks: ReadinessCheck[]): (() => Promise) => { return async () => { const result = await readiness(checks); return respondWithResult(result); @@ -86,7 +117,7 @@ export const createReadinessHandler = (checks: ReadinessFunction[]): (() => Prom * Class that schedules periodic readiness checks. */ export class ScheduledReadiness { - private checks: ReadinessFunction[]; + private checks: ReadinessCheck[]; private interval: number; private started: boolean = false; private timer: NodeJS.Timeout | null = null; @@ -98,7 +129,7 @@ export class ScheduledReadiness { * @param checks - An array of readiness functions to execute. * @param interval - Interval in milliseconds between readiness checks. */ - constructor(checks: ReadinessFunction[], interval: number) { + constructor(checks: ReadinessCheck[], interval: number) { this.checks = checks; this.interval = interval; } @@ -201,3 +232,20 @@ const httpStatusFromReadiness = (status: ReadinessStatus | 'unknown'): number => if (status === 'error') return 503; return 200; // unknown, treat as ok to avoid false alarms }; + +const withTimeout = async ( + promise: Promise, + ms: number, + timeoutMessage: string, +): Promise => { + let timeoutHandle: NodeJS.Timeout; + const timeoutPromise = new Promise((_, reject) => { + timeoutHandle = setTimeout(() => { + reject(new Error(timeoutMessage)); + }, ms); + }); + + return Promise.race([promise, timeoutPromise]).finally(() => { + clearTimeout(timeoutHandle); + }); +};