Files
Genarrative/scripts/ops/production-health-patrol.mjs
kdletters 9db467d23f 补充 release SpacetimeDB 健康检查与巡检防回退
增加 SpacetimeDB 阶段化健康检查与 /readyz 阶段输出
记录 procedure/reducer/read 失败的阶段和耗时
补充 release 健康巡检 systemd timer 与生产 ops 预检
同步 API 构建部署、provision 脚本和运维文档
2026-06-10 11:35:39 +08:00

478 lines
12 KiB
JavaScript

#!/usr/bin/env node
import {execFile} from 'node:child_process';
import {mkdir, writeFile} from 'node:fs/promises';
import http from 'node:http';
import https from 'node:https';
import os from 'node:os';
import {dirname} from 'node:path';
/**
 * Severity ordering used when folding individual check results into one
 * overall status: the status with the larger rank wins.
 * Frozen so the shared table cannot be mutated by accident.
 */
const STATUS_RANK = Object.freeze({
  OK: 0,
  WARNING: 1,
  CRITICAL: 2,
});

/**
 * Public API paths probed when no `--public-path` option is supplied.
 * Frozen because the array is shared at module level.
 */
const DEFAULT_PUBLIC_PATHS = Object.freeze([
  '/api/creation-entry/config',
  '/api/runtime/puzzle/gallery',
  '/api/runtime/custom-world-gallery',
]);

/**
 * systemd units whose `systemctl is-active` state is checked on every run.
 * Frozen because the array is shared at module level.
 */
const DEFAULT_SERVICES = Object.freeze([
  'genarrative-api.service',
  'spacetimedb.service',
  'nginx.service',
]);
/**
 * Prints the command-line help text to stdout.
 *
 * The documented defaults mirror what parseArgs actually applies; in
 * particular the public base URL falls back to the API base URL environment
 * variable and then to the API's local address, not to port 80.
 */
function usage() {
  console.log(`Usage:
node scripts/ops/production-health-patrol.mjs [options]
Options:
--api-base-url <url> API direct base URL, default http://127.0.0.1:8082
--spacetime-base-url <url> SpacetimeDB base URL, default http://127.0.0.1:3101
--public-base-url <url> Nginx/public base URL, default GENARRATIVE_HEALTH_PATROL_API_BASE_URL or http://127.0.0.1:8082
--public-path <path> Public API path to probe; repeatable
--status-file <path> Write the last patrol result as JSON
--timeout-ms <ms> HTTP/command timeout, default 5000
--slow-ms <ms> Mark successful probes slower than this as WARNING, default 3000
--fail-on-warning Exit 1 when the total status is WARNING
--skip-journal Skip recent journal error scan
--json Print JSON instead of text
-h, --help Show this help and exit
`);
}
/**
 * Reads an environment variable as an on/off switch.
 *
 * @param {string} name - Environment variable to look up.
 * @param {boolean} [fallback=false] - Returned when the variable is unset or
 *   is the empty string.
 * @returns {boolean} true only for `1`, `true`, `yes` or `on` (case and
 *   surrounding whitespace ignored); any other non-empty value is false.
 */
function readBoolEnv(name, fallback = false) {
  const raw = process.env[name];
  if (raw) {
    switch (raw.trim().toLowerCase()) {
      case '1':
      case 'true':
      case 'yes':
      case 'on':
        return true;
      default:
        return false;
    }
  }
  return fallback;
}
/**
 * Parses a base-10 integer that must be strictly positive.
 *
 * @param {unknown} raw - Value to parse; null/undefined are treated as empty.
 * @param {number} fallback - Returned when parsing fails or the result is
 *   zero, negative or not finite.
 * @returns {number} The parsed integer, or `fallback`.
 */
function parsePositiveInt(raw, fallback) {
  const text = raw == null ? '' : String(raw);
  const parsed = Number.parseInt(text, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return fallback;
  }
  return parsed;
}
/**
 * Builds the patrol configuration from environment variables and then lets
 * command-line options override it.
 *
 * Environment variables read (all prefixed GENARRATIVE_HEALTH_PATROL_):
 * API_BASE_URL, SPACETIME_BASE_URL, PUBLIC_BASE_URL, STATUS_FILE, TIMEOUT_MS,
 * SLOW_MS, FAIL_ON_WARNING, SKIP_JOURNAL and WEBHOOK_URL.
 *
 * `-h`/`--help` prints the usage text and exits the process with code 0.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {object} Resolved configuration object.
 * @throws {Error} On an unknown option or an option that is missing its value.
 */
function parseArgs(argv) {
  const config = {
    apiBaseUrl:
      process.env.GENARRATIVE_HEALTH_PATROL_API_BASE_URL ||
      'http://127.0.0.1:8082',
    spacetimeBaseUrl:
      process.env.GENARRATIVE_HEALTH_PATROL_SPACETIME_BASE_URL ||
      'http://127.0.0.1:3101',
    // Without an explicit public URL the public probes go to the API address.
    publicBaseUrl:
      process.env.GENARRATIVE_HEALTH_PATROL_PUBLIC_BASE_URL ||
      process.env.GENARRATIVE_HEALTH_PATROL_API_BASE_URL ||
      'http://127.0.0.1:8082',
    publicPaths: [],
    statusFile: process.env.GENARRATIVE_HEALTH_PATROL_STATUS_FILE || '',
    timeoutMs: parsePositiveInt(
      process.env.GENARRATIVE_HEALTH_PATROL_TIMEOUT_MS,
      5000,
    ),
    slowMs: parsePositiveInt(
      process.env.GENARRATIVE_HEALTH_PATROL_SLOW_MS,
      3000,
    ),
    failOnWarning: readBoolEnv('GENARRATIVE_HEALTH_PATROL_FAIL_ON_WARNING'),
    skipJournal: readBoolEnv('GENARRATIVE_HEALTH_PATROL_SKIP_JOURNAL'),
    json: false,
    webhookUrl: process.env.GENARRATIVE_HEALTH_PATROL_WEBHOOK_URL || '',
  };

  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];
    switch (arg) {
      case '-h':
      case '--help':
        usage();
        process.exit(0);
        break;
      case '--api-base-url':
        config.apiBaseUrl = requireValue(argv, ++index, arg);
        break;
      case '--spacetime-base-url':
        config.spacetimeBaseUrl = requireValue(argv, ++index, arg);
        break;
      case '--public-base-url':
        config.publicBaseUrl = requireValue(argv, ++index, arg);
        break;
      case '--public-path':
        config.publicPaths.push(requireValue(argv, ++index, arg));
        break;
      case '--status-file':
        config.statusFile = requireValue(argv, ++index, arg);
        break;
      case '--timeout-ms':
        // An unparsable value keeps the already-resolved (env or default)
        // timeout instead of silently resetting it to the built-in default.
        config.timeoutMs = parsePositiveInt(
          requireValue(argv, ++index, arg),
          config.timeoutMs,
        );
        break;
      case '--slow-ms':
        // Same fallback rule as --timeout-ms.
        config.slowMs = parsePositiveInt(
          requireValue(argv, ++index, arg),
          config.slowMs,
        );
        break;
      case '--fail-on-warning':
        config.failOnWarning = true;
        break;
      case '--skip-journal':
        config.skipJournal = true;
        break;
      case '--json':
        config.json = true;
        break;
      default:
        throw new Error(`未知参数: ${arg}`);
    }
  }

  if (config.publicPaths.length === 0) {
    // Copy so the returned config never aliases the module-level default list.
    config.publicPaths = [...DEFAULT_PUBLIC_PATHS];
  }
  return config;
}
/**
 * Fetches the value that must follow a command-line flag.
 *
 * @param {string[]} argv - Full argument list.
 * @param {number} index - Position where the value is expected.
 * @param {string} flag - Flag name, used in the error message.
 * @returns {string} The value at `index`.
 * @throws {Error} When the slot is missing, empty, or starts with `--`
 *   (i.e. looks like the next long option).
 */
function requireValue(argv, index, flag) {
  const candidate = argv[index];
  const isUsable = Boolean(candidate) && !candidate.startsWith('--');
  if (isUsable) {
    return candidate;
  }
  throw new Error(`${flag} 缺少参数值`);
}
/**
 * Concatenates a base URL and a path with exactly one slash at the seam.
 *
 * Only a single trailing slash is removed from the base, and a leading slash
 * is added to the path when it has none.
 *
 * @param {string} baseUrl - Base URL, with or without a trailing slash.
 * @param {string} path - Path, with or without a leading slash.
 * @returns {string} The joined URL string.
 */
function joinUrl(baseUrl, path) {
  const trimmedBase = baseUrl.replace(/\/$/, '');
  const rootedPath = path.charAt(0) === '/' ? path : `/${path}`;
  return trimmedBase + rootedPath;
}
/**
 * Determines the most severe status among a list of check results.
 *
 * @param {Array<{status: string}>} checks - Check results to fold.
 * @returns {string} The status with the highest STATUS_RANK, or 'OK' when the
 *   list is empty. Statuses missing from STATUS_RANK never win.
 */
function maxStatus(checks) {
  let worst = 'OK';
  for (const check of checks) {
    if (STATUS_RANK[check.status] > STATUS_RANK[worst]) {
      worst = check.status;
    }
  }
  return worst;
}
/**
 * Assembles one check result record.
 *
 * @param {string} name - Check identifier.
 * @param {string} status - Status label for the check.
 * @param {string} summary - One-line human-readable outcome.
 * @param {object} [details={}] - Extra fields merged on top of the core
 *   fields; a key that collides with a core field replaces it.
 * @returns {object} The merged record.
 */
function checkResult(name, status, summary, details = {}) {
  const core = {name, status, summary};
  return {...core, ...details};
}
/**
 * Executes an external command and always resolves with a result record;
 * failures are reported in the record rather than by rejecting.
 *
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {number} timeoutMs - Timeout handed to execFile.
 * @returns {Promise<object>} Record with `command` (display string), `code`
 *   (numeric exit code, 1 for failures without a numeric code, 0 on success),
 *   `signal`, `stdout`, `stderr`, `timedOut` (whether the child was reported
 *   as killed) and `error` (message or empty string).
 */
function runCommand(command, args, timeoutMs) {
  const execOptions = {
    timeout: timeoutMs,
    windowsHide: true,
    maxBuffer: 256 * 1024,
  };
  return new Promise((resolve) => {
    execFile(command, args, execOptions, (failure, stdout, stderr) => {
      let exitCode = 0;
      if (failure) {
        // Spawn failures carry a string code such as 'ENOENT'; report those as 1.
        exitCode = typeof failure.code === 'number' ? failure.code : 1;
      }
      resolve({
        command: [command, ...args].join(' '),
        code: exitCode,
        signal: failure?.signal || '',
        stdout: String(stdout || ''),
        stderr: String(stderr || ''),
        timedOut: Boolean(failure?.killed),
        error: failure ? failure.message : '',
      });
    });
  });
}
/**
 * Checks a systemd unit with `systemctl is-active`.
 *
 * @param {string} serviceName - Unit name to query.
 * @param {number} timeoutMs - Timeout for the systemctl invocation.
 * @returns {Promise<object>} OK when the command exits 0 and reports exactly
 *   `active`; otherwise CRITICAL with the reported state (or exit code).
 */
async function checkService(serviceName, timeoutMs) {
  const checkName = `service:${serviceName}`;
  const outcome = await runCommand(
    'systemctl',
    ['is-active', serviceName],
    timeoutMs,
  );
  // Prefer stdout, then stderr, then the spawn error as the reported state.
  const reportedState =
    outcome.stdout.trim() || outcome.stderr.trim() || outcome.error;
  const isActive = outcome.code === 0 && reportedState === 'active';

  if (!isActive) {
    const shownState = reportedState || `exit ${outcome.code}`;
    return checkResult(checkName, 'CRITICAL', `服务状态异常: ${shownState}`, {
      command: outcome.command,
      stderr: outcome.stderr.trim(),
    });
  }

  return checkResult(checkName, 'OK', 'active', {
    command: outcome.command,
  });
}
/**
 * Performs a GET request and always resolves exactly once; it never rejects.
 *
 * Failure modes that previously escaped are now reported as `{error}`:
 *   - an unparsable URL or unsupported protocol (thrown synchronously while
 *     building the request);
 *   - a response stream that errors or closes before it ends, which would
 *     otherwise leave the promise pending forever.
 *
 * @param {string} url - Absolute http:// or https:// URL.
 * @param {number} timeoutMs - Passed as the request `timeout` option (per the
 *   Node.js docs this is a socket timeout, not a total deadline).
 * @returns {Promise<{elapsedMs: number, statusCode?: number, body?: string,
 *   error?: string}>} On success `statusCode` and up to roughly 2048
 *   characters of `body`; on failure `error` holds the message.
 */
function requestUrl(url, timeoutMs) {
  return new Promise((resolve) => {
    const startedAt = Date.now();
    let settled = false;

    // Several events can race (end/close/error/timeout); only the first wins.
    const settle = (fields) => {
      if (settled) {
        return;
      }
      settled = true;
      resolve({elapsedMs: Date.now() - startedAt, ...fields});
    };
    const fail = (error) => {
      settle({error: error instanceof Error ? error.message : String(error)});
    };

    let request;
    try {
      const parsed = new URL(url);
      const client = parsed.protocol === 'https:' ? https : http;
      request = client.request(
        parsed,
        {
          method: 'GET',
          timeout: timeoutMs,
          headers: {
            'User-Agent': 'genarrative-health-patrol/1.0',
            Accept: 'application/json,text/plain,*/*',
          },
        },
        (response) => {
          let body = '';
          response.setEncoding('utf8');
          response.on('data', (chunk) => {
            // Keep only the head of the body; it is used for diagnostics.
            if (body.length < 2048) {
              body += chunk;
            }
          });
          response.on('end', () => {
            settle({
              statusCode: response.statusCode || 0,
              body: body.slice(0, 2048),
            });
          });
          response.on('error', fail);
          // 'close' without a prior 'end' means the body was cut short.
          response.on('close', () => {
            fail(new Error('connection closed before response completed'));
          });
        },
      );
    } catch (error) {
      fail(error);
      return;
    }

    request.on('timeout', () => {
      request.destroy(new Error(`timeout after ${timeoutMs}ms`));
    });
    request.on('error', fail);
    request.end();
  });
}
/**
 * Probes one HTTP endpoint and classifies the outcome.
 *
 * @param {string} name - Check identifier used in the result.
 * @param {string} url - URL to request.
 * @param {{timeoutMs: number, slowMs: number}} config - Timeout for the
 *   request and the latency threshold above which a success is a WARNING.
 * @returns {Promise<object>} CRITICAL on a transport error or non-2xx status,
 *   WARNING on a 2xx slower than `slowMs`, otherwise OK. Every result carries
 *   an equivalent curl command for manual reproduction.
 */
async function checkHttp(name, url, config) {
  const response = await requestUrl(url, config.timeoutMs);
  const maxTimeSeconds = Math.ceil(config.timeoutMs / 1000);
  const probe = {
    command: `curl -fsS --max-time ${maxTimeSeconds} ${url}`,
    elapsedMs: response.elapsedMs,
  };

  if (response.error) {
    return checkResult(name, 'CRITICAL', `请求失败: ${response.error}`, probe);
  }

  const {statusCode, elapsedMs} = response;
  const isSuccess = statusCode >= 200 && statusCode < 300;
  if (!isSuccess) {
    return checkResult(name, 'CRITICAL', `HTTP ${statusCode},耗时 ${elapsedMs}ms`, {
      ...probe,
      body: response.body.trim(),
    });
  }

  const isSlow = elapsedMs > config.slowMs;
  return isSlow
    ? checkResult(name, 'WARNING', `HTTP ${statusCode} 但耗时偏高: ${elapsedMs}ms`, probe)
    : checkResult(name, 'OK', `HTTP ${statusCode} ${elapsedMs}ms`, probe);
}
/**
 * Scans the last 15 minutes of the journal for entries at priority
 * err..alert from the patrolled systemd units.
 *
 * The unit list is derived from DEFAULT_SERVICES so the journal scan and the
 * service checks cannot drift apart. journalctl informational marker lines
 * (text wrapped as `-- ... --`, e.g. "-- No entries --" or
 * "-- Journal begins at ... --") are not log entries and are ignored.
 *
 * @param {{timeoutMs: number}} config - Supplies the command timeout.
 * @returns {Promise<object>} WARNING when journalctl fails or when matching
 *   entries exist (at most 20 are requested via `-n`), otherwise OK.
 */
async function checkRecentJournal(config) {
  const unitArgs = DEFAULT_SERVICES.flatMap((unit) => ['-u', unit]);
  const args = [
    ...unitArgs,
    '--since',
    '15 minutes ago',
    '-p',
    'err..alert',
    '--no-pager',
    '-o',
    'short-iso',
    '-n',
    '20',
  ];
  const result = await runCommand('journalctl', args, config.timeoutMs);
  if (result.code !== 0) {
    return checkResult('journal:recent-errors', 'WARNING', '无法读取最近错误日志', {
      command: result.command,
      stderr: result.stderr.trim() || result.error,
    });
  }

  // journalctl wraps its own informational output in "-- ... --".
  const isInformational = (line) => /^-- .* --$/.test(line);
  const lines = result.stdout
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line && !isInformational(line));

  if (lines.length === 0) {
    return checkResult('journal:recent-errors', 'OK', '最近 15 分钟无 err..alert 日志', {
      command: result.command,
    });
  }
  return checkResult(
    'journal:recent-errors',
    'WARNING',
    `最近 15 分钟有 ${lines.length} 条 err..alert 日志`,
    {
      command: result.command,
      lines,
    },
  );
}
/**
 * Persists the patrol result as pretty-printed JSON followed by a newline.
 *
 * @param {string} statusFile - Destination path; an empty value disables
 *   writing entirely.
 * @param {object} payload - Data to serialize.
 * @returns {Promise<void>}
 */
async function writeStatusFile(statusFile, payload) {
  if (statusFile) {
    // Create missing parent directories before writing.
    const parentDir = dirname(statusFile);
    await mkdir(parentDir, {recursive: true});
    const serialized = `${JSON.stringify(payload, null, 2)}\n`;
    await writeFile(statusFile, serialized, 'utf8');
  }
}
/**
 * Posts the patrol payload as JSON to the configured webhook when the overall
 * status is not OK.
 *
 * Delivery is best-effort: every failure is logged to stderr and the returned
 * promise still resolves, so a broken webhook can never stop the patrol from
 * printing its report or setting its exit code. That covers an invalid or
 * unsupported webhook URL, transport errors, timeouts, a response that closes
 * before it ends, and non-2xx replies.
 *
 * @param {{webhookUrl: string, timeoutMs: number}} config - Webhook target
 *   and request timeout.
 * @param {{status: string}} payload - Patrol result to send.
 * @returns {Promise<void>}
 */
async function notifyWebhook(config, payload) {
  if (!config.webhookUrl || payload.status === 'OK') {
    return;
  }

  await new Promise((resolve) => {
    let settled = false;
    // Several events can race (end/close/error/timeout); only the first wins.
    const finish = (error) => {
      if (settled) {
        return;
      }
      settled = true;
      if (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.error(`[health-patrol] webhook notify failed: ${message}`);
      }
      resolve();
    };

    try {
      const body = JSON.stringify(payload);
      const parsed = new URL(config.webhookUrl);
      const client = parsed.protocol === 'https:' ? https : http;
      const request = client.request(
        parsed,
        {
          method: 'POST',
          timeout: config.timeoutMs,
          headers: {
            'Content-Type': 'application/json',
            'Content-Length': Buffer.byteLength(body),
          },
        },
        (response) => {
          // The reply body is irrelevant; drain it so 'end' fires.
          response.resume();
          response.on('end', () => {
            const statusCode = response.statusCode || 0;
            const delivered = statusCode >= 200 && statusCode < 300;
            finish(delivered ? undefined : new Error(`HTTP ${statusCode}`));
          });
          response.on('error', finish);
          // 'close' without a prior 'end' would otherwise hang the await.
          response.on('close', () => {
            finish(new Error('connection closed before response completed'));
          });
        },
      );
      request.on('timeout', () => {
        request.destroy(new Error(`timeout after ${config.timeoutMs}ms`));
      });
      request.on('error', finish);
      request.end(body);
    } catch (error) {
      // Invalid URL, unsupported protocol or unserializable payload.
      finish(error);
    }
  });
}
/**
 * Writes the patrol result to stdout as plain text: one header line, then one
 * line per check followed by its optional detail lines.
 *
 * The reproduction command is shown only for checks that are not OK; stderr,
 * body and journal lines are shown whenever they are present.
 *
 * @param {{status: string, checkedAt: string, checks: object[]}} payload -
 *   Patrol result to render.
 */
function printText(payload) {
  const {status, checkedAt} = payload;
  console.log(`[health-patrol] ${status} ${checkedAt}`);

  for (const check of payload.checks) {
    console.log(`[${check.status}] ${check.name}: ${check.summary}`);

    const showCommand = check.status !== 'OK' && check.command;
    if (showCommand) {
      console.log(` command: ${check.command}`);
    }
    if (check.stderr) {
      console.log(` stderr: ${check.stderr}`);
    }
    if (check.body) {
      console.log(` body: ${check.body}`);
    }

    const journalLines = Array.isArray(check.lines) ? check.lines : [];
    journalLines.forEach((line) => {
      console.log(` ${line}`);
    });
  }
}
/**
 * Runs one full patrol: service states, API and SpacetimeDB endpoints, public
 * paths and (unless skipped) the recent journal scan. The result is then
 * written to the status file, sent to the webhook and printed.
 *
 * Probes run one after another so that each measured latency is not skewed by
 * the other probes.
 *
 * Exit codes: 2 when the overall status is CRITICAL, 1 when it is WARNING and
 * `failOnWarning` is set; otherwise the function returns normally.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const config = parseArgs(process.argv.slice(2));
  const checks = [];

  for (const serviceName of DEFAULT_SERVICES) {
    checks.push(await checkService(serviceName, config.timeoutMs));
  }

  checks.push(await checkHttp('api:/healthz', joinUrl(config.apiBaseUrl, '/healthz'), config));
  checks.push(await checkHttp('api:/readyz', joinUrl(config.apiBaseUrl, '/readyz'), config));
  checks.push(
    await checkHttp(
      'spacetimedb:/v1/ping',
      joinUrl(config.spacetimeBaseUrl, '/v1/ping'),
      config,
    ),
  );

  for (const path of config.publicPaths) {
    checks.push(
      await checkHttp(`public:${path}`, joinUrl(config.publicBaseUrl, path), config),
    );
  }

  if (!config.skipJournal) {
    checks.push(await checkRecentJournal(config));
  }

  const payload = {
    status: maxStatus(checks),
    checkedAt: new Date().toISOString(),
    // HOSTNAME is a shell variable that is usually not present in the
    // environment of systemd-launched processes, so ask the OS as a fallback.
    host: process.env.HOSTNAME || os.hostname(),
    checks,
  };

  await writeStatusFile(config.statusFile, payload);
  await notifyWebhook(config, payload);

  if (config.json) {
    console.log(JSON.stringify(payload, null, 2));
  } else {
    printText(payload);
  }

  if (payload.status === 'CRITICAL') {
    process.exit(2);
  }
  if (payload.status === 'WARNING' && config.failOnWarning) {
    process.exit(1);
  }
}
// Entry point: any error that escapes main() is reported as CRITICAL on
// stderr and the process exits with code 2.
main().catch((error) => {
  const reason = error instanceof Error ? error.message : String(error);
  console.error(`[health-patrol] CRITICAL ${reason}`);
  process.exit(2);
});