补充 release SpacetimeDB 健康检查与巡检防回退
增加 SpacetimeDB 阶段化健康检查与 /readyz 阶段输出 记录 procedure/reducer/read 失败的阶段和耗时 补充 release 健康巡检 systemd timer 与生产 ops 预检 同步 API 构建部署、provision 脚本和运维文档
This commit is contained in:
@@ -445,7 +445,7 @@ if [[ "${BUILD_SPACETIME}" -eq 1 ]]; then
|
||||
write_migration_bootstrap_secret_file
|
||||
fi
|
||||
|
||||
mkdir -p "${TARGET_DIR}/scripts" "${TARGET_DIR}/deploy"
|
||||
mkdir -p "${TARGET_DIR}/scripts" "${TARGET_DIR}/scripts/ops" "${TARGET_DIR}/deploy"
|
||||
cp "${SCRIPT_DIR}/deploy/maintenance-on.sh" "${TARGET_DIR}/scripts/maintenance-on.sh"
|
||||
cp "${SCRIPT_DIR}/deploy/maintenance-off.sh" "${TARGET_DIR}/scripts/maintenance-off.sh"
|
||||
cp "${SCRIPT_DIR}/deploy/maintenance-status.sh" "${TARGET_DIR}/scripts/maintenance-status.sh"
|
||||
@@ -466,6 +466,7 @@ copy_required_file "${SCRIPT_DIR}/spacetime-migration-common.mjs" "${TARGET_DIR}
|
||||
copy_required_file "${SCRIPT_DIR}/spacetime-authorize-migration-operator.mjs" "${TARGET_DIR}/scripts/spacetime-authorize-migration-operator.mjs" "数据库迁移授权脚本"
|
||||
copy_required_file "${SCRIPT_DIR}/spacetime-revoke-migration-operator.mjs" "${TARGET_DIR}/scripts/spacetime-revoke-migration-operator.mjs" "数据库迁移撤权脚本"
|
||||
copy_required_file "${SCRIPT_DIR}/database-backup-to-oss.mjs" "${TARGET_DIR}/scripts/database-backup-to-oss.mjs" "数据库 OSS 备份脚本"
|
||||
copy_required_file "${SCRIPT_DIR}/ops/production-health-patrol.mjs" "${TARGET_DIR}/scripts/ops/production-health-patrol.mjs" "生产健康巡检脚本"
|
||||
|
||||
copy_required_dir "${REPO_ROOT}/deploy/systemd" "${TARGET_DIR}/deploy/systemd" "systemd 配置"
|
||||
copy_required_dir "${REPO_ROOT}/deploy/nginx" "${TARGET_DIR}/deploy/nginx" "Nginx 配置"
|
||||
@@ -485,7 +486,7 @@ cat >"${TARGET_DIR}/README.md" <<EOF
|
||||
- \`migration-bootstrap-secret.txt\`:构建 \`spacetime_module.wasm\` 时注入的迁移引导密钥,仅用于创建首个迁移操作员;请作为敏感文件保存到 Jenkins Secret Text,授权完成后不要长期留在公开归档中。
|
||||
- \`*.sha256\`:发布产物 checksum,用于部署前校验。
|
||||
- \`release-manifest.json\`:发布版本、源码 commit 与产物清单。
|
||||
- \`scripts/\`:维护模式脚本、数据库导入导出脚本、数据库 OSS 备份脚本、迁移授权脚本和 Jenkins inbound agent systemd 安装脚本。
|
||||
- \`scripts/\`:维护模式脚本、数据库导入导出脚本、数据库 OSS 备份脚本、生产健康巡检脚本、迁移授权脚本和 Jenkins inbound agent systemd 安装脚本。
|
||||
- \`deploy/\`:systemd、Nginx 和生产环境变量示例;\`deploy/nginx/genarrative-dev-http.conf\` 仅供无域名开发服初始化使用。
|
||||
|
||||
## 生产部署口径
|
||||
|
||||
64
scripts/check-production-ops-guardrails.mjs
Normal file
64
scripts/check-production-ops-guardrails.mjs
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import {readFileSync} from 'node:fs';
|
||||
|
||||
/**
 * Guardrails that keep the production ops wiring from regressing.
 *
 * Each entry names a repository file (relative to the current working
 * directory), a literal substring that file must contain, and the reason
 * printed when the substring is missing.
 */
const checks = [
  {
    file: 'deploy/systemd/genarrative-database-backup.service',
    includes: '--restart-service-after genarrative-api.service',
    reason: '生产冷备份恢复 SpacetimeDB 后必须显式拉起依赖它的 API 服务。',
  },
  {
    file: 'deploy/systemd/genarrative-health-patrol.service',
    includes: 'scripts/ops/production-health-patrol.mjs',
    reason: '健康巡检 systemd service 必须调用随 API release 发布的巡检脚本。',
  },
  {
    file: 'deploy/systemd/genarrative-health-patrol.timer',
    includes: 'genarrative-health-patrol.service',
    reason: '健康巡检 timer 必须绑定巡检 service。',
  },
  {
    file: 'scripts/jenkins-server-provision.sh',
    includes: 'genarrative-health-patrol.timer',
    reason: 'Server-Provision 必须安装并启用健康巡检 timer。',
  },
  {
    file: 'scripts/build-production-release.sh',
    includes: 'production-health-patrol.mjs',
    reason: '生产 API release 必须携带健康巡检脚本。',
  },
  {
    file: 'scripts/deploy/production-api-deploy.sh',
    includes: 'production-health-patrol.mjs',
    reason: 'API deploy 必须把健康巡检脚本复制到 current release。',
  },
  {
    file: 'jenkins/Jenkinsfile.production-api-build',
    includes: 'scripts/ops/production-health-patrol.mjs',
    reason: 'API Build 归档必须包含健康巡检脚本。',
  },
  {
    file: 'jenkins/Jenkinsfile.production-api-deploy',
    includes: 'scripts/ops/production-health-patrol.mjs',
    reason: 'API Deploy 复制上游产物时必须包含健康巡检脚本。',
  },
];

/**
 * Evaluates a single guardrail.
 *
 * @param {{file: string, includes: string, reason: string}} check
 * @returns {string} The failure message, or '' when the guardrail passes.
 */
function evaluateCheck(check) {
  let content;
  try {
    content = readFileSync(check.file, 'utf8');
  } catch (error) {
    // A missing or unreadable file is a guardrail failure, not a crash: keep
    // going so every broken guardrail is reported in one run.
    const detail = error instanceof Error ? error.message : String(error);
    return `[check:production-ops] 无法读取 ${check.file}: ${detail}。${check.reason}`;
  }

  if (content.includes(check.includes)) {
    return '';
  }
  return `[check:production-ops] ${check.file} 缺少 ${check.includes}。${check.reason}`;
}

const failures = checks.map(evaluateCheck).filter(Boolean);

for (const failure of failures) {
  console.error(failure);
}

if (failures.length > 0) {
  process.exit(1);
}

console.log('[check:production-ops] OK');
|
||||
@@ -334,7 +334,9 @@ chmod +x "${RELEASE_DIR}/api-server"
|
||||
|
||||
BACKUP_SCRIPT_SOURCE="${SOURCE_DIR}/scripts/database-backup-to-oss.mjs"
|
||||
WORKSPACE_BACKUP_SCRIPT_SOURCE="$(cd "${SCRIPT_DIR}/../.." && pwd)/scripts/database-backup-to-oss.mjs"
|
||||
mkdir -p "${RELEASE_DIR}/scripts"
|
||||
HEALTH_PATROL_SCRIPT_SOURCE="${SOURCE_DIR}/scripts/ops/production-health-patrol.mjs"
|
||||
WORKSPACE_HEALTH_PATROL_SCRIPT_SOURCE="$(cd "${SCRIPT_DIR}/../.." && pwd)/scripts/ops/production-health-patrol.mjs"
|
||||
mkdir -p "${RELEASE_DIR}/scripts" "${RELEASE_DIR}/scripts/ops"
|
||||
if [[ ! -f "${BACKUP_SCRIPT_SOURCE}" ]]; then
|
||||
if [[ -f "${WORKSPACE_BACKUP_SCRIPT_SOURCE}" ]]; then
|
||||
echo "[production-api-deploy] 发布产物缺少 scripts/database-backup-to-oss.mjs,回退使用部署工作区脚本;请重新触发包含该脚本的 API 构建。" >&2
|
||||
@@ -346,6 +348,19 @@ if [[ ! -f "${BACKUP_SCRIPT_SOURCE}" ]]; then
|
||||
fi
|
||||
cp "${BACKUP_SCRIPT_SOURCE}" "${RELEASE_DIR}/scripts/database-backup-to-oss.mjs"
|
||||
chmod 0644 "${RELEASE_DIR}/scripts/database-backup-to-oss.mjs"
|
||||
if [[ ! -f "${HEALTH_PATROL_SCRIPT_SOURCE}" ]]; then
|
||||
if [[ -f "${WORKSPACE_HEALTH_PATROL_SCRIPT_SOURCE}" ]]; then
|
||||
echo "[production-api-deploy] 发布产物缺少 scripts/ops/production-health-patrol.mjs,回退使用部署工作区脚本;请重新触发包含该脚本的 API 构建。" >&2
|
||||
HEALTH_PATROL_SCRIPT_SOURCE="${WORKSPACE_HEALTH_PATROL_SCRIPT_SOURCE}"
|
||||
else
|
||||
echo "[production-api-deploy] 未找到生产健康巡检脚本,跳过复制;genarrative-health-patrol.service 会因脚本缺失而跳过执行。" >&2
|
||||
HEALTH_PATROL_SCRIPT_SOURCE=""
|
||||
fi
|
||||
fi
|
||||
if [[ -n "${HEALTH_PATROL_SCRIPT_SOURCE}" ]]; then
|
||||
cp "${HEALTH_PATROL_SCRIPT_SOURCE}" "${RELEASE_DIR}/scripts/ops/production-health-patrol.mjs"
|
||||
chmod 0644 "${RELEASE_DIR}/scripts/ops/production-health-patrol.mjs"
|
||||
fi
|
||||
|
||||
if [[ -f "${SOURCE_DIR}/release-manifest.json" ]]; then
|
||||
cp "${SOURCE_DIR}/release-manifest.json" "${RELEASE_DIR}/release-manifest.api-server.json"
|
||||
|
||||
@@ -732,10 +732,20 @@ render_database_backup_service() {
|
||||
deploy/systemd/genarrative-database-backup.service
|
||||
}
|
||||
|
||||
# Print the health patrol systemd unit to stdout, with every occurrence of the
# stock /opt/genarrative/current path replaced by ${CURRENT_LINK}.
render_health_patrol_service() {
  local current_link_replacement
  current_link_replacement="$(escape_sed_replacement "${CURRENT_LINK}")"

  sed -e "s|/opt/genarrative/current|${current_link_replacement}|g" \
    deploy/systemd/genarrative-health-patrol.service
}
|
||||
|
||||
require_path deploy/systemd/spacetimedb.service
|
||||
require_path deploy/systemd/genarrative-api.service
|
||||
require_path deploy/systemd/genarrative-database-backup.service
|
||||
require_path deploy/systemd/genarrative-database-backup.timer
|
||||
require_path deploy/systemd/genarrative-health-patrol.service
|
||||
require_path deploy/systemd/genarrative-health-patrol.timer
|
||||
require_path deploy/systemd/otelcol-contrib.service
|
||||
require_path deploy/otelcol/genarrative-debug.yaml
|
||||
require_path deploy/nginx/genarrative.conf
|
||||
@@ -754,7 +764,7 @@ echo "[server-provision] target=${DEPLOY_TARGET}, dry_run=${DRY_RUN}, nginx_conf
|
||||
run_cmd id
|
||||
require_root_for_real_provision
|
||||
install_nginx_brotli_modules
|
||||
run_cmd mkdir -p "${SPACETIME_ROOT}" "${RELEASE_ROOT}" "$(dirname "${CURRENT_LINK}")" "$(dirname "${WEB_LINK}")" /etc/genarrative /var/lib/genarrative/maintenance /var/lib/genarrative/auth /var/lib/genarrative/tracking-outbox /var/lib/genarrative/database-backups
|
||||
run_cmd mkdir -p "${SPACETIME_ROOT}" "${RELEASE_ROOT}" "$(dirname "${CURRENT_LINK}")" "$(dirname "${WEB_LINK}")" /etc/genarrative /var/lib/genarrative/maintenance /var/lib/genarrative/auth /var/lib/genarrative/tracking-outbox /var/lib/genarrative/database-backups /var/lib/genarrative/health-patrol
|
||||
|
||||
if ! id spacetimedb >/dev/null 2>&1; then
|
||||
run_cmd useradd --system --home-dir "${SPACETIME_ROOT}" --shell /usr/sbin/nologin spacetimedb
|
||||
@@ -786,14 +796,18 @@ sync_spacetime_install "${SPACETIME_ROOT}"
|
||||
spacetimedb_service="$(mktemp)"
|
||||
api_service="$(mktemp)"
|
||||
database_backup_service="$(mktemp)"
|
||||
health_patrol_service="$(mktemp)"
|
||||
render_spacetimedb_service >"${spacetimedb_service}"
|
||||
render_api_service >"${api_service}"
|
||||
render_database_backup_service >"${database_backup_service}"
|
||||
render_health_patrol_service >"${health_patrol_service}"
|
||||
install_file "${spacetimedb_service}" /etc/systemd/system/spacetimedb.service 0644
|
||||
install_file "${api_service}" /etc/systemd/system/genarrative-api.service 0644
|
||||
install_file "${database_backup_service}" /etc/systemd/system/genarrative-database-backup.service 0644
|
||||
install_file deploy/systemd/genarrative-database-backup.timer /etc/systemd/system/genarrative-database-backup.timer 0644
|
||||
rm -f "${spacetimedb_service}" "${api_service}" "${database_backup_service}"
|
||||
install_file "${health_patrol_service}" /etc/systemd/system/genarrative-health-patrol.service 0644
|
||||
install_file deploy/systemd/genarrative-health-patrol.timer /etc/systemd/system/genarrative-health-patrol.timer 0644
|
||||
rm -f "${spacetimedb_service}" "${api_service}" "${database_backup_service}" "${health_patrol_service}"
|
||||
|
||||
if [[ ! -f "${API_ENV_FILE}" ]]; then
|
||||
echo "+ create ${API_ENV_FILE} from example"
|
||||
@@ -828,7 +842,7 @@ if [[ "${ENABLE_SERVICES}" == "true" ]]; then
|
||||
if [[ "${ENABLE_OTELCOL:-true}" == "true" ]]; then
|
||||
run_cmd systemctl enable otelcol-contrib.service
|
||||
fi
|
||||
run_cmd systemctl enable spacetimedb.service genarrative-api.service genarrative-database-backup.timer
|
||||
run_cmd systemctl enable spacetimedb.service genarrative-api.service genarrative-database-backup.timer genarrative-health-patrol.timer
|
||||
if [[ "${ENABLE_OTELCOL:-true}" == "true" ]]; then
|
||||
run_cmd systemctl restart otelcol-contrib.service
|
||||
fi
|
||||
|
||||
477
scripts/ops/production-health-patrol.mjs
Normal file
477
scripts/ops/production-health-patrol.mjs
Normal file
@@ -0,0 +1,477 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import {execFile} from 'node:child_process';
|
||||
import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
import {mkdir, writeFile} from 'node:fs/promises';
|
||||
import {dirname} from 'node:path';
|
||||
|
||||
/** Severity ordering; a higher rank wins when check statuses are combined. */
const STATUS_RANK = {OK: 0, WARNING: 1, CRITICAL: 2};

/** Public API paths probed when no --public-path option is given. */
const DEFAULT_PUBLIC_PATHS = [
  '/api/creation-entry/config',
  '/api/runtime/puzzle/gallery',
  '/api/runtime/custom-world-gallery',
];

/** systemd units whose `is-active` state is verified on every patrol run. */
const DEFAULT_SERVICES = ['genarrative-api.service', 'spacetimedb.service', 'nginx.service'];
|
||||
|
||||
/**
 * Prints the command-line help to stdout.
 *
 * The documented defaults mirror the fallbacks applied in parseArgs; in
 * particular the public base URL falls back to the API base URL setting
 * rather than to a bare http://127.0.0.1.
 */
function usage() {
  console.log(`Usage:
  node scripts/ops/production-health-patrol.mjs [options]

Options:
  --api-base-url <url>        API direct base URL, default http://127.0.0.1:8082
  --spacetime-base-url <url>  SpacetimeDB base URL, default http://127.0.0.1:3101
  --public-base-url <url>     Nginx/public base URL, default GENARRATIVE_HEALTH_PATROL_API_BASE_URL or http://127.0.0.1:8082
  --public-path <path>        Public API path to probe; repeatable
  --status-file <path>        Write the last patrol result as JSON
  --timeout-ms <ms>           HTTP/command timeout, default 5000
  --slow-ms <ms>              Mark successful probes slower than this as WARNING, default 3000
  --fail-on-warning           Exit 1 when the total status is WARNING
  --skip-journal              Skip recent journal error scan
  --json                      Print JSON instead of text

Environment:
  GENARRATIVE_HEALTH_PATROL_WEBHOOK_URL  POST the result as JSON to this URL when the status is not OK
`);
}
|
||||
|
||||
/**
 * Reads an environment variable as a boolean flag.
 *
 * @param {string} name Environment variable name.
 * @param {boolean} [fallback=false] Returned when the variable is unset or empty.
 * @returns {boolean} True only for 1/true/yes/on (trimmed, case-insensitive).
 */
function readBoolEnv(name, fallback = false) {
  const raw = process.env[name];
  if (raw) {
    const normalized = raw.trim().toLowerCase();
    return ['1', 'true', 'yes', 'on'].includes(normalized);
  }
  return fallback;
}
|
||||
|
||||
/**
 * Parses a base-10 integer, accepting only finite values greater than zero.
 *
 * @param {unknown} raw Value to parse; null/undefined are treated as ''.
 * @param {number} fallback Returned when parsing fails or the value is <= 0.
 * @returns {number} The parsed integer or the fallback.
 */
function parsePositiveInt(raw, fallback) {
  const parsed = Number.parseInt(String(raw ?? ''), 10);
  if (!Number.isFinite(parsed)) {
    return fallback;
  }
  if (parsed > 0) {
    return parsed;
  }
  return fallback;
}
|
||||
|
||||
/**
 * Builds the patrol configuration from environment variables and CLI flags.
 *
 * Environment variables (GENARRATIVE_HEALTH_PATROL_*) provide the initial
 * values; CLI flags override them. `-h` / `--help` prints the usage text and
 * exits the process with code 0.
 *
 * @param {string[]} argv Command-line arguments without the node/script prefix.
 * @returns {object} The resolved configuration.
 * @throws {Error} On an unknown flag or a flag that is missing its value.
 */
function parseArgs(argv) {
  const env = process.env;
  const config = {
    apiBaseUrl: env.GENARRATIVE_HEALTH_PATROL_API_BASE_URL || 'http://127.0.0.1:8082',
    spacetimeBaseUrl:
      env.GENARRATIVE_HEALTH_PATROL_SPACETIME_BASE_URL || 'http://127.0.0.1:3101',
    // Without an explicit public URL the public probes go to the API directly.
    publicBaseUrl:
      env.GENARRATIVE_HEALTH_PATROL_PUBLIC_BASE_URL ||
      env.GENARRATIVE_HEALTH_PATROL_API_BASE_URL ||
      'http://127.0.0.1:8082',
    publicPaths: [],
    statusFile: env.GENARRATIVE_HEALTH_PATROL_STATUS_FILE || '',
    timeoutMs: parsePositiveInt(env.GENARRATIVE_HEALTH_PATROL_TIMEOUT_MS, 5000),
    slowMs: parsePositiveInt(env.GENARRATIVE_HEALTH_PATROL_SLOW_MS, 3000),
    failOnWarning: readBoolEnv('GENARRATIVE_HEALTH_PATROL_FAIL_ON_WARNING'),
    skipJournal: readBoolEnv('GENARRATIVE_HEALTH_PATROL_SKIP_JOURNAL'),
    json: false,
    webhookUrl: env.GENARRATIVE_HEALTH_PATROL_WEBHOOK_URL || '',
  };

  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];
    switch (arg) {
      case '-h':
      case '--help':
        usage();
        process.exit(0);
        break;
      case '--api-base-url':
        config.apiBaseUrl = requireValue(argv, ++index, arg);
        break;
      case '--spacetime-base-url':
        config.spacetimeBaseUrl = requireValue(argv, ++index, arg);
        break;
      case '--public-base-url':
        config.publicBaseUrl = requireValue(argv, ++index, arg);
        break;
      case '--public-path':
        config.publicPaths.push(requireValue(argv, ++index, arg));
        break;
      case '--status-file':
        config.statusFile = requireValue(argv, ++index, arg);
        break;
      case '--timeout-ms':
        // An unparsable value keeps the already-resolved setting (env or
        // default) instead of discarding a valid environment override.
        config.timeoutMs = parsePositiveInt(requireValue(argv, ++index, arg), config.timeoutMs);
        break;
      case '--slow-ms':
        config.slowMs = parsePositiveInt(requireValue(argv, ++index, arg), config.slowMs);
        break;
      case '--fail-on-warning':
        config.failOnWarning = true;
        break;
      case '--skip-journal':
        config.skipJournal = true;
        break;
      case '--json':
        config.json = true;
        break;
      default:
        throw new Error(`未知参数: ${arg}`);
    }
  }

  if (config.publicPaths.length === 0) {
    // Copy so later changes to the config never leak into the shared defaults.
    config.publicPaths = [...DEFAULT_PUBLIC_PATHS];
  }

  return config;
}
|
||||
|
||||
/**
 * Returns the value that follows a flag on the command line.
 *
 * @param {string[]} argv Full argument list.
 * @param {number} index Position where the value is expected.
 * @param {string} flag Flag name, used in the error message.
 * @returns {string} The value at `index`.
 * @throws {Error} When the value is absent, empty, or starts with `--`.
 */
function requireValue(argv, index, flag) {
  const candidate = argv[index];
  const isUsable = Boolean(candidate) && !candidate.startsWith('--');
  if (isUsable) {
    return candidate;
  }
  throw new Error(`${flag} 缺少参数值`);
}
|
||||
|
||||
/**
 * Joins a base URL and a path with exactly one slash between them.
 *
 * Only a single trailing slash is removed from the base, and a leading slash
 * is added to the path when it has none.
 *
 * @param {string} baseUrl Base URL, with or without a trailing slash.
 * @param {string} path Path, with or without a leading slash.
 * @returns {string} The concatenated URL string.
 */
function joinUrl(baseUrl, path) {
  const trimmedBase = baseUrl.replace(/\/$/, '');
  if (path.startsWith('/')) {
    return trimmedBase + path;
  }
  return `${trimmedBase}/${path}`;
}
|
||||
|
||||
/**
 * Picks the most severe status among the given checks.
 *
 * @param {Array<{status: string}>} checks Check results.
 * @returns {string} The status with the highest STATUS_RANK; 'OK' when the
 *   list is empty or no status outranks 'OK'.
 */
function maxStatus(checks) {
  let worst = 'OK';
  for (const check of checks) {
    if (STATUS_RANK[check.status] > STATUS_RANK[worst]) {
      worst = check.status;
    }
  }
  return worst;
}
|
||||
|
||||
/**
 * Builds one check result record.
 *
 * @param {string} name Check identifier.
 * @param {string} status Status label for the check.
 * @param {string} summary Human-readable one-line outcome.
 * @param {object} [details={}] Extra fields merged into the record; they are
 *   applied last, so a clashing key replaces name/status/summary.
 * @returns {object} The merged record.
 */
function checkResult(name, status, summary, details = {}) {
  const core = {name, status, summary};
  return Object.assign(core, details);
}
|
||||
|
||||
/**
 * Runs a command with execFile and reports the outcome without rejecting.
 *
 * @param {string} command Executable to run.
 * @param {string[]} args Arguments passed to the executable.
 * @param {number} timeoutMs Passed to execFile as its `timeout` option.
 * @returns {Promise<object>} Resolves with command, code, signal, stdout,
 *   stderr, timedOut and error. `code` is the numeric exit code when the error
 *   carries one, 1 for any other error, and 0 on success.
 */
function runCommand(command, args, timeoutMs) {
  const execOptions = {
    timeout: timeoutMs,
    windowsHide: true,
    maxBuffer: 256 * 1024,
  };

  return new Promise((resolve) => {
    const onFinished = (error, stdout, stderr) => {
      let code = 0;
      if (typeof error?.code === 'number') {
        code = error.code;
      } else if (error) {
        // Non-numeric codes (for example a spawn failure) collapse to 1.
        code = 1;
      }

      resolve({
        command: [command, ...args].join(' '),
        code,
        signal: error?.signal || '',
        stdout: String(stdout || ''),
        stderr: String(stderr || ''),
        timedOut: Boolean(error?.killed),
        error: error ? error.message : '',
      });
    };

    execFile(command, args, execOptions, onFinished);
  });
}
|
||||
|
||||
/**
 * Checks a systemd unit with `systemctl is-active`.
 *
 * @param {string} serviceName Unit name, e.g. `nginx.service`.
 * @param {number} timeoutMs Timeout handed to the command runner.
 * @returns {Promise<object>} An OK result when the command exits 0 and prints
 *   `active`; otherwise a CRITICAL result carrying the reported state.
 */
async function checkService(serviceName, timeoutMs) {
  const checkName = `service:${serviceName}`;
  const outcome = await runCommand('systemctl', ['is-active', serviceName], timeoutMs);
  const {command, code} = outcome;

  // Prefer stdout, then stderr, then the spawn error text as the state label.
  const state = outcome.stdout.trim() || outcome.stderr.trim() || outcome.error;
  const isActive = code === 0 && state === 'active';

  if (!isActive) {
    return checkResult(checkName, 'CRITICAL', `服务状态异常: ${state || `exit ${code}`}`, {
      command,
      stderr: outcome.stderr.trim(),
    });
  }

  return checkResult(checkName, 'OK', 'active', {command});
}
|
||||
|
||||
/**
 * Performs a GET request and reports the outcome without ever rejecting.
 *
 * The returned promise always settles exactly once:
 *  - `{elapsedMs, statusCode, body}` when the response is fully received
 *    (body is capped at 2048 characters);
 *  - `{elapsedMs, error}` on an invalid URL or protocol, a request error, a
 *    timeout, or a response that is cut off before it completes.
 *
 * @param {string} url Absolute http: or https: URL.
 * @param {number} timeoutMs Socket idle timeout in milliseconds.
 * @returns {Promise<object>} The probe outcome described above.
 */
function requestUrl(url, timeoutMs) {
  return new Promise((resolve) => {
    const startedAt = Date.now();
    let settled = false;

    // Several events can report the same failure; only the first one counts.
    const settle = (outcome) => {
      if (settled) {
        return;
      }
      settled = true;
      resolve({elapsedMs: Date.now() - startedAt, ...outcome});
    };
    const fail = (error) => {
      settle({error: error instanceof Error ? error.message : String(error)});
    };

    let request;
    try {
      const parsed = new URL(url);
      const client = parsed.protocol === 'https:' ? https : http;
      request = client.request(
        parsed,
        {
          method: 'GET',
          timeout: timeoutMs,
          headers: {
            'User-Agent': 'genarrative-health-patrol/1.0',
            Accept: 'application/json,text/plain,*/*',
          },
        },
        (response) => {
          let body = '';
          response.setEncoding('utf8');
          response.on('data', (chunk) => {
            if (body.length < 2048) {
              body += chunk;
            }
          });
          response.on('end', () => {
            settle({
              statusCode: response.statusCode || 0,
              body: body.slice(0, 2048),
            });
          });
          // A response truncated after the headers never emits 'end'; without
          // these handlers the promise would stay pending forever.
          response.on('error', fail);
          response.on('close', () => {
            if (!response.complete) {
              fail(new Error('response closed before completion'));
            }
          });
        },
      );
    } catch (error) {
      // Invalid URL or unsupported protocol: report it as a failed probe.
      fail(error);
      return;
    }

    request.on('timeout', () => {
      request.destroy(new Error(`timeout after ${timeoutMs}ms`));
    });
    request.on('error', fail);
    request.end();
  });
}
|
||||
|
||||
/**
 * Probes a URL and grades the response.
 *
 * CRITICAL when the request fails or the status is outside 2xx; WARNING when
 * a 2xx response took longer than `config.slowMs`; OK otherwise. Each result
 * carries an equivalent curl command line and the elapsed time.
 *
 * @param {string} name Check identifier.
 * @param {string} url URL to request.
 * @param {{timeoutMs: number, slowMs: number}} config Patrol configuration.
 * @returns {Promise<object>} The check result.
 */
async function checkHttp(name, url, config) {
  const response = await requestUrl(url, config.timeoutMs);
  const probe = {
    command: `curl -fsS --max-time ${Math.ceil(config.timeoutMs / 1000)} ${url}`,
    elapsedMs: response.elapsedMs,
  };

  if (response.error) {
    return checkResult(name, 'CRITICAL', `请求失败: ${response.error}`, probe);
  }

  const {statusCode, elapsedMs} = response;
  const isSuccess = statusCode >= 200 && statusCode < 300;
  if (!isSuccess) {
    return checkResult(name, 'CRITICAL', `HTTP ${statusCode},耗时 ${elapsedMs}ms`, {
      ...probe,
      body: response.body.trim(),
    });
  }

  const isSlow = elapsedMs > config.slowMs;
  return isSlow
    ? checkResult(name, 'WARNING', `HTTP ${statusCode} 但耗时偏高: ${elapsedMs}ms`, probe)
    : checkResult(name, 'OK', `HTTP ${statusCode} ${elapsedMs}ms`, probe);
}
|
||||
|
||||
/**
 * Scans the journal of the API, SpacetimeDB and Nginx units for entries of
 * priority err..alert from the last 15 minutes (at most 20 lines).
 *
 * @param {{timeoutMs: number}} config Patrol configuration.
 * @returns {Promise<object>} WARNING when journalctl fails or any matching
 *   line is found (the lines are attached), OK when there are none.
 */
async function checkRecentJournal(config) {
  const checkName = 'journal:recent-errors';
  const units = ['genarrative-api.service', 'spacetimedb.service', 'nginx.service'];
  const journalArgs = [
    ...units.flatMap((unit) => ['-u', unit]),
    '--since',
    '15 minutes ago',
    '-p',
    'err..alert',
    '--no-pager',
    '-o',
    'short-iso',
    '-n',
    '20',
  ];

  const outcome = await runCommand('journalctl', journalArgs, config.timeoutMs);
  const {command} = outcome;

  if (outcome.code !== 0) {
    return checkResult(checkName, 'WARNING', '无法读取最近错误日志', {
      command,
      stderr: outcome.stderr.trim() || outcome.error,
    });
  }

  // Keep non-empty lines, dropping journalctl's "no entries" placeholder.
  const lines = [];
  for (const rawLine of outcome.stdout.split('\n')) {
    const line = rawLine.trim();
    if (line && line !== '-- No entries --') {
      lines.push(line);
    }
  }

  if (lines.length > 0) {
    return checkResult(checkName, 'WARNING', `最近 15 分钟有 ${lines.length} 条 err..alert 日志`, {
      command,
      lines,
    });
  }

  return checkResult(checkName, 'OK', '最近 15 分钟无 err..alert 日志', {command});
}
|
||||
|
||||
/**
 * Persists the patrol result as pretty-printed JSON with a trailing newline.
 *
 * Does nothing when no status file is configured. The parent directory is
 * created if it does not exist yet.
 *
 * @param {string} statusFile Destination path; empty disables the write.
 * @param {object} payload Patrol result to serialize.
 * @returns {Promise<void>}
 */
async function writeStatusFile(statusFile, payload) {
  if (statusFile) {
    const parentDir = dirname(statusFile);
    await mkdir(parentDir, {recursive: true});

    const serialized = JSON.stringify(payload, null, 2);
    await writeFile(statusFile, `${serialized}\n`, 'utf8');
  }
}
|
||||
|
||||
/**
 * Best-effort POST of a non-OK patrol result to the configured webhook.
 *
 * Nothing is sent when no webhook URL is configured or the status is 'OK'.
 * Delivery problems (invalid URL, connection error, timeout, truncated
 * response, HTTP status >= 400) are logged to stderr and never thrown, so a
 * broken webhook cannot hide or replace the patrol's own result.
 *
 * @param {{webhookUrl: string, timeoutMs: number}} config Patrol configuration.
 * @param {{status: string}} payload Patrol result, sent as the JSON body.
 * @returns {Promise<void>} Settles once the attempt has finished.
 */
async function notifyWebhook(config, payload) {
  if (!config.webhookUrl || payload.status === 'OK') {
    return;
  }

  const reportFailure = (reason) => {
    console.error(`[health-patrol] webhook notify failed: ${reason}`);
  };
  const describe = (error) => (error instanceof Error ? error.message : String(error));

  let target;
  try {
    target = new URL(config.webhookUrl);
  } catch (error) {
    reportFailure(describe(error));
    return;
  }

  const body = JSON.stringify(payload);
  const client = target.protocol === 'https:' ? https : http;

  await new Promise((resolve) => {
    let request;
    try {
      request = client.request(
        target,
        {
          method: 'POST',
          timeout: config.timeoutMs,
          headers: {
            'Content-Type': 'application/json',
            'Content-Length': Buffer.byteLength(body),
          },
        },
        (response) => {
          if ((response.statusCode || 0) >= 400) {
            reportFailure(`HTTP ${response.statusCode}`);
          }
          response.resume();
          response.on('error', (error) => {
            reportFailure(describe(error));
          });
          response.on('end', resolve);
          // 'end' is not emitted for a truncated response; 'close' is, so the
          // promise cannot stay pending.
          response.on('close', resolve);
        },
      );
    } catch (error) {
      // Unsupported protocol or invalid request options.
      reportFailure(describe(error));
      resolve();
      return;
    }

    request.on('timeout', () => {
      request.destroy(new Error(`timeout after ${config.timeoutMs}ms`));
    });
    request.on('error', (error) => {
      reportFailure(describe(error));
      resolve();
    });
    request.end(body);
  });
}
|
||||
|
||||
/**
 * Prints the patrol result as plain text on stdout.
 *
 * A header line is followed by one line per check. The command is shown only
 * for checks that are not OK; stderr, body and journal lines are shown
 * whenever the check carries them.
 *
 * @param {{status: string, checkedAt: string, checks: object[]}} payload
 */
function printText(payload) {
  const {status: overallStatus, checkedAt, checks} = payload;
  console.log(`[health-patrol] ${overallStatus} ${checkedAt}`);

  for (const check of checks) {
    const {status, name, summary, command, stderr, body, lines} = check;
    console.log(`[${status}] ${name}: ${summary}`);

    if (status !== 'OK' && command) {
      console.log(` command: ${command}`);
    }
    if (stderr) {
      console.log(` stderr: ${stderr}`);
    }
    if (body) {
      console.log(` body: ${body}`);
    }

    const journalLines = Array.isArray(lines) ? lines : [];
    for (const line of journalLines) {
      console.log(` ${line}`);
    }
  }
}
|
||||
|
||||
/**
 * Runs one patrol pass: service states, API / SpacetimeDB / public HTTP
 * probes and, unless skipped, a recent journal scan. The result is written
 * to the status file, sent to the webhook when not OK, and printed as text
 * or JSON.
 *
 * Exit codes: 2 when the overall status is CRITICAL, 1 when it is WARNING and
 * failOnWarning is set, otherwise the process ends normally.
 */
async function main() {
  const config = parseArgs(process.argv.slice(2));
  const checks = [];

  // Checks run one after another so the probes do not compete with each
  // other while their latency is being measured.
  for (const serviceName of DEFAULT_SERVICES) {
    checks.push(await checkService(serviceName, config.timeoutMs));
  }

  checks.push(await checkHttp('api:/healthz', joinUrl(config.apiBaseUrl, '/healthz'), config));
  checks.push(await checkHttp('api:/readyz', joinUrl(config.apiBaseUrl, '/readyz'), config));
  checks.push(
    await checkHttp(
      'spacetimedb:/v1/ping',
      joinUrl(config.spacetimeBaseUrl, '/v1/ping'),
      config,
    ),
  );

  for (const path of config.publicPaths) {
    checks.push(
      await checkHttp(`public:${path}`, joinUrl(config.publicBaseUrl, path), config),
    );
  }

  if (!config.skipJournal) {
    checks.push(await checkRecentJournal(config));
  }

  // HOSTNAME is a shell variable that is often absent from a service
  // environment; fall back to the OS-reported name so reports identify the host.
  const {hostname} = await import('node:os');

  const payload = {
    status: maxStatus(checks),
    checkedAt: new Date().toISOString(),
    host: process.env.HOSTNAME || hostname(),
    checks,
  };

  await writeStatusFile(config.statusFile, payload);
  await notifyWebhook(config, payload);

  if (config.json) {
    console.log(JSON.stringify(payload, null, 2));
  } else {
    printText(payload);
  }

  if (payload.status === 'CRITICAL') {
    process.exit(2);
  }
  if (payload.status === 'WARNING' && config.failOnWarning) {
    process.exit(1);
  }
}

// Any unexpected failure of the patrol itself is reported as CRITICAL.
main().catch((error) => {
  console.error(`[health-patrol] CRITICAL ${error instanceof Error ? error.message : String(error)}`);
  process.exit(2);
});
|
||||
Reference in New Issue
Block a user