feat(api-server): add container loadtest observability
This commit is contained in:
99
scripts/container-compose.mjs
Normal file
99
scripts/container-compose.mjs
Normal file
@@ -0,0 +1,99 @@
|
||||
import {spawn} from 'node:child_process';
|
||||
import {copyFileSync, existsSync} from 'node:fs';
|
||||
import path from 'node:path';
|
||||
|
||||
// Entry point: `node scripts/container-compose.mjs <command> [docker compose args]`.
// Parses the command, makes sure the api-server env file exists, then delegates
// to `docker compose` with the load-test compose file.
const [, , rawCommand = 'help', ...args] = process.argv;
const command = rawCommand.trim();

// `--print` is a wrapper-only flag (consumed by the `config` command); it is
// stripped here so it is never forwarded to docker compose.
const printComposeConfig = args.includes('--print');
const passthroughArgs = args.filter((arg) => arg !== '--print');

// All paths below are relative, so they resolve against the current working
// directory, which is also the cwd handed to the docker child process.
const projectRoot = process.cwd();
const composeFile = path.join('deploy', 'container', 'docker-compose.loadtest.yml');
const envExamplePath = path.join('deploy', 'container', 'api-server.env.example');
const envPath = path.join('deploy', 'container', 'api-server.env');

const supportedCommands = new Set(['init', 'build', 'up', 'down', 'logs', 'ps', 'config', 'k6']);

// `help` prints usage to stdout and succeeds; anything unrecognised prints the
// same usage to stderr and fails, after naming the rejected command.
if (command === 'help' || !supportedCommands.has(command)) {
  const isUnknownCommand = command !== 'help';
  if (isUnknownCommand) {
    console.error(`[container] 未知命令: ${command}`);
  }
  printHelp(isUnknownCommand);
  process.exit(isUnknownCommand ? 1 : 0);
}

// `init` only seeds the env file; it never invokes docker.
if (command === 'init') {
  ensureEnvFile();
  process.exit(0);
}

// Every other command needs the env file. When it is missing, generate it from
// the example and stop with a failure so the user reviews it before running.
if (!existsSync(envPath)) {
  ensureEnvFile();
  console.error('[container] 请先检查 deploy/container/api-server.env 中的 SpacetimeDB 地址、库名和 token。');
  process.exit(1);
}

const composeArgs = buildComposeArgs(command, passthroughArgs);

// Arguments are passed as an array with `shell: false`, so they are not
// interpreted by a shell. stdio is inherited so docker output streams through.
const child = spawn('docker', composeArgs, {
  cwd: projectRoot,
  env: process.env,
  stdio: 'inherit',
  shell: false,
});

// Emitted when the docker process could not be spawned at all.
child.on('error', (error) => {
  console.error(`[container] docker compose 启动失败: ${error.message}`);
  console.error('[container] 请确认 Docker Desktop 或 Docker Engine 已安装,并且 docker 在 PATH 中。');
  process.exit(1);
});

// Mirror the child's exit code; termination by signal is reported as failure.
child.on('exit', (code, signal) => {
  if (signal) {
    console.error(`[container] docker compose 被信号终止: ${signal}`);
    process.exit(1);
  }
  process.exit(code ?? 0);
});
|
||||
|
||||
/**
 * Builds the argument vector passed to the `docker` executable for a wrapper command.
 *
 * Every result starts with `compose -f <composeFile>`. Reads the module-level
 * `composeFile` path and the module-level `printComposeConfig` flag.
 *
 * @param {string} selectedCommand - One of `build`, `up`, `down`, `logs`, `ps`, `config`, `k6`.
 * @param {string[]} [extraArgs=[]] - Extra arguments appended verbatim after the subcommand.
 * @returns {string[]} Arguments for `docker`.
 * @throws {Error} When `selectedCommand` is not one of the commands listed above.
 */
function buildComposeArgs(selectedCommand, extraArgs = []) {
  const baseArgs = ['compose', '-f', composeFile];
  switch (selectedCommand) {
    case 'build':
      return [...baseArgs, 'build', ...extraArgs];
    case 'up':
      // Always detached, so the wrapper returns once the services are started.
      return [...baseArgs, 'up', '-d', ...extraArgs];
    case 'down':
      return [...baseArgs, 'down', ...extraArgs];
    case 'logs':
      return [...baseArgs, 'logs', ...extraArgs];
    case 'ps':
      return [...baseArgs, 'ps', ...extraArgs];
    case 'config':
      // Validate quietly by default; the wrapper's `--print` flag drops `--quiet`.
      return [...baseArgs, 'config', ...(printComposeConfig ? [] : ['--quiet']), ...extraArgs];
    case 'k6':
      // One-off run of the `k6` service under the `loadtest` profile; `--rm` removes the container afterwards.
      return [...baseArgs, '--profile', 'loadtest', 'run', '--rm', 'k6', ...extraArgs];
    default:
      throw new Error(`unsupported command: ${selectedCommand}`);
  }
}
|
||||
|
||||
/**
 * Makes sure the api-server env file exists, seeding it from the example file.
 *
 * Reads the module-level `envPath` and `envExamplePath`. An existing env file is
 * never overwritten. If the copy fails because a path does not exist (for
 * example the script was started outside the repository root, since both paths
 * are relative), an actionable message is printed and the process exits with
 * code 1. Any other copy error is rethrown unchanged.
 *
 * @returns {void}
 */
function ensureEnvFile() {
  if (existsSync(envPath)) {
    console.log(`[container] 已存在 ${envPath}`);
    return;
  }

  try {
    copyFileSync(envExamplePath, envPath);
  } catch (error) {
    if (error.code !== 'ENOENT') {
      throw error;
    }
    console.error(`[container] 无法从 ${envExamplePath} 生成 ${envPath}: ${error.message}`);
    console.error('[container] 请确认在仓库根目录下运行,并且 deploy/container/api-server.env.example 存在。');
    process.exit(1);
  }

  console.log(`[container] 已从 ${envExamplePath} 生成 ${envPath}`);
}
|
||||
|
||||
/**
 * Prints the usage text listing every `container:<command>` npm script.
 *
 * @param {boolean} isError - When truthy the text goes to stderr via
 *   `console.error`; otherwise it goes to stdout via `console.log`.
 * @returns {void}
 */
function printHelp(isError) {
  const output = isError ? console.error : console.log;
  output(`Usage: npm run container:<command> -- [docker compose args]

Commands:
container:init 生成 deploy/container/api-server.env
container:build 构建 api-server 容器镜像
container:up 后台启动 api-server + nginx + otelcol
container:down 停止并清理容器
container:logs 查看容器日志
container:ps 查看容器状态
container:config 校验 compose 配置,传 -- --print 可展开完整配置
container:k6 在 compose 网络内运行 k6
`);
}
|
||||
@@ -312,6 +312,25 @@ OTLP logs 是远端观测增量,不替代本地日志;api-server 日志仍
|
||||
|
||||
Rider 的 Logs 面板展示的是 OTLP log event 自身字段,不会自动把父 span 的全部 attributes 摊平到每一条日志。请求完成日志会直接携带 `request_id`、`http.request.method`、`http.route`、`url.scheme`、`url.path`、`http.response.status_code`、`status_class`、`latency_ms` 和 `slow_request`;更完整的请求链路仍在 Traces 面板中按同一个 trace/span 关联查看。
|
||||
|
||||
压测期间可在 Metrics 面板或 debug exporter 中观察进程内存指标:
|
||||
|
||||
- `process.memory.usage`:进程常驻内存 / RSS。
|
||||
- `process.memory.virtual`:进程虚拟内存;Windows 当前按 `PrivateUsage` 上报,Linux 取 `VmSize`。
|
||||
- `genarrative.process.memory.private`:进程私有内存,Windows 来自 `PrivateUsage`,Linux 近似取 `/proc/self/status` 的 `VmData`。
|
||||
- `process.thread.count`:线程数。
|
||||
- `process.windows.handle.count`:Windows 句柄数。
|
||||
- `process.unix.file_descriptor.count`:Linux 文件描述符数。
|
||||
- `genarrative.http.server.response_bodies.in_flight`:Axum / Hyper 仍持有的响应 body 数;如果内存高但该值很低,说明热点不在业务 handler 生命周期内。
|
||||
- `genarrative.http.server.request_permits.available`:应用层 HTTP 背压剩余 permit 数;如果该值未接近 0,说明没有打满 `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS`。
|
||||
- `genarrative.puzzle_gallery.cache.hits` / `genarrative.puzzle_gallery.cache.misses` / `genarrative.puzzle_gallery.cache.rebuilds`:拼图广场响应缓存命中、未命中和重建次数。
|
||||
- `genarrative.puzzle_gallery.cache.rebuild.duration`:拼图广场缓存重建耗时。
|
||||
- `genarrative.puzzle_gallery.cache.data_json_bytes`:拼图广场缓存内预序列化 data JSON 大小。
|
||||
- `genarrative.spacetime.read.calls` / `genarrative.spacetime.read.duration_ms`:SpacetimeDB 订阅本地 cache 读次数和耗时;`read=list_puzzle_gallery` 表示当前路径走 view / local cache,不是 procedure。
|
||||
|
||||
若 `/api/runtime/puzzle/gallery` 单接口压测出现 GB 级瞬时内存峰值,先区分“持续泄漏”和“请求期分配峰值”:关闭 OTEL 后若峰值仍复现且压测结束后回落,主因通常不是 Collector / exporter。当前拼图广场列表命中缓存时应复用 `PuzzleGalleryCache` 中的预序列化 data JSON,只按请求拼接 envelope meta,不应每个请求重新深拷贝 `PuzzleGalleryResponse` 或构造完整 `serde_json::Value`。
|
||||
|
||||
本地 Windows 直连 `api-server` 压测还要单独看 k6 的 VU / 连接模型。已验证在 250 RPS、`PREALLOCATED_VUS=300` 时,哪怕打 `/healthz` 这种小响应,也可能因为本地 300 个 Established 连接触发 `api-server` private memory 瞬时升到约 7GB,压测结束后回落到 100MB 级;同样 250 RPS 改成 `PREALLOCATED_VUS=20 MAX_VUS=40` 后,拼图广场 p95 约 9ms,峰值降到约 600MB。这个现象说明高水位主要来自本机直连连接 / 发送链路,不等价于 SpacetimeDB 或拼图 JSON 缓存泄漏。做本地容量判断时优先让 VU 接近真实并发,避免用过高预分配 VU 把测试变成 Windows 本机连接缓冲压力测试;生产仍以 Nginx upstream keepalive、系统内存和 OTLP 指标一起判断。
|
||||
|
||||
线上回归辅助命令:
|
||||
|
||||
```bash
|
||||
|
||||
Reference in New Issue
Block a user