perf(api-server): tune gallery load shedding
This commit is contained in:
@@ -226,7 +226,7 @@ npm run loadtest:k6:works
|
||||
## 排障
|
||||
|
||||
- 如果公开 gallery 返回 `creation_entry_disabled` 或 503,检查本地 creation entry 配置是否禁用了对应入口。
|
||||
- 如果高压下返回 429,优先确认目标环境是否设置了 `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS`。429 表示 api-server 应用层背压已生效,不等同于业务错误;继续看内存、p95、`http_req_failed` 和 OTLP / Nginx timing 判断阈值是否偏低。
|
||||
- 如果高压下返回 429,优先确认目标环境是否设置了 `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS` 以及 `GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS`、`GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS`、`GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS`。429 表示 Nginx 或 api-server 背压已生效,不等同于业务错误;继续看内存、p95、`http_req_failed` 和 OTLP / Nginx timing 判断阈值是否偏低。
|
||||
- 如果直连 `api-server` 压测出现 `connection refused` 或 status 0,说明压力已经打到 TCP 监听 / accept 层;此时需要同时检查 `GENARRATIVE_API_LISTEN_BACKLOG` 和 Nginx upstream keepalive,并评估是否需要在 Nginx 前置限流,不能只靠应用层背压解释。
|
||||
- 如果个人作品列表返回 401,确认 `AUTH_TOKEN` 是当前 api-server 可识别的 access token。
|
||||
- 如果详情全部 404,确认是否已向目标环境导入与 `WORKS_DATA` 一致的数据。
|
||||
@@ -317,12 +317,14 @@ Rider 的 Logs 面板展示的是 OTLP log event 自身字段,不会自动把
|
||||
- `process.memory.usage`:进程常驻内存 / RSS。
|
||||
- `process.memory.virtual`:进程虚拟内存;Windows 当前按 `PrivateUsage` 上报,Linux 取 `VmSize`。
|
||||
- `genarrative.process.memory.private`:进程私有内存,Windows 来自 `PrivateUsage`,Linux 近似取 `/proc/self/status` 的 `VmData`。
|
||||
- `process.cpu.time`:进程 user + system 累计 CPU 秒数。
|
||||
- `genarrative.process.cpu.usage_percent`:两次指标采集之间的进程 CPU 使用率;100% 约等于占满 1 个 CPU core。
|
||||
- `process.thread.count`:线程数。
|
||||
- `process.windows.handle.count`:Windows 句柄数。
|
||||
- `process.unix.file_descriptor.count`:Linux 文件描述符数。
|
||||
- `genarrative.http.server.response_bodies.in_flight`:Axum / Hyper 仍持有的响应 body 数;如果内存高但该值很低,说明热点不在业务 handler 生命周期内。
|
||||
- `genarrative.http.server.request_permits.available`:应用层 HTTP 背压剩余 permit 数;如果该值未接近 0,说明没有打满 `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS`。
|
||||
- `genarrative.puzzle_gallery.cache.hits` / `genarrative.puzzle_gallery.cache.misses` / `genarrative.puzzle_gallery.cache.rebuilds`:拼图广场响应缓存命中、未命中和重建次数。
|
||||
- `genarrative.http.server.request_permits.available`:应用层 HTTP 背压剩余 permit 数,带 `pool=default|gallery|detail|admin`;如果目标 pool 未接近 0,说明没有打满对应 `GENARRATIVE_API_*_MAX_CONCURRENT_REQUESTS`。
|
||||
- `genarrative.puzzle_gallery.cache.hits` / `genarrative.puzzle_gallery.cache.stale_hits` / `genarrative.puzzle_gallery.cache.misses` / `genarrative.puzzle_gallery.cache.refreshes_started` / `genarrative.puzzle_gallery.cache.refreshes_failed` / `genarrative.puzzle_gallery.cache.rebuilds`:拼图广场响应缓存 fresh 命中、stale 命中、未命中、后台刷新启动、后台刷新失败和重建次数。
|
||||
- `genarrative.puzzle_gallery.cache.rebuild.duration`:拼图广场缓存重建耗时。
|
||||
- `genarrative.puzzle_gallery.cache.data_json_bytes`:拼图广场缓存内预序列化 data JSON 大小。
|
||||
- `genarrative.spacetime.read.calls` / `genarrative.spacetime.read.duration_ms`:SpacetimeDB 订阅本地 cache 读次数和耗时;`read=list_puzzle_gallery` 表示当前路径走 view / local cache,不是 procedure。
|
||||
@@ -336,7 +338,7 @@ Rider 的 Logs 面板展示的是 OTLP log event 自身字段,不会自动把
|
||||
```bash
|
||||
systemctl show genarrative-api.service -p LimitNOFILE -p TasksMax
|
||||
cat /proc/$(pidof api-server)/limits
|
||||
tr '\0' '\n' < /proc/$(pidof api-server)/environ | grep GENARRATIVE_API_MAX_CONCURRENT_REQUESTS
|
||||
tr '\0' '\n' < /proc/$(pidof api-server)/environ | grep 'GENARRATIVE_API_.*MAX_CONCURRENT_REQUESTS'
|
||||
ss -ltnp | grep 8082
|
||||
curl -sS http://127.0.0.1:8082/healthz
|
||||
```
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
{
|
||||
"source": "spacetime-migration-7.local.json",
|
||||
"generatedAt": "2026-05-11T13:09:51.569Z",
|
||||
"source": "spacetime-migration-1.json",
|
||||
"generatedAt": "2026-05-18T11:54:04.280Z",
|
||||
"counts": {
|
||||
"puzzle_work_profile": 3,
|
||||
"custom_world_profile": 1,
|
||||
"match3d_work_profile": 0
|
||||
"match3d_work_profile": 0,
|
||||
"square_hole_work_profile": 0,
|
||||
"visual_novel_work_profile": 0
|
||||
},
|
||||
"tables": {
|
||||
"puzzle_work_profile": [
|
||||
@@ -113,7 +115,9 @@
|
||||
}
|
||||
}
|
||||
],
|
||||
"match3d_work_profile": []
|
||||
"match3d_work_profile": [],
|
||||
"square_hole_work_profile": [],
|
||||
"visual_novel_work_profile": []
|
||||
},
|
||||
"profileIds": {
|
||||
"puzzle": [
|
||||
|
||||
@@ -137,12 +137,12 @@ function unwrapPayload(json) {
|
||||
}
|
||||
|
||||
function hasCollection(payload, keys) {
|
||||
return keys.some((key) => Array.isArray(payload?.[key]));
|
||||
return Boolean(payload) && keys.some((key) => Array.isArray(payload[key]));
|
||||
}
|
||||
|
||||
function firstCollection(payload, keys) {
|
||||
for (const key of keys) {
|
||||
if (Array.isArray(payload?.[key])) return payload[key];
|
||||
if (payload && Array.isArray(payload[key])) return payload[key];
|
||||
}
|
||||
return [];
|
||||
}
|
||||
@@ -152,10 +152,11 @@ function hasListItemShape(payload, keys) {
|
||||
if (collection.length === 0) return true;
|
||||
const item = collection[0];
|
||||
const hasId = Boolean(
|
||||
item?.profileId || item?.profile_id || item?.workId || item?.work_id || item?.publicWorkCode,
|
||||
item &&
|
||||
(item.profileId || item.profile_id || item.workId || item.work_id || item.publicWorkCode),
|
||||
);
|
||||
const hasTitle = Boolean(
|
||||
item?.title || item?.workTitle || item?.work_title || item?.levelName || item?.worldName,
|
||||
item && (item.title || item.workTitle || item.work_title || item.levelName || item.worldName),
|
||||
);
|
||||
return hasId && hasTitle;
|
||||
}
|
||||
@@ -213,7 +214,8 @@ function performDetailRequest() {
|
||||
const payload = unwrapPayload(json);
|
||||
const ok = check(response, {
|
||||
[`${endpoint.name} status is 200`]: (res) => res.status === 200,
|
||||
[`${endpoint.name} has detail payload`]: () => endpoint.expectKeys.some((key) => payload?.[key]),
|
||||
[`${endpoint.name} has detail payload`]: () =>
|
||||
Boolean(payload) && endpoint.expectKeys.some((key) => payload[key]),
|
||||
});
|
||||
worksDetailShapeErrorRate.add(!ok, { endpoint: endpoint.name });
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user