From 3eb292b40394170f30cfa7233cc4ab86135f5ef4 Mon Sep 17 00:00:00 2001 From: kdletters <61648117+kdletters@users.noreply.github.com> Date: Mon, 18 May 2026 16:58:48 +0800 Subject: [PATCH 1/6] feat(deploy): prepare offline provision tools and container loadtest --- .hermes/shared-memory/decision-log.md | 14 +- deploy/container/README.md | 30 +++- deploy/container/api-server.env.example | 7 +- deploy/container/docker-compose.loadtest.yml | 57 +++++++- deploy/container/nginx.conf | 6 +- deploy/otelcol/genarrative-debug.yaml | 23 +++ deploy/systemd/otelcol-contrib.service | 22 +++ ...发运维】本地开发验证与生产运维-2026-05-15.md | 6 +- .../Jenkinsfile.production-server-provision | 80 ++++++++++- scripts/container-compose.mjs | 2 +- scripts/jenkins-server-provision.sh | 125 +++++++++++------ scripts/prepare-server-provision-tools.sh | 132 ++++++++++++++++++ 12 files changed, 443 insertions(+), 61 deletions(-) create mode 100644 deploy/otelcol/genarrative-debug.yaml create mode 100644 deploy/systemd/otelcol-contrib.service create mode 100644 scripts/prepare-server-provision-tools.sh diff --git a/.hermes/shared-memory/decision-log.md b/.hermes/shared-memory/decision-log.md index f34b87e6..060e73c7 100644 --- a/.hermes/shared-memory/decision-log.md +++ b/.hermes/shared-memory/decision-log.md @@ -19,12 +19,22 @@ ## 2026-05-17 容器化方案只作为隔离压测与预发模拟路径 - 背景:Windows 本机直连极高 VU 压测会放大本地连接与发送缓冲行为,和线上 Linux + Nginx + systemd 拓扑不一致;需要一个更接近生产网络层的模拟方案,但不能扰动当前生产发布链路。 -- 决策:新增 `deploy/container/` 容器化方案,使用 Docker Compose 组合 Linux release `api-server`、容器 Nginx、`otelcol-contrib` debug exporter 和可选 k6。该方案只用于本机或预发压测模拟,不替换当前生产 `systemd + Nginx + Jenkins` 路径。 +- 决策:新增 `deploy/container/` 容器化方案,使用 Docker Compose 组合 Linux release `api-server`、容器 SpacetimeDB、容器 Nginx、`otelcol-contrib` debug exporter 和可选 k6。该方案只用于本机或预发压测模拟,不替换当前生产 `systemd + Nginx + Jenkins` 路径。 +- 服务器模拟参数:2026-05-18 通过 `ssh genarrative-release` 采样,目标机器为 2 vCPU / 约 2 GiB RAM / Ubuntu 24.04 / Nginx `worker_connections=768`;容器方案按待发布运行口径使用 
`nofile=4096`,并在 compose 中限制 `spacetimedb cpus=1.0 mem_limit=768m`、`api-server cpus=2.0 mem_limit=1g`、`nginx cpus=0.25 mem_limit=128m`、`otelcol cpus=0.25 mem_limit=128m`、`k6 cpus=0.5 mem_limit=512m`;Collector 镜像默认使用 `otel/opentelemetry-collector-contrib:0.151.0`。 - 隔离边界:容器方案使用独立 `deploy/container/api-server.env`、独立 Nginx 配置、独立 compose 命令和默认 `18080` 端口;真实 token 不进入镜像、不提交 Git;生产 systemd 单元、Jenkins 发布脚本和 `deploy/nginx/` 模板仍是正式线上来源。 +- 生产 Collector:server-provision 可安装 `otelcol-contrib.service` 和本机 debug exporter 配置,但二进制由 Jenkins 构建机先准备 `provision-tools/otelcol-contrib` 再上传到 release 部署 agent,目标机不从 GitHub 下载;api-server 是否发送 OTLP 仍由 `GENARRATIVE_OTEL_ENABLED` 控制。 - 影响范围:`deploy/container/`、`scripts/container-compose.mjs`、`package.json` 容器命令、开发运维文档和容器 build context 排除规则。 - 验证方式:执行 `npm run container:config` 展开 compose 配置;需要真实运行时再执行 `npm run container:build`、`npm run container:up`、`npm run container:k6`,并结合容器 Nginx log 与 OTLP debug exporter 判断瓶颈。 - 关联文档:`deploy/container/README.md`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。 +## 2026-05-18 生产 provision 改为构建机准备工具包再上传安装 + +- 背景:目标 release 服务器无法访问 GitHub,之前的 server provision 默认仍假设 `spacetime` 和 `otelcol-contrib` 已经存在于目标机本地路径,和真实运维条件不符。 +- 决策:Jenkins 新增 `Prepare Provision Tools` 阶段,在 `linux && genarrative-build` 构建机执行 `scripts/prepare-server-provision-tools.sh`,通过官方 SpacetimeDB 安装入口和 OpenTelemetry release 包生成 `provision-tools/`,再用 `stash/unstash` 带到 release 部署 agent;`scripts/jenkins-server-provision.sh` 只从工作区工具包复制安装,不再要求目标机自己下载或预装二进制。 +- 影响范围:`jenkins/Jenkinsfile.production-server-provision`、`scripts/prepare-server-provision-tools.sh`、`scripts/jenkins-server-provision.sh`、生产运维文档。 +- 验证方式:Jenkins 构建机可完成工具包准备,release 部署 agent 只消费工作区文件;目标机不再依赖 GitHub 外网下载。 +- 关联文档:`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。 + ## 2026-05-16 公开作品列表短期由 BFF 订阅读模型缓存 - 背景:作品列表压测和实时性讨论中,曾考虑让浏览器前端直接订阅公开作品列表,减少 HTTP 拉取和 BFF 压力。 @@ -35,8 +45,6 @@ - 验证方式:新增公开作品列表订阅能力时,检查前端只消费专用 public read model 或 BFF HTTP DTO;检查源表 row shape、权限判断和跨玩法聚合没有下沉到前端页面。 - 
关联文档:`docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。 -## 2026-05-16 api-server OpenTelemetry 统一补齐 traces metrics logs - - 背景:压测与运行观测需要把 HTTP、SpacetimeDB 调用和应用日志串起来,同时保留本地 `journalctl` / 文件日志做故障排障。 - 决策:`api-server` 通过 OTLP HTTP base endpoint 发送 traces、metrics 和 logs;Collector 统一用 `otelcol-contrib`,`npm run otel:debug` 负责 debug 采集,`npm run otel:rider` 负责转发到 Rider;Rider 只是接收与可视化端,不直接替代 Collector。 - 日志口径:Rider Logs 面板只展示 log event 自身字段,请求完成日志需要直接携带 `request_id`、HTTP method、规范化 route、scheme、path、status、status_class、latency 和 slow_request;更完整的 request attributes 仍以 trace/span 为准。 diff --git a/deploy/container/README.md b/deploy/container/README.md index c9eb84c5..dfb7fde4 100644 --- a/deploy/container/README.md +++ b/deploy/container/README.md @@ -6,20 +6,27 @@ ```text Docker Compose +├─ spacetimedb :3101,独立数据卷,供 api-server 连接 ├─ nginx :80 -> api-server:8082,负责静态站点、/admin/、/api/ 反代、upstream timing log、连接限制 -├─ api-server :8082,Linux release 构建,连接外部 SpacetimeDB +├─ api-server :8082,Linux release 构建,连接 compose 内 SpacetimeDB ├─ otelcol :4317/4318,debug exporter,接收 traces / metrics / logs └─ k6 profile=loadtest 时临时启动,在 compose 网络内压 nginx ``` +当前容器模拟参数按 `genarrative-release` 服务器采样值收口为 2 vCPU / 2 GiB RAM / 4096 soft nofile / 768 worker_connections,并已在 compose 里落实到 `spacetimedb cpus=1.0 mem_limit=768m`、`api-server cpus=2.0 mem_limit=1g`、`nginx cpus=0.25 mem_limit=128m`、`otelcol cpus=0.25 mem_limit=128m`、`k6 cpus=0.5 mem_limit=512m`。 +Collector 镜像使用 `otel/opentelemetry-collector-contrib:0.151.0`。 +生产服务器若启用 Collector,则由 `deploy/systemd/otelcol-contrib.service` 和 `deploy/otelcol/genarrative-debug.yaml` 托管,不走容器镜像。 + 默认 host 端口: +- `http://127.0.0.1:13101`:容器 SpacetimeDB。 - `http://127.0.0.1:18080`:容器 Nginx。 - `127.0.0.1:4317` / `127.0.0.1:4318`:容器 Collector OTLP gRPC / HTTP。 如端口冲突,可设置: ```powershell +$env:GENARRATIVE_CONTAINER_SPACETIME_PORT="13102" $env:GENARRATIVE_CONTAINER_HTTP_PORT="18081" 
$env:GENARRATIVE_CONTAINER_OTLP_HTTP_PORT="14318" $env:GENARRATIVE_CONTAINER_OTLP_GRPC_PORT="14317" @@ -33,21 +40,25 @@ npm run container:init 该命令会从 `deploy/container/api-server.env.example` 生成本地 `deploy/container/api-server.env`。真实 token、库名和外部服务密钥只写本地 env 文件,不提交 Git。 -Docker Desktop 下默认通过 `host.docker.internal:3101` 连接宿主机上 `npm run dev` 启动的 SpacetimeDB: +Docker Desktop 下默认通过 `http://spacetimedb:3101` 连接 compose 内 SpacetimeDB;宿主机只负责用 CLI 发布模块: ```env -GENARRATIVE_SPACETIME_SERVER_URL=http://host.docker.internal:3101 +GENARRATIVE_SPACETIME_SERVER_URL=http://spacetimedb:3101 GENARRATIVE_SPACETIME_DATABASE=genarrative-loadtest GENARRATIVE_SPACETIME_TOKEN= ``` -Linux Docker Engine 如果不能解析 `host.docker.internal`,Compose 已配置 `host-gateway`;仍不通时把 `GENARRATIVE_SPACETIME_SERVER_URL` 改成宿主机网关 IP 或同网络内的 SpacetimeDB 地址。 +宿主机发布模块时,先用 CLI 向 `http://127.0.0.1:13101` 发布到 `genarrative-loadtest`,再启动 `npm run container:up`。 + +Linux Docker Engine 若要从宿主机 CLI 连到容器内服务,直接用 `http://127.0.0.1:13101`;容器内部服务之间统一走 `http://spacetimedb:3101`。 ## 启动与验证 ```bash npm run container:config npm run container:build +npm run container:up -- spacetimedb +spacetime publish genarrative-loadtest --server http://127.0.0.1:13101 --module-path server-rs/crates/spacetime-module --yes --build-options="--debug" npm run container:up npm run container:ps curl -sS http://127.0.0.1:18080/api/runtime/puzzle/gallery @@ -103,6 +114,17 @@ $env:DETAIL_RATIO="0" npm run container:k6 ``` +容器内 `api-server` 资源上限与 Nginx 连接模型已经按 `genarrative-release` 的 2C / 2G / `nofile=4096` / `worker_connections=768` 收口;如果你要改成别的机器,就先重新采样再改这里。 + +SpacetimeDB 容器默认只提供运行时,不自动发布模块。首次启动或清理 `spacetime-data` 卷后,先只启动 `spacetimedb` 服务,再发布模块: + +```bash +npm run container:up -- spacetimedb +spacetime publish genarrative-loadtest --server http://127.0.0.1:13101 --module-path server-rs/crates/spacetime-module --yes --build-options="--debug" +``` + +发布完成后再执行 `npm run container:up` 和 `npm run container:k6`。如果 `deploy/container/api-server.env` 里的 
`GENARRATIVE_SPACETIME_DATABASE` 改成了别的库名,发布命令里的库名也要同步修改。 + 如果要压 1000 HTTP req/s,把 `PEAK_RPS` 调到 `500`;如果要压 5000 HTTP req/s,把 `PEAK_RPS` 调到 `2500`,并同时提高 `PREALLOCATED_VUS` / `MAX_VUS`,观察是否先被带宽、Nginx `limit_conn` 或 api-server 背压限制。 ## OTLP diff --git a/deploy/container/api-server.env.example b/deploy/container/api-server.env.example index ad4ff549..c66e66ef 100644 --- a/deploy/container/api-server.env.example +++ b/deploy/container/api-server.env.example @@ -7,7 +7,7 @@ GENARRATIVE_API_HOST=0.0.0.0 GENARRATIVE_API_PORT=8082 GENARRATIVE_API_LOG=info,tower_http=info GENARRATIVE_API_LISTEN_BACKLOG=1024 -GENARRATIVE_API_WORKER_THREADS=4 +GENARRATIVE_API_WORKER_THREADS=2 GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512 GENARRATIVE_OTEL_ENABLED=false @@ -21,9 +21,8 @@ GENARRATIVE_JWT_SECRET=CHANGE_ME_FOR_CONTAINER AUTH_REFRESH_COOKIE_SECURE=false GENARRATIVE_AUTH_STORE_PATH=/var/lib/genarrative/auth/auth-store.json -# Docker Desktop 下连接宿主机 npm run dev 启动的 SpacetimeDB。 -# Linux Docker Engine 可改成宿主机网关 IP,或在 compose 里接入同一网络内的 SpacetimeDB。 -GENARRATIVE_SPACETIME_SERVER_URL=http://host.docker.internal:3101 +# 默认连接 compose 内部 SpacetimeDB;宿主机发布模块使用 127.0.0.1:13101。 +GENARRATIVE_SPACETIME_SERVER_URL=http://spacetimedb:3101 GENARRATIVE_SPACETIME_DATABASE=genarrative-loadtest GENARRATIVE_SPACETIME_TOKEN= GENARRATIVE_SPACETIME_POOL_SIZE=8 diff --git a/deploy/container/docker-compose.loadtest.yml b/deploy/container/docker-compose.loadtest.yml index 2450e6ec..a6d3d5e4 100644 --- a/deploy/container/docker-compose.loadtest.yml +++ b/deploy/container/docker-compose.loadtest.yml @@ -1,11 +1,47 @@ name: genarrative-container-loadtest services: + spacetimedb: + image: clockworklabs/spacetime:v2.2.0 + command: + [ + "start", + "--listen-addr", + "0.0.0.0:3101", + "--data-dir", + "/var/lib/spacetimedb", + "--page_pool_max_size", + "536870912", + "--non-interactive", + ] + cpus: "1.0" + mem_limit: 768m + ports: + - "${GENARRATIVE_CONTAINER_SPACETIME_PORT:-13101}:3101" + volumes: + - 
spacetime-data:/var/lib/spacetimedb + ulimits: + nofile: + soft: 4096 + hard: 4096 + healthcheck: + test: + [ + "CMD-SHELL", + "spacetime server ping http://127.0.0.1:3101 >/dev/null 2>&1", + ] + interval: 10s + timeout: 5s + retries: 12 + start_period: 20s + api-server: build: context: ../.. dockerfile: deploy/container/api-server.Dockerfile target: api-runtime + cpus: "2.0" + mem_limit: 1g env_file: - ./api-server.env environment: @@ -16,7 +52,13 @@ services: - "host.docker.internal:host-gateway" volumes: - api-auth-store:/var/lib/genarrative/auth + ulimits: + nofile: + soft: 4096 + hard: 4096 depends_on: + spacetimedb: + condition: service_healthy otelcol: condition: service_started healthcheck: @@ -31,15 +73,23 @@ services: context: ../.. dockerfile: deploy/container/api-server.Dockerfile target: nginx-runtime + cpus: "0.25" + mem_limit: 128m depends_on: api-server: condition: service_healthy + spacetimedb: + condition: service_healthy ports: - "${GENARRATIVE_CONTAINER_HTTP_PORT:-18080}:80" extra_hosts: - "host.docker.internal:host-gateway" volumes: - nginx-logs:/var/log/nginx + ulimits: + nofile: + soft: 4096 + hard: 4096 healthcheck: test: ["CMD", "wget", "-qO-", "http://127.0.0.1/api/runtime/puzzle/gallery"] interval: 10s @@ -48,8 +98,10 @@ services: start_period: 20s otelcol: - image: otel/opentelemetry-collector-contrib:0.125.0 + image: otel/opentelemetry-collector-contrib:0.151.0 command: ["--config=/etc/otelcol/config.yaml"] + cpus: "0.25" + mem_limit: 128m volumes: - ./otelcol.yaml:/etc/otelcol/config.yaml:ro ports: @@ -59,6 +111,8 @@ services: k6: image: grafana/k6:0.52.0 profiles: ["loadtest"] + cpus: "0.5" + mem_limit: 512m depends_on: nginx: condition: service_healthy @@ -81,5 +135,6 @@ services: command: ["run", "k6-works-list.js"] volumes: + spacetime-data: api-auth-store: nginx-logs: diff --git a/deploy/container/nginx.conf b/deploy/container/nginx.conf index ae274c96..6e6d1094 100644 --- a/deploy/container/nginx.conf +++ 
b/deploy/container/nginx.conf @@ -1,7 +1,7 @@ worker_processes auto; events { - worker_connections 4096; + worker_connections 768; } http { @@ -106,7 +106,7 @@ http { } location ~ ^/v1/database/[^/]+/subscribe$ { - proxy_pass http://host.docker.internal:3101; + proxy_pass http://spacetimedb:3101; proxy_http_version 1.1; proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "Upgrade"; @@ -115,7 +115,7 @@ http { } location ^~ /v1/identity { - proxy_pass http://host.docker.internal:3101; + proxy_pass http://spacetimedb:3101; proxy_http_version 1.1; proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "Upgrade"; diff --git a/deploy/otelcol/genarrative-debug.yaml b/deploy/otelcol/genarrative-debug.yaml new file mode 100644 index 00000000..216a591b --- /dev/null +++ b/deploy/otelcol/genarrative-debug.yaml @@ -0,0 +1,23 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 127.0.0.1:4317 + http: + endpoint: 127.0.0.1:4318 + +exporters: + debug: + verbosity: normal + +service: + pipelines: + traces: + receivers: [otlp] + exporters: [debug] + metrics: + receivers: [otlp] + exporters: [debug] + logs: + receivers: [otlp] + exporters: [debug] diff --git a/deploy/systemd/otelcol-contrib.service b/deploy/systemd/otelcol-contrib.service new file mode 100644 index 00000000..ad891f02 --- /dev/null +++ b/deploy/systemd/otelcol-contrib.service @@ -0,0 +1,22 @@ +[Unit] +Description=Genarrative OpenTelemetry Collector Contrib +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=otelcol +Group=otelcol +WorkingDirectory=/etc/otelcol +ExecStart=/usr/local/bin/otelcol-contrib --config=/etc/otelcol/genarrative-debug.yaml +Restart=always +RestartSec=5 +LimitNOFILE=65535 + +NoNewPrivileges=true +PrivateTmp=true +ProtectSystem=full +ReadWritePaths=/etc/otelcol /var/log/genarrative + +[Install] +WantedBy=multi-user.target diff --git a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md index 
f033543e..2ab0e347 100644 --- a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md +++ b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md @@ -156,12 +156,14 @@ Jenkins 按 web / api / Spacetime module / build / deploy / publish 拆分 - `api-server` 生产模板默认 `GENARRATIVE_API_LISTEN_BACKLOG=1024`、`GENARRATIVE_API_WORKER_THREADS=4`;本地未设置 worker threads 时继续使用 Tokio 默认值。 - `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512` 开启应用内 HTTP 并发背压,超过并发许可时直接返回 `429 Too Many Requests` 和 `Retry-After: 1`,`/healthz` 不受该限制。该值不是 RPS 限速;如果压测中 429 上升但内存和 p95 收敛,说明背压正在保护进程,需要结合真实容量调阈值或在 Nginx 前置限流。直连 `api-server` 的极高 RPS 压测若出现 `connection refused`,通常已经打到 TCP 监听 / accept 层,应同时检查 backlog、Nginx upstream keepalive 和前置限流。 - `genarrative-api.service` 设置 `LimitNOFILE=65535`、`TasksMax=2048`;上线后用 `systemctl show genarrative-api.service -p LimitNOFILE -p TasksMax` 和 `cat /proc/$(pidof api-server)/limits` 核对。 +- Server provision 不在目标机下载 SpacetimeDB 或 `otelcol-contrib`。Jenkins 的 `Prepare Provision Tools` 阶段在 `linux && genarrative-build` 构建机执行 `scripts/prepare-server-provision-tools.sh`,通过官方 SpacetimeDB 安装入口 `https://install.spacetimedb.com` 和 OpenTelemetry release 包生成 `provision-tools/`,再通过 `stash/unstash` 上传到 release 部署 agent。目标机上的 `scripts/jenkins-server-provision.sh` 只从该工作区工具包安装 `/stdb/spacetime`、`/stdb/bin/current/*` 和 `/usr/local/bin/otelcol-contrib`。 +- `otelcol-contrib.service` 作为可选系统服务加入 provision,默认监听 `127.0.0.1:4317/4318` 并使用 `deploy/otelcol/genarrative-debug.yaml`。api-server 是否发送 OTLP 仍由 `GENARRATIVE_OTEL_ENABLED` 控制,服务 unit 见 `deploy/systemd/otelcol-contrib.service`。 - Nginx `/api/` 与 `/admin/api/` 通过 `genarrative_api` upstream 代理到 `127.0.0.1:8082`,upstream keepalive 为 64;压测时看 `/var/log/nginx/genarrative.access.log` 中的 `request_time`、`upstream_connect_time`、`upstream_header_time`、`upstream_response_time`、`upstream_status`、`request_id`。 - 作品列表 K6 脚本一次 iteration 默认请求两个公开接口,因此约 50 HTTP req/s 的目标命令使用 `SCENARIO=spike START_RPS=5 PEAK_RPS=25 HOLD=60s END_RPS=5 DETAIL_RATIO=0 npm run loadtest:k6:works`。 - 作品列表短期继续由 `api-server` / 
BFF 订阅 SpacetimeDB 公开 read model 后读本地 cache,不让浏览器前端直接订阅完整列表;未来如新增 `public_work_gallery_entry` 等专用公开作品列表 read model,前端只可订阅稳定、低基数、公开的专用投影,禁止订阅 `puzzle_work_profile`、`custom_world_profile` 等玩法源表后自行 join、聚合或判断权限。前端直订阅落地前必须先补齐权限、字段契约、排序 / 分页、埋点和 BFF 回退策略。 - 50 HTTP req/s 验收目标为 `http_req_failed < 1%`、`p95 < 2s`、`dropped_iterations = 0`,同时压测窗口内 Nginx 无新增 502。 -容器化压测与隔离部署方案单独放在 `deploy/container/`,用于本机或预发模拟 Linux release + Nginx + OTLP Collector 拓扑,不替换当前生产 `systemd + Nginx + Jenkins` 发布路径: +容器化压测与隔离部署方案单独放在 `deploy/container/`,用于本机或预发模拟 Linux release + Nginx + OTLP Collector 拓扑,不替换当前生产 `systemd + Nginx + Jenkins` 发布路径。当前容器模拟参数按 `genarrative-release` 采样值收口为 2 vCPU / 2 GiB RAM / `nofile=4096` / `worker_connections=768`,并在 compose 里落实到 `spacetimedb cpus=1.0 mem_limit=768m`、`api-server cpus=2.0 mem_limit=1g`、`nginx cpus=0.25 mem_limit=128m`、`otelcol cpus=0.25 mem_limit=128m`、`k6 cpus=0.5 mem_limit=512m`: ```bash npm run container:init @@ -172,7 +174,7 @@ npm run container:k6 npm run container:down ``` -容器方案默认暴露 `http://127.0.0.1:18080`,`api-server` 在容器内监听 `0.0.0.0:8082`,Nginx 通过 `api-server:8082` upstream 反代 `/api/` 和 `/admin/api/`。SpacetimeDB 默认仍连接宿主机 `http://host.docker.internal:3101`,真实库名、token 和外部服务密钥只写本地 `deploy/container/api-server.env`,不提交 Git。完整拓扑、端口、k6 参数和 OTLP debug exporter 使用方法见 `deploy/container/README.md`。 +容器方案默认暴露 `http://127.0.0.1:18080`,`api-server` 在容器内监听 `0.0.0.0:8082`,Nginx 通过 `api-server:8082` upstream 反代 `/api/` 和 `/admin/api/`。SpacetimeDB 也纳入 compose,容器内由 `spacetimedb:3101` 提供服务,宿主机通过 `http://127.0.0.1:13101` 进行模块发布;Collector 镜像使用 `otel/opentelemetry-collector-contrib:0.151.0`。生产 provision 侧则通过 Jenkins 构建机准备的 `provision-tools/otelcol-contrib` 安装本机 `otelcol-contrib.service`,真实库名、token 和外部服务密钥只写本地 `deploy/container/api-server.env`,不提交 Git。完整拓扑、端口、k6 参数和 OTLP debug exporter 使用方法见 `deploy/container/README.md`。 `npm run container:config` 默认只做 quiet 校验,避免把本地 env 中的 token 展开到终端;确需排查完整 compose 时再传 `-- --print`。 OpenTelemetry 现阶段可选 OTLP traces / metrics / 
logs,但本地日志与 Nginx 文件日志仍保留: diff --git a/jenkins/Jenkinsfile.production-server-provision b/jenkins/Jenkinsfile.production-server-provision index ee5d13d5..0b8a5e2d 100644 --- a/jenkins/Jenkinsfile.production-server-provision +++ b/jenkins/Jenkinsfile.production-server-provision @@ -22,7 +22,8 @@ pipeline { string(name: 'COMMIT_HASH', defaultValue: '', description: '部署脚本来源 commit') string(name: 'SERVER_NAME', defaultValue: 'genarrative.example.com', description: '证书主域名;也作为 Nginx server_name 的第一个域名') string(name: 'SERVER_ALIASES', defaultValue: '', description: '可选,额外 Nginx server_name,多个用空格或逗号分隔,例如 www.genarrative.world') - string(name: 'SPACETIME_BIN_SOURCE', defaultValue: '/usr/local/bin/spacetime', description: '服务器上已有 spacetime CLI 路径') + string(name: 'PROVISION_TOOLS_DIR', defaultValue: 'provision-tools', description: '构建机准备并上传到目标机工作区的工具包目录') + string(name: 'SPACETIME_DOWNLOAD_ROOT', defaultValue: 'https://github.com/clockworklabs/SpacetimeDB/releases/latest/download', description: '构建机下载 SpacetimeDB 官方安装产物的根地址;目标机不访问该地址') string(name: 'SPACETIME_ROOT', defaultValue: '/stdb', description: 'SpacetimeDB root-dir') string(name: 'RELEASE_ROOT', defaultValue: '/opt/genarrative/releases', description: 'release 根目录') string(name: 'CURRENT_LINK', defaultValue: '/opt/genarrative/current', description: '当前版本软链接') @@ -31,6 +32,8 @@ pipeline { string(name: 'API_PORT', defaultValue: '8082', description: 'api-server 本机监听端口') choice(name: 'NGINX_CONFIG_MODE', choices: ['none', 'production-https', 'development-http'], description: 'Nginx 配置模式;开发服无域名时选 development-http,release 正式入口选 production-https') booleanParam(name: 'ENABLE_SERVICES', defaultValue: true, description: '启用并启动 spacetimedb 与 api-server systemd 服务') + booleanParam(name: 'ENABLE_OTELCOL', defaultValue: true, description: '安装并启用本机 OpenTelemetry Collector;api-server 是否发送 OTLP 仍由环境变量控制') + string(name: 'OTELCOL_VERSION', defaultValue: '0.151.0', description: 'otelcol-contrib 版本') } stages { @@ -60,8 +63,17 @@ 
pipeline { } } } - if (!params.SPACETIME_BIN_SOURCE?.trim()) { - error('SPACETIME_BIN_SOURCE 不能为空。') + if (!params.PROVISION_TOOLS_DIR?.trim()) { + error('PROVISION_TOOLS_DIR 不能为空。') + } + if (!(params.PROVISION_TOOLS_DIR.trim() ==~ /^[0-9A-Za-z._/-]+$/) || params.PROVISION_TOOLS_DIR.startsWith('/') || params.PROVISION_TOOLS_DIR.contains('..')) { + error("PROVISION_TOOLS_DIR 只能是工作区内的相对目录,不能包含绝对路径或连续点号: ${params.PROVISION_TOOLS_DIR}") + } + if (!(params.OTELCOL_VERSION?.trim() ==~ /^[0-9]+\.[0-9]+\.[0-9]+$/)) { + error("OTELCOL_VERSION 格式应为 x.y.z: ${params.OTELCOL_VERSION}") + } + if (!params.SPACETIME_DOWNLOAD_ROOT?.trim()) { + error('SPACETIME_DOWNLOAD_ROOT 不能为空。') } def nginxMode = params.NGINX_CONFIG_MODE?.trim() if (!(nginxMode in ['none', 'production-https', 'development-http'])) { @@ -77,6 +89,58 @@ pipeline { } } + stage('Prepare Provision Tools') { + agent { + label 'linux && genarrative-build' + } + steps { + script { + def checkoutFromRemote = { String remoteUrl -> + checkout([ + $class: 'GitSCM', + branches: [[name: "*/${params.SOURCE_BRANCH}"]], + doGenerateSubmoduleConfigurations: false, + extensions: [ + [$class: 'CleanBeforeCheckout'], + [$class: 'CloneOption', shallow: true, depth: 1, noTags: true, timeout: 30, honorRefspec: true], + ], + userRemoteConfigs: [[url: remoteUrl, refspec: "+refs/heads/${params.SOURCE_BRANCH}:refs/remotes/origin/${params.SOURCE_BRANCH}"]], + ]) + } + try { + checkoutFromRemote(env.GIT_REMOTE_URL) + env.EFFECTIVE_GIT_REMOTE_URL = env.GIT_REMOTE_URL + } catch (error) { + echo "Git 主地址拉取失败: ${env.GIT_REMOTE_URL},改用备用地址: ${env.GIT_REMOTE_FALLBACK_URL}" + checkoutFromRemote(env.GIT_REMOTE_FALLBACK_URL) + env.EFFECTIVE_GIT_REMOTE_URL = env.GIT_REMOTE_FALLBACK_URL + } + } + sh ''' + bash <<'BASH' + set -euo pipefail + chmod +x scripts/jenkins-checkout-source.sh scripts/prepare-server-provision-tools.sh + SOURCE_BRANCH="${SOURCE_BRANCH:-master}" \ + COMMIT_HASH="${COMMIT_HASH:-}" \ + 
GIT_REMOTE_URL="${EFFECTIVE_GIT_REMOTE_URL:-${GIT_REMOTE_URL}}" \ + GIT_REMOTE_FALLBACK_URL="${GIT_REMOTE_FALLBACK_URL:-}" \ + SOURCE_COMMIT_FILE=".jenkins-source-commit" \ + scripts/jenkins-checkout-source.sh + + PROVISION_TOOLS_DIR="${PROVISION_TOOLS_DIR:-provision-tools}" \ + OTELCOL_VERSION="${OTELCOL_VERSION:-0.151.0}" \ + SPACETIME_DOWNLOAD_ROOT="${SPACETIME_DOWNLOAD_ROOT:-https://github.com/clockworklabs/SpacetimeDB/releases/latest/download}" \ + scripts/prepare-server-provision-tools.sh +BASH + ''' + script { + env.SOURCE_COMMIT = readFile('.jenkins-source-commit').trim() + echo "Provision 工具包已准备,源码 commit=${env.SOURCE_COMMIT}" + } + stash name: 'server-provision-tools', includes: "${params.PROVISION_TOOLS_DIR}/**", useDefaultExcludes: false + } + } + stage('Checkout Provision Files') { agent { label "${params.DEPLOY_TARGET == 'development' ? 'linux && genarrative-build' : 'linux && genarrative-release-deploy'}" @@ -109,7 +173,7 @@ pipeline { set -euo pipefail chmod +x scripts/jenkins-checkout-source.sh SOURCE_BRANCH="${SOURCE_BRANCH:-master}" \ - COMMIT_HASH="${COMMIT_HASH:-}" \ + COMMIT_HASH="${COMMIT_HASH:-${SOURCE_COMMIT:-}}" \ GIT_REMOTE_URL="${EFFECTIVE_GIT_REMOTE_URL:-${GIT_REMOTE_URL}}" \ GIT_REMOTE_FALLBACK_URL="${GIT_REMOTE_FALLBACK_URL:-}" \ SOURCE_COMMIT_FILE=".jenkins-source-commit" \ @@ -124,10 +188,18 @@ BASH label "${params.DEPLOY_TARGET == 'development' ? 
'linux && genarrative-build' : 'linux && genarrative-release-deploy'}" } steps { + unstash 'server-provision-tools' sh ''' bash <<'BASH' set -euo pipefail + chmod +x "${PROVISION_TOOLS_DIR:-provision-tools}/otelcol-contrib" \ + "${PROVISION_TOOLS_DIR:-provision-tools}/spacetime/spacetime" \ + "${PROVISION_TOOLS_DIR:-provision-tools}/spacetime/bin/current/spacetimedb-cli" \ + "${PROVISION_TOOLS_DIR:-provision-tools}/spacetime/bin/current/spacetimedb-standalone" chmod +x scripts/jenkins-server-provision.sh + PROVISION_TOOLS_DIR="${PROVISION_TOOLS_DIR:-provision-tools}" \ + SPACETIME_BIN_SOURCE="${PROVISION_TOOLS_DIR:-provision-tools}/spacetime/spacetime" \ + OTELCOL_BIN_SOURCE="${PROVISION_TOOLS_DIR:-provision-tools}/otelcol-contrib" \ scripts/jenkins-server-provision.sh BASH ''' diff --git a/scripts/container-compose.mjs b/scripts/container-compose.mjs index 0ee92af5..35a4bed9 100644 --- a/scripts/container-compose.mjs +++ b/scripts/container-compose.mjs @@ -89,7 +89,7 @@ function printHelp(isError) { Commands: container:init 生成 deploy/container/api-server.env container:build 构建 api-server 容器镜像 - container:up 后台启动 api-server + nginx + otelcol + container:up 后台启动 spacetimedb + api-server + nginx + otelcol container:down 停止并清理容器 container:logs 查看容器日志 container:ps 查看容器状态 diff --git a/scripts/jenkins-server-provision.sh b/scripts/jenkins-server-provision.sh index 203518d4..bbabf2bd 100755 --- a/scripts/jenkins-server-provision.sh +++ b/scripts/jenkins-server-provision.sh @@ -1,6 +1,24 @@ #!/usr/bin/env bash set -euo pipefail +PROVISION_TOOLS_DIR="${PROVISION_TOOLS_DIR:-provision-tools}" +SPACETIME_BIN_SOURCE="${SPACETIME_BIN_SOURCE:-${PROVISION_TOOLS_DIR}/spacetime/spacetime}" +OTELCOL_BIN_SOURCE="${OTELCOL_BIN_SOURCE:-${PROVISION_TOOLS_DIR}/otelcol-contrib}" + +require_non_root_relative_path() { + local label="$1" + local path="$2" + + if [[ -z "${path}" ]]; then + echo "[server-provision] ${label} 不能为空。" >&2 + exit 1 + fi + if [[ "${path}" == /* || "${path}" == *..* 
]]; then + echo "[server-provision] ${label} 只能是工作区内的相对路径: ${path}" >&2 + exit 1 + fi +} + require_path() { local path="$1" if [[ ! -e "${path}" ]]; then @@ -81,16 +99,16 @@ install_sccache() { fi echo "[server-provision] 未找到 sccache,准备通过 cargo install sccache 安装。" - if ! command -v cargo >/dev/null 2>&1; then - echo "[server-provision] 未找到 cargo,无法自动安装 sccache。请先安装 Rust 工具链后重跑 Server-Provision。" >&2 - exit 1 - fi - if [[ "${DRY_RUN}" == "true" ]]; then echo "+ cargo install sccache --locked" return fi + if ! command -v cargo >/dev/null 2>&1; then + echo "[server-provision] 未找到 cargo,无法自动安装 sccache。请先安装 Rust 工具链后重跑 Server-Provision。" >&2 + exit 1 + fi + cargo install sccache --locked if ! command -v sccache >/dev/null 2>&1 && [[ ! -x /root/.cargo/bin/sccache ]]; then echo "[server-provision] sccache 安装后仍不可用,请检查 cargo bin 目录是否在 PATH 中。" >&2 @@ -98,6 +116,42 @@ install_sccache() { fi } +sync_otelcol_install() { + local target_bin="/usr/local/bin/otelcol-contrib" + local source_bin="${OTELCOL_BIN_SOURCE}" + local version="${OTELCOL_VERSION:-0.151.0}" + local resolved_source="${source_bin}" + + if [[ "${ENABLE_OTELCOL:-true}" != "true" ]]; then + echo "[server-provision] ENABLE_OTELCOL=${ENABLE_OTELCOL:-},跳过 otelcol-contrib 配置。" + return + fi + + if command -v readlink >/dev/null 2>&1; then + resolved_source="$(readlink -f "${source_bin}" 2>/dev/null || echo "${source_bin}")" + fi + + if [[ ! -x "${resolved_source}" ]]; then + echo "[server-provision] otelcol-contrib 不存在或不可执行: ${source_bin}" >&2 + echo "[server-provision] 请先在构建机准备好 otelcol-contrib ${version},再通过 provision-tools 上传到目标机。" >&2 + exit 1 + fi + + if [[ "${DRY_RUN}" == "true" ]]; then + echo "+ install -m 0755 ${resolved_source} ${target_bin}" + return + fi + + install -m 0755 "${resolved_source}" "${target_bin}" + if ! "${target_bin}" --version >/dev/null 2>&1; then + echo "[server-provision] otelcol-contrib 安装后无法执行: ${target_bin}" >&2 + exit 1 + fi + if ! 
"${target_bin}" --version 2>/dev/null | grep -q "${version}"; then + echo "[server-provision] 警告: otelcol-contrib 版本不是期望的 ${version}: $("${target_bin}" --version 2>/dev/null || true)" >&2 + fi +} + sync_spacetime_install() { local root_dir="$1" local target_bin_dir="${root_dir}/bin/current" @@ -106,14 +160,6 @@ sync_spacetime_install() { local resolved_command="${SPACETIME_BIN_SOURCE}" local install_dir="" local root_bin="${root_dir}/bin" - local share_bin_dir="" - local version_dir="" - local parent_dir="" - - if [[ -x "${target_cli}" && -x "${target_standalone}" ]]; then - echo "[server-provision] SpacetimeDB current 目录已存在: ${target_bin_dir}" - return - fi echo "[server-provision] 同步 SpacetimeDB current 目录到 ${target_bin_dir}" if [[ "${DRY_RUN}" == "true" ]]; then @@ -128,26 +174,10 @@ sync_spacetime_install() { install_dir="$(cd -- "$(dirname -- "${resolved_command}")" && pwd)" mkdir -p "${root_bin}" - for share_bin_dir in \ - "/usr/.local/share/spacetime/bin" \ - "/root/.local/share/spacetime/bin" \ - "${HOME:-}/.local/share/spacetime/bin"; do - if [[ -d "${share_bin_dir}" ]]; then - version_dir="$(find "${share_bin_dir}" -mindepth 1 -maxdepth 1 -type d | sort -V | tail -n 1)" - if [[ -n "${version_dir}" && -x "${version_dir}/spacetimedb-cli" && -x "${version_dir}/spacetimedb-standalone" ]]; then - echo "[server-provision] 同步 SpacetimeDB 安装: ${version_dir} -> ${target_bin_dir}" - rm -rf "${target_bin_dir}" - mkdir -p "${target_bin_dir}" - cp -a "${version_dir}/." "${target_bin_dir}/" - chmod +x "${target_cli}" "${target_standalone}" - chown -R spacetimedb:spacetimedb "${root_bin}" - return - fi - fi - done - if [[ -d "${install_dir}/bin" ]]; then echo "[server-provision] 同步 SpacetimeDB 安装: ${install_dir}/bin -> ${root_bin}" + rm -rf "${root_bin}" + mkdir -p "${root_bin}" cp -a "${install_dir}/bin/." 
"${root_bin}/" elif [[ -x "${install_dir}/spacetimedb-cli" && -x "${install_dir}/spacetimedb-standalone" ]]; then echo "[server-provision] 同步 SpacetimeDB 安装: ${install_dir} -> ${target_bin_dir}" @@ -156,14 +186,8 @@ sync_spacetime_install() { cp -f "${install_dir}/spacetimedb-cli" "${target_cli}" cp -f "${install_dir}/spacetimedb-standalone" "${target_standalone}" chmod +x "${target_cli}" "${target_standalone}" - elif [[ -f "${resolved_command}" ]]; then - parent_dir="$(cd -- "${install_dir}/.." && pwd)" - if [[ -d "${parent_dir}/bin" && -x "${parent_dir}/bin/current/spacetimedb-cli" && -x "${parent_dir}/bin/current/spacetimedb-standalone" ]]; then - echo "[server-provision] 同步 SpacetimeDB 安装: ${parent_dir}/bin -> ${root_bin}" - cp -a "${parent_dir}/bin/." "${root_bin}/" - else - echo "[server-provision] 未能从 spacetime 命令路径推断完整 SpacetimeDB 安装目录: ${resolved_command}" >&2 - fi + else + echo "[server-provision] 未能从 SpacetimeDB 交付包推断完整安装目录: ${resolved_command}" >&2 fi if [[ ! -x "${target_cli}" || ! 
-x "${target_standalone}" ]]; then @@ -387,6 +411,10 @@ render_api_env_example() { deploy/env/api-server.env.example } +render_otelcol_service() { + cat deploy/systemd/otelcol-contrib.service +} + validate_nginx_tls() { local cert_dir="/etc/letsencrypt/live/${SERVER_NAME}" if [[ "${SERVER_NAME}" == "genarrative.example.com" ]]; then @@ -523,6 +551,8 @@ render_api_service() { require_path deploy/systemd/spacetimedb.service require_path deploy/systemd/genarrative-api.service +require_path deploy/systemd/otelcol-contrib.service +require_path deploy/otelcol/genarrative-debug.yaml require_path deploy/nginx/genarrative.conf require_path deploy/nginx/genarrative-dev-http.conf require_path deploy/nginx/snippets/genarrative-maintenance.conf @@ -532,6 +562,7 @@ require_path scripts/deploy/maintenance-off.sh require_path scripts/deploy/maintenance-status.sh validate_server_names +require_non_root_relative_path "PROVISION_TOOLS_DIR" "${PROVISION_TOOLS_DIR}" echo "[server-provision] target=${DEPLOY_TARGET}, dry_run=${DRY_RUN}, nginx_config_mode=${NGINX_CONFIG_MODE}, source_commit=$(cat .jenkins-source-commit)" @@ -585,6 +616,16 @@ else echo "[server-provision] 已存在环境文件,保留不覆盖: ${API_ENV_FILE}" fi +if [[ "${ENABLE_OTELCOL:-true}" == "true" ]]; then + sync_otelcol_install + otelcol_service="$(mktemp)" + render_otelcol_service >"${otelcol_service}" + install_file "${otelcol_service}" /etc/systemd/system/otelcol-contrib.service 0644 + rm -f "${otelcol_service}" +else + echo "[server-provision] ENABLE_OTELCOL=${ENABLE_OTELCOL:-},跳过 otelcol-contrib service 安装。" +fi + if [[ "${NGINX_CONFIG_MODE}" != "none" ]]; then install_nginx_config_with_rollback else @@ -593,7 +634,13 @@ fi run_cmd systemctl daemon-reload if [[ "${ENABLE_SERVICES}" == "true" ]]; then + if [[ "${ENABLE_OTELCOL:-true}" == "true" ]]; then + run_cmd systemctl enable otelcol-contrib.service + fi run_cmd systemctl enable spacetimedb.service genarrative-api.service + if [[ "${ENABLE_OTELCOL:-true}" == "true" ]]; then + 
run_cmd systemctl restart otelcol-contrib.service + fi run_cmd systemctl restart spacetimedb.service wait_for_spacetimedb_service ensure_spacetime_owner_client_token diff --git a/scripts/prepare-server-provision-tools.sh b/scripts/prepare-server-provision-tools.sh new file mode 100644 index 00000000..3d6a9830 --- /dev/null +++ b/scripts/prepare-server-provision-tools.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROVISION_TOOLS_DIR="${PROVISION_TOOLS_DIR:-provision-tools}" +OTELCOL_VERSION="${OTELCOL_VERSION:-0.151.0}" +OTELCOL_DOWNLOAD_ROOT="${OTELCOL_DOWNLOAD_ROOT:-https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download}" +SPACETIME_INSTALLER_URL="${SPACETIME_INSTALLER_URL:-https://install.spacetimedb.com}" +SPACETIME_DOWNLOAD_ROOT="${SPACETIME_DOWNLOAD_ROOT:-https://github.com/clockworklabs/SpacetimeDB/releases/latest/download}" +PROVISION_TOOLS_TMP_PARENT="${PROVISION_TOOLS_TMP_PARENT:-${WORKSPACE:-$(pwd)}/.tmp/server-provision-tools}" +TMP_DIR_TO_CLEAN="" + +cleanup_tmp_dir() { + if [[ -n "${TMP_DIR_TO_CLEAN}" ]]; then + rm -rf "${TMP_DIR_TO_CLEAN}" + fi +} + +require_cmd() { + local name="$1" + if ! 
command -v "${name}" >/dev/null 2>&1; then + echo "[prepare-provision-tools] 缺少命令: ${name}" >&2 + exit 1 + fi +} + +download_file() { + local url="$1" + local output="$2" + + if command -v curl >/dev/null 2>&1; then + curl -fsSL --retry 3 --retry-delay 2 "${url}" -o "${output}" + elif command -v wget >/dev/null 2>&1; then + wget -O "${output}" "${url}" + else + echo "[prepare-provision-tools] 需要 curl 或 wget 下载: ${url}" >&2 + exit 1 + fi +} + +make_spacetime_wrapper() { + local target="$1" + + cat >"${target}" <<'EOF' +#!/usr/bin/env sh +set -eu +SELF_DIR=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd) +exec "$SELF_DIR/bin/current/spacetimedb-cli" "$@" +EOF + chmod 0755 "${target}" +} + +prepare_otelcol() { + local tmp_dir="$1" + local archive="${tmp_dir}/otelcol-contrib.tar.gz" + local extract_dir="${tmp_dir}/otelcol-contrib" + local url="${OTELCOL_DOWNLOAD_ROOT}/v${OTELCOL_VERSION}/otelcol-contrib_${OTELCOL_VERSION}_linux_amd64.tar.gz" + local target="${PROVISION_TOOLS_DIR}/otelcol-contrib" + + require_cmd tar + + echo "[prepare-provision-tools] 下载 otelcol-contrib: ${url}" + mkdir -p "${extract_dir}" + download_file "${url}" "${archive}" + tar -xzf "${archive}" -C "${extract_dir}" + + if [[ ! -x "${extract_dir}/otelcol-contrib" ]]; then + echo "[prepare-provision-tools] otelcol-contrib 包中缺少可执行文件。" >&2 + exit 1 + fi + + install -m 0755 "${extract_dir}/otelcol-contrib" "${target}" + "${target}" --version >/dev/null +} + +prepare_spacetime() { + local tmp_dir="$1" + local install_root="${tmp_dir}/spacetime-root" + local target_dir="${PROVISION_TOOLS_DIR}/spacetime" + + echo "[prepare-provision-tools] 使用官方安装器准备 SpacetimeDB: ${SPACETIME_INSTALLER_URL}" + mkdir -p "${install_root}" + download_file "${SPACETIME_INSTALLER_URL}" "${tmp_dir}/spacetime-install.sh" + chmod 0755 "${tmp_dir}/spacetime-install.sh" + TMPDIR="${tmp_dir}" SPACETIME_DOWNLOAD_ROOT="${SPACETIME_DOWNLOAD_ROOT}" sh "${tmp_dir}/spacetime-install.sh" --root-dir "${install_root}" -y + + if [[ ! 
-x "${install_root}/bin/current/spacetimedb-cli" ]]; then + echo "[prepare-provision-tools] SpacetimeDB 安装结果缺少 bin/current/spacetimedb-cli。" >&2 + exit 1 + fi + if [[ ! -x "${install_root}/bin/current/spacetimedb-standalone" ]]; then + echo "[prepare-provision-tools] SpacetimeDB 安装结果缺少 bin/current/spacetimedb-standalone。" >&2 + exit 1 + fi + + mkdir -p "${target_dir}" + cp -a "${install_root}/bin" "${target_dir}/bin" + make_spacetime_wrapper "${target_dir}/spacetime" + + "${target_dir}/spacetime" --version >/dev/null +} + +main() { + local tmp_dir + + require_cmd chmod + require_cmd cp + require_cmd install + require_cmd mktemp + require_cmd rm + + mkdir -p "${PROVISION_TOOLS_TMP_PARENT}" + tmp_dir="$(mktemp -d "${PROVISION_TOOLS_TMP_PARENT%/}/run.XXXXXX")" + TMP_DIR_TO_CLEAN="${tmp_dir}" + trap cleanup_tmp_dir EXIT + + rm -rf "${PROVISION_TOOLS_DIR}" + mkdir -p "${PROVISION_TOOLS_DIR}" + + prepare_otelcol "${tmp_dir}" + prepare_spacetime "${tmp_dir}" + + cat >"${PROVISION_TOOLS_DIR}/MANIFEST.txt" < Date: Tue, 19 May 2026 01:00:33 +0800 Subject: [PATCH 2/6] perf(api-server): tune gallery load shedding --- .hermes/shared-memory/pitfalls.md | 24 ++ deploy/container/README.md | 21 +- deploy/container/api-server.Dockerfile | 3 +- deploy/container/api-server.env.example | 5 +- deploy/container/docker-compose.loadtest.yml | 9 +- deploy/container/nginx.conf | 87 ++++++ deploy/env/api-server.env.example | 3 + deploy/nginx/genarrative-dev-http.conf | 103 +++++++ deploy/nginx/genarrative.conf | 107 ++++++++ ...发运维】本地开发验证与生产运维-2026-05-15.md | 10 +- scripts/loadtest/README.md | 10 +- scripts/loadtest/data/works-list.sample.json | 12 +- scripts/loadtest/k6-works-list.js | 12 +- server-rs/crates/api-server/src/app.rs | 6 +- .../crates/api-server/src/backpressure.rs | 256 +++++++++++++++++- server-rs/crates/api-server/src/config.rs | 33 +++ .../crates/api-server/src/process_metrics.rs | 195 ++++++++++++- server-rs/crates/api-server/src/puzzle.rs | 59 +++- 
.../api-server/src/puzzle_gallery_cache.rs | 46 +++- server-rs/crates/api-server/src/state.rs | 122 ++++++++- server-rs/crates/api-server/src/telemetry.rs | 134 +++++++-- server-rs/crates/module-auth/src/lib.rs | 1 + 22 files changed, 1178 insertions(+), 80 deletions(-) diff --git a/.hermes/shared-memory/pitfalls.md b/.hermes/shared-memory/pitfalls.md index def8d06f..ccc90e04 100644 --- a/.hermes/shared-memory/pitfalls.md +++ b/.hermes/shared-memory/pitfalls.md @@ -107,6 +107,22 @@ - 验证:对照打 `/api/runtime/puzzle/gallery` 与 `/healthz`;对比 `PREALLOCATED_VUS=300 MAX_VUS=800` 和 `PREALLOCATED_VUS=20 MAX_VUS=40`;压测结束后继续采样 10 秒确认 private memory 回落。 - 关联:`scripts/loadtest/README.md`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`、`server-rs/crates/api-server/src/process_metrics.rs`、`server-rs/crates/api-server/src/telemetry.rs`。 +## 容器高 VU 下 `/healthz` RSS 尖峰先查 Axum state 深拷贝 + +- 现象:容器 Linux release `api-server` 打 `/healthz`,500 HTTP req/s、`PREALLOCATED_VUS=100` 只跑 1 秒也能把 RSS 推到约 1 GiB;同样问题与作品列表、SpacetimeDB procedure、业务 cache 和请求日志等级无关。 +- 原因:`AppState` 曾直接 `#[derive(Clone)]` 大结构体,里面包含配置、SpacetimeDB client、平台服务、认证服务和多组 cache。Axum/Hyper 会在 router/service/connection 路径频繁 clone state,高并发 keepalive 下会放大为状态深拷贝高水位。 +- 处理:`server-rs/crates/api-server/src/state.rs` 的 `AppState` 必须保持 `Arc` 浅拷贝壳;新增共享状态字段时放入 `AppStateInner`,不要把外层改回大结构体 clone。 +- 验证:用容器内 k6 直连 `api-server:8082/healthz`,500 HTTP req/s、`PREALLOCATED_VUS=100`、30 秒压测后采样 `/proc/$pid/status`、`/proc/$pid/smaps_rollup` 和 cgroup `memory.current/memory.peak`。2026-05-18 修复后结果为 `15001` 请求、`http_req_failed=0`、`dropped_iterations=0`,RSS 约 18 MiB -> 52 MiB,cgroup peak 约 47 MiB。 +- 关联:`server-rs/crates/api-server/src/state.rs`、`deploy/container/README.md`、`deploy/container/api-server.Dockerfile`。 + +## Gallery 压测延迟升高先查入口过量放行和 TTL 边界刷新 + +- 现象:公开作品列表在 500-1000 HTTP req/s 附近可能吞吐没有明显提升,但 p95 变高、VU 上升,甚至出现排队和 dropped iterations。 +- 原因:Nginx、Axum 和缓存刷新边界如果同时允许过多请求进入,压力会先堆在连接、service 和 cache rebuild 周围;这类延迟不等同于数据库连接池不足。 +- 处理:Nginx 按 endpoint 使用 
`limit_req` 快拒绝,api-server 按 `default/gallery/detail/admin` 分组 semaphore 快拒绝;拼图广场 TTL 过期时已有缓存先返回 stale 响应,只允许一个后台 refresh 任务重建,冷启动无缓存时才同步构建。 +- 验证:OTLP 看 `genarrative.http.server.request_permits.available{pool=...}`、`genarrative.puzzle_gallery.cache.stale_hits`、`refreshes_started`、`refreshes_failed`,Nginx access log 看 `request_time` 与 `upstream_response_time` 是否同步收敛;超过容量时应明确 429,而不是长时间排队或新增 502。 +- 关联:`deploy/nginx/genarrative.conf`、`deploy/container/nginx.conf`、`server-rs/crates/api-server/src/backpressure.rs`、`server-rs/crates/api-server/src/puzzle_gallery_cache.rs`。 + ## 多玩法公开广场列表优先订阅 public view / read model - 现象:抓大鹅、方洞挑战、视觉小说、大鱼吃小鱼等公开列表如果沿用 `list_*_works` procedure,即使只读已发布作品,也会在每个 HTTP 请求里回到 SpacetimeDB WASM 侧扫描、反序列化配置并组装列表,50RPS 以上容易变成热点。 @@ -824,6 +840,14 @@ - 验证:执行 `cargo test -p api-server jsapi_order_request_sets_wechat_required_http_headers --manifest-path server-rs/Cargo.toml`。 - 关联:`server-rs/crates/api-server/src/wechat_pay.rs`、`docs/technical/MY_TAB_ACCOUNT_RECHARGE_IMPLEMENTATION_2026-04-25.md`。 +## 容器公开列表压测不要靠继续抬并发吃满 CPU + +- 现象:2C / 2G 容器压测公开 gallery list 时,`api-server` CPU 仍有余量,看起来像可以继续提高 `GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS` 或 Nginx `limit_conn`。 +- 原因:当前瓶颈不是 Tokio worker 线程数。`/api/runtime/puzzle/gallery` 和 `/api/runtime/custom-world-gallery` 成功响应后会走全局 route tracking,继续向 SpacetimeDB 写 `record_tracking_event_and_return`;入口并发从 320 抬到 336 / 352 时,SpacetimeDB 内存先逼近 `896m` 容器上限,200 请求 p95 变差,429 比例没有改善。 +- 处理:2C / 2G 容器模拟里公开 gallery list 暂以 `limit_conn=320`、`GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=320` 作为稳定上限。若要继续提升吞吐,优先减少高频公开 GET 的 tracking 写入、做采样或改成批量/异步聚合;不要单纯放大入口并发。 +- 验证:宿主机 k6 打 `http://127.0.0.1:18080`,`PEAK_RPS=1000` 等价约 2000 HTTP req/s;320 档无 dropped iterations、无 5xx、无 OOM,200 请求 `request_time p95` 约 0.292s。336 / 352 档 p95 升到约 0.31s / 0.32s,SpacetimeDB 内存尾部可到约 `880MiB / 896MiB`。 +- 
关联:`deploy/container/nginx.conf`、`deploy/container/api-server.env.example`、`deploy/container/README.md`、`server-rs/crates/api-server/src/tracking.rs`。 + ## 后台表查询展示 SpacetimeDB 枚举时不要套用 Option 解码 - 现象:后台“表查询”查看 `profile_recharge_order` 时,`kind` 和 `status` 显示为空数组 `[]`,例如充值订单原始行里 `points_60` 的类型和状态都不可读。 diff --git a/deploy/container/README.md b/deploy/container/README.md index dfb7fde4..31ce88a3 100644 --- a/deploy/container/README.md +++ b/deploy/container/README.md @@ -13,7 +13,8 @@ Docker Compose └─ k6 profile=loadtest 时临时启动,在 compose 网络内压 nginx ``` -当前容器模拟参数按 `genarrative-release` 服务器采样值收口为 2 vCPU / 2 GiB RAM / 4096 soft nofile / 768 worker_connections,并已在 compose 里落实到 `spacetimedb cpus=1.0 mem_limit=768m`、`api-server cpus=2.0 mem_limit=1g`、`nginx cpus=0.25 mem_limit=128m`、`otelcol cpus=0.25 mem_limit=128m`、`k6 cpus=0.5 mem_limit=512m`。 +当前容器模拟参数按 `genarrative-release` 服务器采样值收口为 2 vCPU / 2 GiB RAM / 4096 soft nofile / 768 worker_connections,并已在 compose 里落实到 `spacetimedb cpus=1.0 mem_limit=896m`、`api-server cpus=2.0 mem_limit=1g`、`nginx cpus=0.5 mem_limit=128m`、`otelcol cpus=0.25 mem_limit=128m`、`k6 cpus=1.0 mem_limit=512m`。SpacetimeDB 同时设置 `--page_pool_max_size=402653184`,给 reducer、订阅与运行时保留更多非 page pool 内存。 +容器 `api-server` 默认 `GENARRATIVE_API_WORKER_THREADS=4`,用于让 Tokio 在 2 vCPU 配额内有更多 I/O 调度 worker;该值不会突破 compose 里的 `cpus=2.0` CPU 上限。 Collector 镜像使用 `otel/opentelemetry-collector-contrib:0.151.0`。 生产服务器若启用 Collector,则由 `deploy/systemd/otelcol-contrib.service` 和 `deploy/otelcol/genarrative-debug.yaml` 托管,不走容器镜像。 @@ -52,6 +53,10 @@ GENARRATIVE_SPACETIME_TOKEN= Linux Docker Engine 若要从宿主机 CLI 连到容器内服务,直接用 `http://127.0.0.1:13101`;容器内部服务之间统一走 `http://spacetimedb:3101`。 +## 构建工具链 + +`api-server` 容器镜像只构建 Linux release API 二进制,不构建 `spacetime-module`。当前 `api-server -> spacetime-client -> spacetimedb-sdk 2.2.0` 依赖链要求 Rust 1.93,因此 `deploy/container/api-server.Dockerfile` 的 Rust builder 固定为 `rust:1.93-bookworm`。如果本机 Docker Hub 拉取失败,可以先在本机准备同名本地 builder 镜像,但不要把临时 bootstrap 
容器或私有 registry 凭据写入仓库。 + ## 启动与验证 ```bash @@ -125,7 +130,19 @@ spacetime publish genarrative-loadtest --server http://127.0.0.1:13101 --module- 发布完成后再执行 `npm run container:up` 和 `npm run container:k6`。如果 `deploy/container/api-server.env` 里的 `GENARRATIVE_SPACETIME_DATABASE` 改成了别的库名,发布命令里的库名也要同步修改。 -如果要压 1000 HTTP req/s,把 `PEAK_RPS` 调到 `500`;如果要压 5000 HTTP req/s,把 `PEAK_RPS` 调到 `2500`,并同时提高 `PREALLOCATED_VUS` / `MAX_VUS`,观察是否先被带宽、Nginx `limit_conn` 或 api-server 背压限制。 +如果要压 1000 HTTP req/s,把 `PEAK_RPS` 调到 `500`;如果要压 5000 HTTP req/s,把 `PEAK_RPS` 调到 `2500`,并同时提高 `PREALLOCATED_VUS` / `MAX_VUS`,观察是否先被带宽、Nginx `limit_conn` / `limit_req` 或 api-server 分组背压限制。当前容器 Nginx 对公开 gallery list 使用 `genarrative_gallery_rps`,公开详情和普通 API 使用 `genarrative_api_rps`,后台 API 使用 `genarrative_admin_rps`;api-server 侧对应 `GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS`、`GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS` 和 `GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS`。 + +2026-05-19 的 2C / 2G 容器压测结论:公开 gallery list 的 `limit_conn=320` 与 `GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=320` 是当前较稳的上限。用宿主机 k6 打 `http://127.0.0.1:18080`,`PEAK_RPS=1000` 等价于约 2000 HTTP req/s 的两接口组合压测;320 档无 dropped iterations、无 5xx、无 OOM,约 `151710` 个 200 与 `34310` 个 429,200 请求 `request_time p95=0.292s`。继续抬到 336 / 352 不会有效吃满 api-server CPU,反而让 200 数量减少、p95 升到约 0.31s / 0.32s,SpacetimeDB 内存尾部逼近 `880MiB / 896MiB`,下游内存先到危险区。当前不要为了降低“剩余 CPU”继续抬公开列表并发;下一步应减少成功列表请求后的 SpacetimeDB tracking 写入或优化下游状态,而不是放大入口并发。 + +### 内存采样 + +排查 API 容器内存时,优先对比压测前后的 `/proc/$pid/smaps_rollup` 和 cgroup 当前/峰值,不把 Windows 任务管理器总占用当成单进程结论: + +```bash +docker exec genarrative-container-loadtest-api-server-1 sh -c 'pid=$(pidof api-server); grep VmRSS /proc/$pid/status; grep RssAnon /proc/$pid/status; cat /proc/$pid/smaps_rollup | grep Anonymous; echo cgroup_current=$(cat /sys/fs/cgroup/memory.current); echo cgroup_peak=$(cat /sys/fs/cgroup/memory.peak)' +``` + +`/healthz` 也能复现的内存尖峰应先按连接层、service clone 或 allocator 高水位排查,不要直接归因到 SpacetimeDB procedure、作品列表 
cache 或业务 DTO。2026-05-18 验证:`AppState` 改为 `Arc` 浅拷贝后,容器内直连 `api-server:8082/healthz` 的 500 HTTP req/s、`PREALLOCATED_VUS=100`、30 秒压测完成 `15001` 次请求,`http_req_failed=0`、`dropped_iterations=0`,API 进程 RSS 从约 18 MiB 升至约 52 MiB,cgroup 峰值约 47 MiB,未再出现 1 GiB 级尖峰。 ## OTLP diff --git a/deploy/container/api-server.Dockerfile b/deploy/container/api-server.Dockerfile index 5385b719..40897357 100644 --- a/deploy/container/api-server.Dockerfile +++ b/deploy/container/api-server.Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.88-bookworm AS rust-builder +FROM rust:1.93-bookworm AS rust-builder WORKDIR /workspace COPY server-rs ./server-rs @@ -36,6 +36,7 @@ COPY apps/admin-web/package.json ./apps/admin-web/package.json RUN npm ci COPY index.html metadata.json tsconfig.json vite.config.ts ./ +COPY scripts/vite-cli.mjs scripts/admin-web-build.mjs ./scripts/ COPY src ./src COPY public ./public COPY media ./media diff --git a/deploy/container/api-server.env.example b/deploy/container/api-server.env.example index c66e66ef..e2fad8c5 100644 --- a/deploy/container/api-server.env.example +++ b/deploy/container/api-server.env.example @@ -7,8 +7,11 @@ GENARRATIVE_API_HOST=0.0.0.0 GENARRATIVE_API_PORT=8082 GENARRATIVE_API_LOG=info,tower_http=info GENARRATIVE_API_LISTEN_BACKLOG=1024 -GENARRATIVE_API_WORKER_THREADS=2 +GENARRATIVE_API_WORKER_THREADS=4 GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512 +GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=320 +GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS=64 +GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS=16 GENARRATIVE_OTEL_ENABLED=false OTEL_SERVICE_NAME=genarrative-api diff --git a/deploy/container/docker-compose.loadtest.yml b/deploy/container/docker-compose.loadtest.yml index a6d3d5e4..29b6b73e 100644 --- a/deploy/container/docker-compose.loadtest.yml +++ b/deploy/container/docker-compose.loadtest.yml @@ -3,6 +3,7 @@ name: genarrative-container-loadtest services: spacetimedb: image: clockworklabs/spacetime:v2.2.0 + user: root command: [ "start", @@ -11,11 +12,11 
@@ services: "--data-dir", "/var/lib/spacetimedb", "--page_pool_max_size", - "536870912", + "402653184", "--non-interactive", ] cpus: "1.0" - mem_limit: 768m + mem_limit: 896m ports: - "${GENARRATIVE_CONTAINER_SPACETIME_PORT:-13101}:3101" volumes: @@ -73,7 +74,7 @@ services: context: ../.. dockerfile: deploy/container/api-server.Dockerfile target: nginx-runtime - cpus: "0.25" + cpus: "0.5" mem_limit: 128m depends_on: api-server: @@ -111,7 +112,7 @@ services: k6: image: grafana/k6:0.52.0 profiles: ["loadtest"] - cpus: "0.5" + cpus: "1.0" mem_limit: 512m depends_on: nginx: diff --git a/deploy/container/nginx.conf b/deploy/container/nginx.conf index 6e6d1094..d6f19c9c 100644 --- a/deploy/container/nginx.conf +++ b/deploy/container/nginx.conf @@ -21,6 +21,9 @@ http { } limit_conn_zone $binary_remote_addr zone=genarrative_api_conn:10m; + limit_req_zone $binary_remote_addr zone=genarrative_gallery_rps:10m rate=2400r/s; + limit_req_zone $binary_remote_addr zone=genarrative_api_rps:10m rate=300r/s; + limit_req_zone $binary_remote_addr zone=genarrative_admin_rps:10m rate=30r/s; sendfile on; keepalive_timeout 65; @@ -48,6 +51,8 @@ http { error_log /var/log/nginx/genarrative.error.log warn; limit_conn_status 429; limit_conn_log_level warn; + limit_req_status 429; + limit_req_log_level warn; root /srv/genarrative/web; index index.html; @@ -55,6 +60,7 @@ http { location ^~ /admin/api/ { default_type application/json; limit_conn genarrative_api_conn 64; + limit_req zone=genarrative_admin_rps burst=16 nodelay; proxy_pass http://genarrative_api/admin/api/; proxy_http_version 1.1; @@ -82,9 +88,90 @@ http { try_files $uri =404; } + location = /api/runtime/puzzle/gallery { + default_type application/json; + limit_conn genarrative_api_conn 320; + limit_req zone=genarrative_gallery_rps burst=256 nodelay; + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no 
always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + + location = /api/runtime/custom-world-gallery { + default_type application/json; + limit_conn genarrative_api_conn 320; + limit_req zone=genarrative_gallery_rps burst=256 nodelay; + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + + location ~ ^/api/runtime/puzzle/gallery/[^/]+$ { + default_type application/json; + limit_conn genarrative_api_conn 32; + limit_req zone=genarrative_api_rps burst=32 nodelay; + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + + location ~ ^/api/runtime/custom-world-gallery/[^/]+/[^/]+$ { + default_type application/json; + limit_conn genarrative_api_conn 32; + limit_req zone=genarrative_api_rps burst=32 nodelay; + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 
3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + location ~ ^/api(?:/|$) { default_type application/json; limit_conn genarrative_api_conn 64; + limit_req zone=genarrative_api_rps burst=64 nodelay; proxy_pass http://genarrative_api; proxy_http_version 1.1; diff --git a/deploy/env/api-server.env.example b/deploy/env/api-server.env.example index c0c4763e..373f142d 100644 --- a/deploy/env/api-server.env.example +++ b/deploy/env/api-server.env.example @@ -8,6 +8,9 @@ GENARRATIVE_API_LOG=info,tower_http=info GENARRATIVE_API_LISTEN_BACKLOG=1024 GENARRATIVE_API_WORKER_THREADS=4 GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512 +GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=64 +GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS=32 +GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS=16 GENARRATIVE_OTEL_ENABLED=false OTEL_SERVICE_NAME=genarrative-api OTEL_EXPORTER_OTLP_ENDPOINT=http://127.0.0.1:4318 diff --git a/deploy/nginx/genarrative-dev-http.conf b/deploy/nginx/genarrative-dev-http.conf index 6c9bede4..ed5ca13e 100644 --- a/deploy/nginx/genarrative-dev-http.conf +++ b/deploy/nginx/genarrative-dev-http.conf @@ -14,6 +14,9 @@ upstream genarrative_api { } limit_conn_zone $binary_remote_addr zone=genarrative_api_conn:10m; +limit_req_zone $binary_remote_addr zone=genarrative_gallery_rps:10m rate=650r/s; +limit_req_zone $binary_remote_addr zone=genarrative_api_rps:10m rate=300r/s; +limit_req_zone $binary_remote_addr zone=genarrative_admin_rps:10m rate=30r/s; server { listen 80; @@ -22,6 +25,8 @@ server { error_log /var/log/nginx/genarrative.error.log warn; limit_conn_status 429; limit_conn_log_level warn; + limit_req_status 429; + limit_req_log_level warn; gzip on; gzip_vary on; @@ 
-48,6 +53,7 @@ server { location ^~ /admin/api/ { default_type application/json; limit_conn genarrative_api_conn 64; + limit_req zone=genarrative_admin_rps burst=16 nodelay; if ($genarrative_maintenance) { return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; @@ -85,10 +91,107 @@ server { try_files $uri =404; } + location = /api/runtime/puzzle/gallery { + default_type application/json; + limit_conn genarrative_api_conn 64; + limit_req zone=genarrative_gallery_rps burst=64 nodelay; + + if ($genarrative_maintenance) { + return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; + } + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + + location = /api/runtime/custom-world-gallery { + default_type application/json; + limit_conn genarrative_api_conn 64; + limit_req zone=genarrative_gallery_rps burst=64 nodelay; + + if ($genarrative_maintenance) { + return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; + } + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + + location ~ ^/api/runtime/puzzle/gallery/[^/]+$ { + default_type 
application/json; + limit_conn genarrative_api_conn 32; + limit_req zone=genarrative_api_rps burst=32 nodelay; + + if ($genarrative_maintenance) { + return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; + } + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + + location ~ ^/api/runtime/custom-world-gallery/[^/]+/[^/]+$ { + default_type application/json; + limit_conn genarrative_api_conn 32; + limit_req zone=genarrative_api_rps burst=32 nodelay; + + if ($genarrative_maintenance) { + return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; + } + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + # 临时兼容主站仍在使用的 /api/* HTTP facade;前端完成 SpacetimeDB SDK 迁移后删除。 location ~ ^/api(?:/|$) { default_type application/json; limit_conn genarrative_api_conn 64; + limit_req zone=genarrative_api_rps burst=64 nodelay; if ($genarrative_maintenance) { return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; diff --git a/deploy/nginx/genarrative.conf b/deploy/nginx/genarrative.conf index 984dd130..788a1e0d 100644 --- a/deploy/nginx/genarrative.conf +++ 
b/deploy/nginx/genarrative.conf @@ -12,6 +12,9 @@ upstream genarrative_api { } limit_conn_zone $binary_remote_addr zone=genarrative_api_conn:10m; +limit_req_zone $binary_remote_addr zone=genarrative_gallery_rps:10m rate=650r/s; +limit_req_zone $binary_remote_addr zone=genarrative_api_rps:10m rate=300r/s; +limit_req_zone $binary_remote_addr zone=genarrative_admin_rps:10m rate=30r/s; server { listen 80; @@ -20,6 +23,8 @@ server { error_log /var/log/nginx/genarrative.error.log warn; limit_conn_status 429; limit_conn_log_level warn; + limit_req_status 429; + limit_req_log_level warn; location /.well-known/acme-challenge/ { root /var/www/html; @@ -35,6 +40,10 @@ server { server_name genarrative.example.com; access_log /var/log/nginx/genarrative.access.log genarrative_upstream; error_log /var/log/nginx/genarrative.error.log warn; + limit_conn_status 429; + limit_conn_log_level warn; + limit_req_status 429; + limit_req_log_level warn; gzip on; gzip_vary on; @@ -64,6 +73,7 @@ server { location ^~ /admin/api/ { default_type application/json; limit_conn genarrative_api_conn 64; + limit_req zone=genarrative_admin_rps burst=16 nodelay; if ($genarrative_maintenance) { return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; @@ -101,10 +111,107 @@ server { try_files $uri =404; } + location = /api/runtime/puzzle/gallery { + default_type application/json; + limit_conn genarrative_api_conn 64; + limit_req zone=genarrative_gallery_rps burst=64 nodelay; + + if ($genarrative_maintenance) { + return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; + } + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; 
+ proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + + location = /api/runtime/custom-world-gallery { + default_type application/json; + limit_conn genarrative_api_conn 64; + limit_req zone=genarrative_gallery_rps burst=64 nodelay; + + if ($genarrative_maintenance) { + return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; + } + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + + location ~ ^/api/runtime/puzzle/gallery/[^/]+$ { + default_type application/json; + limit_conn genarrative_api_conn 32; + limit_req zone=genarrative_api_rps burst=32 nodelay; + + if ($genarrative_maintenance) { + return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; + } + + proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + + location ~ ^/api/runtime/custom-world-gallery/[^/]+/[^/]+$ { + default_type application/json; + limit_conn genarrative_api_conn 32; + limit_req zone=genarrative_api_rps burst=32 nodelay; + + if ($genarrative_maintenance) { + return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; + } + + 
proxy_pass http://genarrative_api; + proxy_http_version 1.1; + proxy_buffering off; + proxy_read_timeout 3600s; + proxy_send_timeout 3600s; + add_header X-Accel-Buffering no always; + proxy_set_header Connection ""; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header X-Forwarded-Host $host; + proxy_set_header X-Request-Id $request_id; + } + # 临时兼容主站仍在使用的 /api/* HTTP facade;前端完成 SpacetimeDB SDK 迁移后删除。 location ~ ^/api(?:/|$) { default_type application/json; limit_conn genarrative_api_conn 64; + limit_req zone=genarrative_api_rps burst=64 nodelay; if ($genarrative_maintenance) { return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; diff --git a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md index 2ab0e347..6ef7fe0b 100644 --- a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md +++ b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md @@ -154,16 +154,16 @@ Jenkins 按 web / api / Spacetime module / build / deploy / publish 拆分 50 HTTP req/s 首版压测优化口径: - `api-server` 生产模板默认 `GENARRATIVE_API_LISTEN_BACKLOG=1024`、`GENARRATIVE_API_WORKER_THREADS=4`;本地未设置 worker threads 时继续使用 Tokio 默认值。 -- `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512` 开启应用内 HTTP 并发背压,超过并发许可时直接返回 `429 Too Many Requests` 和 `Retry-After: 1`,`/healthz` 不受该限制。该值不是 RPS 限速;如果压测中 429 上升但内存和 p95 收敛,说明背压正在保护进程,需要结合真实容量调阈值或在 Nginx 前置限流。直连 `api-server` 的极高 RPS 压测若出现 `connection refused`,通常已经打到 TCP 监听 / accept 层,应同时检查 backlog、Nginx upstream keepalive 和前置限流。 +- `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512` 开启应用内 HTTP 并发背压;`GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=64`、`GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS=32`、`GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS=16` 分别限制公开列表、公开详情和后台 API 热路径。超过许可时直接返回 `429 Too Many Requests` 和 `Retry-After: 1`,`/healthz` 不受该限制。这些值不是 RPS 限速;如果压测中 429 上升但内存和 p95 收敛,说明背压正在保护进程。直连 `api-server` 的极高 RPS 压测若出现 
`connection refused`,通常已经打到 TCP 监听 / accept 层,应同时检查 backlog、Nginx upstream keepalive 和前置限流。 - `genarrative-api.service` 设置 `LimitNOFILE=65535`、`TasksMax=2048`;上线后用 `systemctl show genarrative-api.service -p LimitNOFILE -p TasksMax` 和 `cat /proc/$(pidof api-server)/limits` 核对。 - Server provision 不在目标机下载 SpacetimeDB 或 `otelcol-contrib`。Jenkins 的 `Prepare Provision Tools` 阶段在 `linux && genarrative-build` 构建机执行 `scripts/prepare-server-provision-tools.sh`,通过官方 SpacetimeDB 安装入口 `https://install.spacetimedb.com` 和 OpenTelemetry release 包生成 `provision-tools/`,再通过 `stash/unstash` 上传到 release 部署 agent。目标机上的 `scripts/jenkins-server-provision.sh` 只从该工作区工具包安装 `/stdb/spacetime`、`/stdb/bin/current/*` 和 `/usr/local/bin/otelcol-contrib`。 - `otelcol-contrib.service` 作为可选系统服务加入 provision,默认监听 `127.0.0.1:4317/4318` 并使用 `deploy/otelcol/genarrative-debug.yaml`。api-server 是否发送 OTLP 仍由 `GENARRATIVE_OTEL_ENABLED` 控制,服务 unit 见 `deploy/systemd/otelcol-contrib.service`。 -- Nginx `/api/` 与 `/admin/api/` 通过 `genarrative_api` upstream 代理到 `127.0.0.1:8082`,upstream keepalive 为 64;压测时看 `/var/log/nginx/genarrative.access.log` 中的 `request_time`、`upstream_connect_time`、`upstream_header_time`、`upstream_response_time`、`upstream_status`、`request_id`。 +- Nginx `/api/` 与 `/admin/api/` 通过 `genarrative_api` upstream 代理到 `127.0.0.1:8082`,upstream keepalive 为 64;`limit_conn` 负责连接 / 并发保护,`limit_req` 负责入口 RPS 快拒绝。当前模板把公开 gallery list 单独放到 `genarrative_gallery_rps`,公开详情和普通 API 放到 `genarrative_api_rps`,后台 API 放到 `genarrative_admin_rps`;压测时看 `/var/log/nginx/genarrative.access.log` 中的 `request_time`、`upstream_connect_time`、`upstream_header_time`、`upstream_response_time`、`upstream_status`、`request_id`。 - 作品列表 K6 脚本一次 iteration 默认请求两个公开接口,因此约 50 HTTP req/s 的目标命令使用 `SCENARIO=spike START_RPS=5 PEAK_RPS=25 HOLD=60s END_RPS=5 DETAIL_RATIO=0 npm run loadtest:k6:works`。 - 作品列表短期继续由 `api-server` / BFF 订阅 SpacetimeDB 公开 read model 后读本地 cache,不让浏览器前端直接订阅完整列表;未来如新增 `public_work_gallery_entry` 等专用公开作品列表 read 
model,前端只可订阅稳定、低基数、公开的专用投影,禁止订阅 `puzzle_work_profile`、`custom_world_profile` 等玩法源表后自行 join、聚合或判断权限。前端直订阅落地前必须先补齐权限、字段契约、排序 / 分页、埋点和 BFF 回退策略。 - 50 HTTP req/s 验收目标为 `http_req_failed < 1%`、`p95 < 2s`、`dropped_iterations = 0`,同时压测窗口内 Nginx 无新增 502。 -容器化压测与隔离部署方案单独放在 `deploy/container/`,用于本机或预发模拟 Linux release + Nginx + OTLP Collector 拓扑,不替换当前生产 `systemd + Nginx + Jenkins` 发布路径。当前容器模拟参数按 `genarrative-release` 采样值收口为 2 vCPU / 2 GiB RAM / `nofile=4096` / `worker_connections=768`,并在 compose 里落实到 `spacetimedb cpus=1.0 mem_limit=768m`、`api-server cpus=2.0 mem_limit=1g`、`nginx cpus=0.25 mem_limit=128m`、`otelcol cpus=0.25 mem_limit=128m`、`k6 cpus=0.5 mem_limit=512m`: +容器化压测与隔离部署方案单独放在 `deploy/container/`,用于本机或预发模拟 Linux release + Nginx + OTLP Collector 拓扑,不替换当前生产 `systemd + Nginx + Jenkins` 发布路径。当前容器模拟参数按 `genarrative-release` 采样值收口为 2 vCPU / 2 GiB RAM / `nofile=4096` / `worker_connections=768`,并在 compose 里落实到 `spacetimedb cpus=1.0 mem_limit=896m`、`api-server cpus=2.0 mem_limit=1g`、`nginx cpus=0.5 mem_limit=128m`、`otelcol cpus=0.25 mem_limit=128m`、`k6 cpus=1.0 mem_limit=512m`。容器 `api-server` 默认 `GENARRATIVE_API_WORKER_THREADS=4`,只增加 Tokio worker 调度并发,不突破 `api-server cpus=2.0` 的 CPU 配额: ```bash npm run container:init @@ -185,8 +185,8 @@ OpenTelemetry 现阶段可选 OTLP traces / metrics / logs,但本地日志与 - api-server 当前发 OTLP HTTP,`OTEL_EXPORTER_OTLP_ENDPOINT` 指向 Collector HTTP base endpoint;不要改到 gRPC `4317` 或 Rider 端口,Rider 由 Collector 通过 `RIDER_OTLP_GRPC_ENDPOINT` 转发。 - 应用日志仍通过 `journalctl -u genarrative-api.service` 查看,Nginx 日志仍写文件;日志等级继续用 `GENARRATIVE_API_LOG` / `RUST_LOG` 控制,例如 `info,tower_http=info,spacetime_client=info`。 - debug exporter / Rider 转发都会同时接收 traces、metrics 和 logs。 -- api-server 会随 metrics 发送进程级指标:`process.memory.usage`、`process.memory.virtual`、`process.thread.count`、`genarrative.process.memory.private`;Windows 额外发送 `process.windows.handle.count`,Linux 额外发送 `process.unix.file_descriptor.count`。这些指标只描述当前进程,不携带请求、用户或作品 label。 -- HTTP 运行态补充发送 `genarrative.http.server.response_bodies.in_flight` 与 `genarrative.http.server.request_permits.available`,用于区分业务 handler / 背压 permit 是否仍被占用;拼图广场热点缓存补充发送 `genarrative.puzzle_gallery.cache.*` 指标,记录命中、未命中、重建耗时和预序列化 data JSON 字节数。 +- api-server 会随 metrics 发送进程级指标:`process.memory.usage`、`process.memory.virtual`、`process.cpu.time`、`genarrative.process.cpu.usage_percent`、`process.thread.count`、`genarrative.process.memory.private`;Windows 额外发送 `process.windows.handle.count`,Linux 额外发送 `process.unix.file_descriptor.count`。这些指标只描述当前进程,不携带请求、用户或作品 label。 +- HTTP 运行态补充发送
`genarrative.http.server.response_bodies.in_flight` 与 `genarrative.http.server.request_permits.available`,用于区分业务 handler / 背压 permit 是否仍被占用;拼图广场热点缓存补充发送 `genarrative.puzzle_gallery.cache.*` 指标,记录命中、未命中、重建耗时和预序列化 data JSON 字节数。 +- api-server 会随 metrics 发送进程级指标:`process.memory.usage`、`process.memory.virtual`、`process.cpu.time`、`genarrative.process.cpu.usage_percent`、`process.thread.count`、`genarrative.process.memory.private`;Windows 额外发送 `process.windows.handle.count`,Linux 额外发送 `process.unix.file_descriptor.count`。这些指标只描述当前进程,不携带请求、用户或作品 label。 +- HTTP 运行态补充发送 `genarrative.http.server.response_bodies.in_flight` 与 `genarrative.http.server.request_permits.available`,后者带低基数 `pool=default|gallery|detail|admin` label,用于区分业务 handler / 背压 permit 是否仍被占用;拼图广场热点缓存补充发送 `genarrative.puzzle_gallery.cache.*` 指标,记录 fresh hit、stale hit、未命中、后台刷新开始 / 失败、重建耗时和预序列化 data JSON 字节数。 - SpacetimeDB 观测分为两类:procedure / reducer 调用继续用 `genarrative.spacetime.procedure.*`,订阅本地 cache 读使用 `genarrative.spacetime.read.*`。`read=list_puzzle_gallery` 表示拼图广场当前从 `puzzle_gallery_card_view` 本地 cache 读取,不再每个 HTTP 请求调用 `list_puzzle_gallery` procedure。 - 本地 Windows 直连压测的内存高水位要结合 K6 VU / 连接数解释。250 RPS 下过高 `PREALLOCATED_VUS` 可能让 300 个本地 Established 连接把 `api-server` private memory 瞬时推到 GB 级,且 `/healthz` 小响应也能复现;若压测结束后回落、`response_bodies.in_flight` 和背压 permit 未显示业务积压,应优先按连接 / 发送链路高水位处理,而不是判断为 SpacetimeDB 或 JSON 缓存泄漏。 - Rider 的 Logs 面板只展示 log event 自身字段,不会自动展开父 span 的全部 attributes;请求完成日志会直接带 `request_id`、`http.request.method`、`http.route`、`url.scheme`、`url.path`、`http.response.status_code`、`status_class`、`latency_ms` 和 `slow_request`,完整链路继续到 Traces 面板按 trace/span 查看。 diff --git a/scripts/loadtest/README.md b/scripts/loadtest/README.md index ef2e0307..cb2d38f1 100644 --- a/scripts/loadtest/README.md +++ b/scripts/loadtest/README.md @@ -226,7 +226,7 @@ npm run loadtest:k6:works ## 排障 - 如果公开 gallery 返回 `creation_entry_disabled` 或 503,检查本地 creation entry 配置是否禁用了对应入口。 -- 如果高压下返回 429,优先确认目标环境是否设置了 
`GENARRATIVE_API_MAX_CONCURRENT_REQUESTS`。429 表示 api-server 应用层背压已生效,不等同于业务错误;继续看内存、p95、`http_req_failed` 和 OTLP / Nginx timing 判断阈值是否偏低。 +- 如果高压下返回 429,优先确认目标环境是否设置了 `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS` 以及 `GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS`、`GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS`、`GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS`。429 表示 Nginx 或 api-server 背压已生效,不等同于业务错误;继续看内存、p95、`http_req_failed` 和 OTLP / Nginx timing 判断阈值是否偏低。 - 如果直连 `api-server` 压测出现 `connection refused` 或 status 0,说明压力已经打到 TCP 监听 / accept 层;此时同时检查 `GENARRATIVE_API_LISTEN_BACKLOG`、Nginx upstream keepalive 和是否需要在 Nginx 前置限流,不能只靠应用层背压解释。 - 如果个人作品列表返回 401,确认 `AUTH_TOKEN` 是当前 api-server 可识别的 access token。 - 如果详情全部 404,确认是否已向目标环境导入与 `WORKS_DATA` 一致的数据。 @@ -317,12 +317,14 @@ Rider 的 Logs 面板展示的是 OTLP log event 自身字段,不会自动把 - `process.memory.usage`:进程常驻内存 / RSS。 - `process.memory.virtual`:进程虚拟内存;Windows 当前按 `PrivateUsage` 上报,Linux 取 `VmSize`。 - `genarrative.process.memory.private`:进程私有内存,Windows 来自 `PrivateUsage`,Linux 近似取 `/proc/self/status` 的 `VmData`。 +- `process.cpu.time`:进程 user + system 累计 CPU 秒数。 +- `genarrative.process.cpu.usage_percent`:两次指标采集之间的进程 CPU 使用率;100% 约等于占满 1 个 CPU core。 - `process.thread.count`:线程数。 - `process.windows.handle.count`:Windows 句柄数。 - `process.unix.file_descriptor.count`:Linux 文件描述符数。 - `genarrative.http.server.response_bodies.in_flight`:Axum / Hyper 仍持有的响应 body 数;如果内存高但该值很低,说明热点不在业务 handler 生命周期内。 -- `genarrative.http.server.request_permits.available`:应用层 HTTP 背压剩余 permit 数;如果该值未接近 0,说明没有打满 `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS`。 -- `genarrative.puzzle_gallery.cache.hits` / `genarrative.puzzle_gallery.cache.misses` / `genarrative.puzzle_gallery.cache.rebuilds`:拼图广场响应缓存命中、未命中和重建次数。 +- `genarrative.http.server.request_permits.available`:应用层 HTTP 背压剩余 permit 数,带 `pool=default|gallery|detail|admin`;如果目标 pool 未接近 0,说明没有打满对应 `GENARRATIVE_API_*_MAX_CONCURRENT_REQUESTS`。 +- `genarrative.puzzle_gallery.cache.hits` / 
`genarrative.puzzle_gallery.cache.stale_hits` / `genarrative.puzzle_gallery.cache.misses` / `genarrative.puzzle_gallery.cache.refreshes_started` / `genarrative.puzzle_gallery.cache.refreshes_failed` / `genarrative.puzzle_gallery.cache.rebuilds`:拼图广场响应缓存 fresh 命中、stale 命中、未命中、后台刷新和重建次数。 - `genarrative.puzzle_gallery.cache.rebuild.duration`:拼图广场缓存重建耗时。 - `genarrative.puzzle_gallery.cache.data_json_bytes`:拼图广场缓存内预序列化 data JSON 大小。 - `genarrative.spacetime.read.calls` / `genarrative.spacetime.read.duration_ms`:SpacetimeDB 订阅本地 cache 读次数和耗时;`read=list_puzzle_gallery` 表示当前路径走 view / local cache,不是 procedure。 @@ -336,7 +338,7 @@ Rider 的 Logs 面板展示的是 OTLP log event 自身字段,不会自动把 ```bash systemctl show genarrative-api.service -p LimitNOFILE -p TasksMax cat /proc/$(pidof api-server)/limits -tr '\0' '\n' < /proc/$(pidof api-server)/environ | grep GENARRATIVE_API_MAX_CONCURRENT_REQUESTS +tr '\0' '\n' < /proc/$(pidof api-server)/environ | grep 'GENARRATIVE_API_.*MAX_CONCURRENT_REQUESTS' ss -ltnp | grep 8082 curl -sS http://127.0.0.1:8082/healthz ``` diff --git a/scripts/loadtest/data/works-list.sample.json b/scripts/loadtest/data/works-list.sample.json index 250157bd..d60cf0c0 100644 --- a/scripts/loadtest/data/works-list.sample.json +++ b/scripts/loadtest/data/works-list.sample.json @@ -1,10 +1,12 @@ { - "source": "spacetime-migration-7.local.json", - "generatedAt": "2026-05-11T13:09:51.569Z", + "source": "spacetime-migration-1.json", + "generatedAt": "2026-05-18T11:54:04.280Z", "counts": { "puzzle_work_profile": 3, "custom_world_profile": 1, - "match3d_work_profile": 0 + "match3d_work_profile": 0, + "square_hole_work_profile": 0, + "visual_novel_work_profile": 0 }, "tables": { "puzzle_work_profile": [ @@ -113,7 +115,9 @@ } } ], - "match3d_work_profile": [] + "match3d_work_profile": [], + "square_hole_work_profile": [], + "visual_novel_work_profile": [] }, "profileIds": { "puzzle": [ diff --git a/scripts/loadtest/k6-works-list.js b/scripts/loadtest/k6-works-list.js index 
67d6abd0..95f0d212 100644 --- a/scripts/loadtest/k6-works-list.js +++ b/scripts/loadtest/k6-works-list.js @@ -137,12 +137,12 @@ function unwrapPayload(json) { } function hasCollection(payload, keys) { - return keys.some((key) => Array.isArray(payload?.[key])); + return Boolean(payload) && keys.some((key) => Array.isArray(payload[key])); } function firstCollection(payload, keys) { for (const key of keys) { - if (Array.isArray(payload?.[key])) return payload[key]; + if (payload && Array.isArray(payload[key])) return payload[key]; } return []; } @@ -152,10 +152,11 @@ function hasListItemShape(payload, keys) { if (collection.length === 0) return true; const item = collection[0]; const hasId = Boolean( - item?.profileId || item?.profile_id || item?.workId || item?.work_id || item?.publicWorkCode, + item && + (item.profileId || item.profile_id || item.workId || item.work_id || item.publicWorkCode), ); const hasTitle = Boolean( - item?.title || item?.workTitle || item?.work_title || item?.levelName || item?.worldName, + item && (item.title || item.workTitle || item.work_title || item.levelName || item.worldName), ); return hasId && hasTitle; } @@ -213,7 +214,8 @@ function performDetailRequest() { const payload = unwrapPayload(json); const ok = check(response, { [`${endpoint.name} status is 200`]: (res) => res.status === 200, - [`${endpoint.name} has detail payload`]: () => endpoint.expectKeys.some((key) => payload?.[key]), + [`${endpoint.name} has detail payload`]: () => + Boolean(payload) && endpoint.expectKeys.some((key) => payload[key]), }); worksDetailShapeErrorRate.add(!ok, { endpoint: endpoint.name }); } diff --git a/server-rs/crates/api-server/src/app.rs b/server-rs/crates/api-server/src/app.rs index ec886eb2..e5e4f27c 100644 --- a/server-rs/crates/api-server/src/app.rs +++ b/server-rs/crates/api-server/src/app.rs @@ -1,7 +1,7 @@ use axum::{ Router, body::Body, - extract::Extension, + extract::{Extension, FromRef}, http::Request, middleware, response::Response, @@ 
-22,7 +22,7 @@ use crate::{ request_context::{RequestContext, attach_request_context, resolve_request_id}, response_headers::propagate_request_id_header, runtime_inventory::get_runtime_inventory_state, - state::AppState, + state::{AppState, BackpressureState}, telemetry::record_http_observability, tracking::record_route_tracking_event_after_success, vector_engine_audio_generation::{ @@ -79,7 +79,7 @@ pub fn build_router(state: AppState) -> Router { )) // HTTP 背压在业务路由外侧快拒绝,避免过载请求继续占用 SpacetimeDB facade 与业务执行资源。 .layer(middleware::from_fn_with_state( - state.clone(), + BackpressureState::from_ref(&state), limit_concurrent_requests, )) // 错误归一化层放在 tracing 里侧,让 tracing 记录到最终对外返回的状态与错误体形态。 diff --git a/server-rs/crates/api-server/src/backpressure.rs b/server-rs/crates/api-server/src/backpressure.rs index 6f9c5122..3fc2b689 100644 --- a/server-rs/crates/api-server/src/backpressure.rs +++ b/server-rs/crates/api-server/src/backpressure.rs @@ -13,11 +13,11 @@ use tokio::sync::{OwnedSemaphorePermit, TryAcquireError}; use crate::{ http_error::AppError, request_context::RequestContext, - state::{AppState, HttpRequestPermitPool}, + state::{BackpressureState, HttpRequestPermitPool, HttpRequestPermitPoolKind}, }; pub async fn limit_concurrent_requests( - State(state): State, + State(state): State, request: Request, next: Next, ) -> Response { @@ -25,29 +25,38 @@ pub async fn limit_concurrent_requests( return next.run(request).await; } - let Some(permit_pool) = state.http_request_permit_pool() else { + let requested_pool = classify_request_permit_pool(request.uri().path()); + let Some((permit_pool_kind, permit_pool)) = state.request_permit_pool(requested_pool) else { return next.run(request).await; }; - match acquire_http_request_permit(permit_pool) { + match acquire_http_request_permit(permit_pool_kind, permit_pool) { Ok(permit) => hold_permit_until_response_body_dropped(next.run(request).await, permit), Err(_) => reject_overloaded_request(&request), } } fn 
acquire_http_request_permit( + permit_pool_kind: HttpRequestPermitPoolKind, permit_pool: Arc, ) -> Result { match permit_pool.clone().try_acquire_owned() { Ok(permit) => { - crate::telemetry::update_http_request_permits_available(permit_pool.available_permits()); + crate::telemetry::update_http_request_permits_available( + permit_pool_kind, + permit_pool.available_permits(), + ); Ok(HttpRequestPermitGuard { + permit_pool_kind, permit: Some(permit), permit_pool, }) } Err(error) => { - crate::telemetry::update_http_request_permits_available(permit_pool.available_permits()); + crate::telemetry::update_http_request_permits_available( + permit_pool_kind, + permit_pool.available_permits(), + ); Err(error) } } @@ -66,6 +75,7 @@ fn hold_permit_until_response_body_dropped( } struct HttpRequestPermitGuard { + permit_pool_kind: HttpRequestPermitPoolKind, permit: Option, permit_pool: Arc, } @@ -73,7 +83,10 @@ struct HttpRequestPermitGuard { impl Drop for HttpRequestPermitGuard { fn drop(&mut self) { drop(self.permit.take()); - crate::telemetry::update_http_request_permits_available(self.permit_pool.available_permits()); + crate::telemetry::update_http_request_permits_available( + self.permit_pool_kind, + self.permit_pool.available_permits(), + ); } } @@ -92,6 +105,44 @@ fn should_bypass_backpressure(request: &Request) -> bool { request.uri().path() == "/healthz" } +fn classify_request_permit_pool(path: &str) -> HttpRequestPermitPoolKind { + if is_gallery_list_path(path) { + HttpRequestPermitPoolKind::Gallery + } else if is_gallery_detail_path(path) { + HttpRequestPermitPoolKind::Detail + } else if path.starts_with("/admin/api/") { + HttpRequestPermitPoolKind::Admin + } else { + HttpRequestPermitPoolKind::Default + } +} + +fn is_gallery_list_path(path: &str) -> bool { + matches!( + path, + "/api/runtime/puzzle/gallery" | "/api/runtime/custom-world-gallery" + ) +} + +fn is_gallery_detail_path(path: &str) -> bool { + let puzzle_prefix = "/api/runtime/puzzle/gallery/"; + if let 
Some(profile_id) = path.strip_prefix(puzzle_prefix) { + return !profile_id.is_empty() && !profile_id.contains('/'); + } + + let custom_world_prefix = "/api/runtime/custom-world-gallery/"; + if let Some(remainder) = path.strip_prefix(custom_world_prefix) { + let mut segments = remainder.split('/'); + return matches!( + (segments.next(), segments.next(), segments.next()), + (Some(owner_user_id), Some(profile_id), None) + if !owner_user_id.is_empty() && !profile_id.is_empty() + ); + } + + false +} + #[cfg(test)] mod tests { use std::sync::Arc; @@ -107,9 +158,14 @@ mod tests { use tokio::sync::Notify; use tower::ServiceExt; - use crate::{config::AppConfig, state::AppState}; + use axum::extract::FromRef; - use super::limit_concurrent_requests; + use crate::{ + config::AppConfig, + state::{AppState, BackpressureState}, + }; + + use super::{classify_request_permit_pool, limit_concurrent_requests}; #[derive(Clone)] struct HeldRequestGate { @@ -138,13 +194,50 @@ mod tests { let mut config = AppConfig::default(); config.max_concurrent_requests = Some(max_concurrent_requests); let state = AppState::new(config).expect("state should build"); + let backpressure_state = BackpressureState::from_ref(&state); Router::new() .route("/held", get(held_request)) .route("/fast", get(fast_request)) .route("/healthz", get(fast_request)) .layer(middleware::from_fn_with_state( - state.clone(), + backpressure_state, + limit_concurrent_requests, + )) + .layer(Extension(gate)) + .with_state(state) + } + + fn build_grouped_test_app( + default_max_concurrent_requests: usize, + gallery_max_concurrent_requests: usize, + admin_max_concurrent_requests: usize, + gate: HeldRequestGate, + ) -> Router { + let mut config = AppConfig::default(); + config.max_concurrent_requests = Some(default_max_concurrent_requests); + config.gallery_max_concurrent_requests = Some(gallery_max_concurrent_requests); + config.admin_max_concurrent_requests = Some(admin_max_concurrent_requests); + let state = 
AppState::new(config).expect("state should build"); + let backpressure_state = BackpressureState::from_ref(&state); + + Router::new() + .route("/held", get(held_request)) + .route("/api/runtime/puzzle/gallery", get(held_request)) + .route("/api/runtime/custom-world-gallery", get(held_request)) + .route("/api/runtime/puzzle/gallery/profile-1", get(held_request)) + .route( + "/api/runtime/puzzle/gallery/profile-1/like", + get(fast_request), + ) + .route( + "/api/runtime/custom-world-gallery/user-1/profile-1", + get(held_request), + ) + .route("/admin/api/overview", get(held_request)) + .route("/fast", get(fast_request)) + .layer(middleware::from_fn_with_state( + backpressure_state, limit_concurrent_requests, )) .layer(Extension(gate)) @@ -242,4 +335,147 @@ mod tests { .expect("third request should complete"); assert_eq!(accepted_response.status(), StatusCode::OK); } + + #[tokio::test] + async fn gallery_pool_rejects_gallery_without_blocking_default_routes() { + let gate = HeldRequestGate { + entered: Arc::new(Notify::new()), + release: Arc::new(Notify::new()), + }; + let app = build_grouped_test_app(2, 1, 1, gate.clone()); + let entered = gate.entered.notified(); + + let held_response = tokio::spawn( + app.clone() + .oneshot(test_request("/api/runtime/puzzle/gallery")), + ); + entered.await; + + let rejected_gallery_response = app + .clone() + .oneshot(test_request("/api/runtime/custom-world-gallery")) + .await + .expect("rejected gallery request should complete"); + assert_eq!( + rejected_gallery_response.status(), + StatusCode::TOO_MANY_REQUESTS + ); + + let accepted_default_response = app + .clone() + .oneshot(test_request("/fast")) + .await + .expect("default request should complete"); + assert_eq!(accepted_default_response.status(), StatusCode::OK); + + gate.release.notify_one(); + let completed_response = held_response + .await + .expect("held request task should join") + .expect("held request should complete"); + assert_eq!(completed_response.status(), 
StatusCode::OK); + } + + #[tokio::test] + async fn detail_pool_falls_back_to_default_when_unset() { + let gate = HeldRequestGate { + entered: Arc::new(Notify::new()), + release: Arc::new(Notify::new()), + }; + let mut config = AppConfig::default(); + config.max_concurrent_requests = Some(1); + config.detail_max_concurrent_requests = None; + let state = AppState::new(config).expect("state should build"); + let backpressure_state = BackpressureState::from_ref(&state); + let app = Router::new() + .route("/api/runtime/puzzle/gallery/profile-1", get(held_request)) + .route("/fast", get(fast_request)) + .layer(middleware::from_fn_with_state( + backpressure_state, + limit_concurrent_requests, + )) + .layer(Extension(gate.clone())) + .with_state(state); + let entered = gate.entered.notified(); + + let held_response = tokio::spawn( + app.clone() + .oneshot(test_request("/api/runtime/puzzle/gallery/profile-1")), + ); + entered.await; + + let rejected_default_response = app + .clone() + .oneshot(test_request("/fast")) + .await + .expect("default request should complete"); + assert_eq!( + rejected_default_response.status(), + StatusCode::TOO_MANY_REQUESTS + ); + + gate.release.notify_one(); + let completed_response = held_response + .await + .expect("held request task should join") + .expect("held request should complete"); + assert_eq!(completed_response.status(), StatusCode::OK); + } + + #[tokio::test] + async fn admin_pool_is_isolated_from_default_routes() { + let gate = HeldRequestGate { + entered: Arc::new(Notify::new()), + release: Arc::new(Notify::new()), + }; + let app = build_grouped_test_app(2, 1, 1, gate.clone()); + let entered = gate.entered.notified(); + + let held_response = tokio::spawn(app.clone().oneshot(test_request("/admin/api/overview"))); + entered.await; + + let rejected_admin_response = app + .clone() + .oneshot(test_request("/admin/api/overview")) + .await + .expect("rejected admin request should complete"); + assert_eq!( + 
rejected_admin_response.status(), + StatusCode::TOO_MANY_REQUESTS + ); + + let accepted_default_response = app + .clone() + .oneshot(test_request("/fast")) + .await + .expect("default request should complete"); + assert_eq!(accepted_default_response.status(), StatusCode::OK); + + gate.release.notify_one(); + let completed_response = held_response + .await + .expect("held request task should join") + .expect("held request should complete"); + assert_eq!(completed_response.status(), StatusCode::OK); + } + + #[test] + fn classifies_only_exact_gallery_detail_paths_as_detail() { + assert_eq!( + classify_request_permit_pool("/api/runtime/puzzle/gallery/profile-1"), + crate::state::HttpRequestPermitPoolKind::Detail + ); + assert_eq!( + classify_request_permit_pool("/api/runtime/puzzle/gallery/profile-1/like"), + crate::state::HttpRequestPermitPoolKind::Default + ); + assert_eq!( + classify_request_permit_pool("/api/runtime/custom-world-gallery/user-1/profile-1"), + crate::state::HttpRequestPermitPoolKind::Detail + ); + assert_eq!( + classify_request_permit_pool("/api/runtime/custom-world-gallery/user-1/profile-1/like"), + crate::state::HttpRequestPermitPoolKind::Default + ); + } } diff --git a/server-rs/crates/api-server/src/config.rs b/server-rs/crates/api-server/src/config.rs index 13a62372..e2e965f6 100644 --- a/server-rs/crates/api-server/src/config.rs +++ b/server-rs/crates/api-server/src/config.rs @@ -23,6 +23,9 @@ pub struct AppConfig { pub listen_backlog: i32, pub worker_threads: Option, pub max_concurrent_requests: Option, + pub gallery_max_concurrent_requests: Option, + pub detail_max_concurrent_requests: Option, + pub admin_max_concurrent_requests: Option, pub log_filter: String, pub otel_enabled: bool, pub admin_username: Option, @@ -154,6 +157,9 @@ impl Default for AppConfig { listen_backlog: 1024, worker_threads: None, max_concurrent_requests: None, + gallery_max_concurrent_requests: None, + detail_max_concurrent_requests: None, + 
admin_max_concurrent_requests: None, log_filter: "info,tower_http=info".to_string(), otel_enabled: false, admin_username: None, @@ -322,6 +328,21 @@ impl AppConfig { { config.max_concurrent_requests = Some(max_concurrent_requests); } + if let Some(max_concurrent_requests) = + read_first_usize_env(&["GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS"]) + { + config.gallery_max_concurrent_requests = Some(max_concurrent_requests); + } + if let Some(max_concurrent_requests) = + read_first_usize_env(&["GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS"]) + { + config.detail_max_concurrent_requests = Some(max_concurrent_requests); + } + if let Some(max_concurrent_requests) = + read_first_usize_env(&["GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS"]) + { + config.admin_max_concurrent_requests = Some(max_concurrent_requests); + } if let Some(otel_enabled) = read_first_bool_env(&["GENARRATIVE_OTEL_ENABLED"]) { config.otel_enabled = otel_enabled; } @@ -1206,10 +1227,16 @@ mod tests { std::env::remove_var("GENARRATIVE_API_LISTEN_BACKLOG"); std::env::remove_var("GENARRATIVE_API_WORKER_THREADS"); std::env::remove_var("GENARRATIVE_API_MAX_CONCURRENT_REQUESTS"); + std::env::remove_var("GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS"); + std::env::remove_var("GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS"); + std::env::remove_var("GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS"); std::env::remove_var("GENARRATIVE_OTEL_ENABLED"); std::env::set_var("GENARRATIVE_API_LISTEN_BACKLOG", "2048"); std::env::set_var("GENARRATIVE_API_WORKER_THREADS", "6"); std::env::set_var("GENARRATIVE_API_MAX_CONCURRENT_REQUESTS", "128"); + std::env::set_var("GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS", "64"); + std::env::set_var("GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS", "32"); + std::env::set_var("GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS", "16"); std::env::set_var("GENARRATIVE_OTEL_ENABLED", "true"); } @@ -1217,12 +1244,18 @@ mod tests { assert_eq!(config.listen_backlog, 2048); 
assert_eq!(config.worker_threads, Some(6)); assert_eq!(config.max_concurrent_requests, Some(128)); + assert_eq!(config.gallery_max_concurrent_requests, Some(64)); + assert_eq!(config.detail_max_concurrent_requests, Some(32)); + assert_eq!(config.admin_max_concurrent_requests, Some(16)); assert!(config.otel_enabled); unsafe { std::env::remove_var("GENARRATIVE_API_LISTEN_BACKLOG"); std::env::remove_var("GENARRATIVE_API_WORKER_THREADS"); std::env::remove_var("GENARRATIVE_API_MAX_CONCURRENT_REQUESTS"); + std::env::remove_var("GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS"); + std::env::remove_var("GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS"); + std::env::remove_var("GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS"); std::env::remove_var("GENARRATIVE_OTEL_ENABLED"); } } diff --git a/server-rs/crates/api-server/src/process_metrics.rs b/server-rs/crates/api-server/src/process_metrics.rs index 5f27c8b8..4d3adad2 100644 --- a/server-rs/crates/api-server/src/process_metrics.rs +++ b/server-rs/crates/api-server/src/process_metrics.rs @@ -1,4 +1,7 @@ -use std::sync::OnceLock; +use std::{ + sync::{Mutex, OnceLock}, + time::Instant, +}; use opentelemetry::global; use tracing::warn; @@ -52,6 +55,38 @@ fn register_process_metrics_once() { }) .build(); + meter + .f64_observable_counter("process.cpu.time") + .with_unit("s") + .with_description("api-server total user plus system CPU time") + .with_callback(|observer| { + let Some(snapshot) = ProcessMetricsSnapshot::collect() else { + return; + }; + if let Some(cpu_time_seconds) = snapshot.cpu_time_seconds { + observer.observe(cpu_time_seconds, &[]); + } + }) + .build(); + + meter + .f64_observable_gauge("genarrative.process.cpu.usage_percent") + .with_unit("%") + .with_description("api-server process CPU usage between metric collections") + .with_callback(|observer| { + let Some(snapshot) = ProcessMetricsSnapshot::collect() else { + return; + }; + if let Some(cpu_time_seconds) = snapshot.cpu_time_seconds { + if let 
Some(usage_percent) = + process_cpu_usage_percent(cpu_time_seconds, Instant::now()) + { + observer.observe(usage_percent, &[]); + } + } + }) + .build(); + meter .i64_observable_up_down_counter("process.thread.count") .with_unit("{thread}") @@ -97,11 +132,12 @@ fn to_i64(value: u64) -> i64 { value.min(i64::MAX as u64) as i64 } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq)] struct ProcessMetricsSnapshot { rss_bytes: u64, private_bytes: Option, virtual_bytes: Option, + cpu_time_seconds: Option, thread_count: u64, windows_handle_count: Option, unix_fd_count: Option, @@ -111,12 +147,56 @@ impl ProcessMetricsSnapshot { fn collect() -> Option { collect_process_metrics() .inspect_err(|error| { - warn!(%error, "采集 api-server 进程内存指标失败"); + warn!(%error, "采集 api-server 进程指标失败"); }) .ok() } } +#[derive(Debug, Clone, Copy)] +struct CpuUsageSample { + cpu_time_seconds: f64, + observed_at: Instant, +} + +fn process_cpu_usage_percent(cpu_time_seconds: f64, observed_at: Instant) -> Option { + static LAST_SAMPLE: OnceLock>> = OnceLock::new(); + + let mut last_sample = LAST_SAMPLE.get_or_init(|| Mutex::new(None)).lock().ok()?; + let previous = *last_sample; + *last_sample = Some(CpuUsageSample { + cpu_time_seconds, + observed_at, + }); + + let previous = previous?; + let wall_delta_seconds = observed_at + .checked_duration_since(previous.observed_at)? 
+ .as_secs_f64(); + cpu_usage_ratio_between_samples( + previous.cpu_time_seconds, + cpu_time_seconds, + 0.0, + wall_delta_seconds, + ) + .map(|ratio| ratio * 100.0) +} + +fn cpu_usage_ratio_between_samples( + previous_cpu_seconds: f64, + current_cpu_seconds: f64, + previous_wall_seconds: f64, + current_wall_seconds: f64, +) -> Option { + let cpu_delta_seconds = current_cpu_seconds - previous_cpu_seconds; + let wall_delta_seconds = current_wall_seconds - previous_wall_seconds; + if cpu_delta_seconds < 0.0 || wall_delta_seconds <= 0.0 { + return None; + } + + Some(cpu_delta_seconds / wall_delta_seconds) +} + #[cfg(windows)] fn collect_process_metrics() -> Result { use windows_sys::Win32::{ @@ -149,16 +229,52 @@ fn collect_process_metrics() -> Result { Some(u64::from(handle_count)) }; + let cpu_time_seconds = windows_process_cpu_time_seconds(handle); + Ok(ProcessMetricsSnapshot { rss_bytes: counters.WorkingSetSize as u64, private_bytes: Some(counters.PrivateUsage as u64), virtual_bytes: Some(counters.PrivateUsage as u64), + cpu_time_seconds, thread_count: u64::from(unsafe { GetCurrentProcessId() }.thread_count()?), windows_handle_count: handle_count, unix_fd_count: None, }) } +#[cfg(windows)] +fn windows_process_cpu_time_seconds(handle: windows_sys::Win32::Foundation::HANDLE) -> Option { + use windows_sys::Win32::{ + Foundation::FILETIME, + System::Threading::GetProcessTimes, + }; + + let mut creation_time = FILETIME::default(); + let mut exit_time = FILETIME::default(); + let mut kernel_time = FILETIME::default(); + let mut user_time = FILETIME::default(); + let ok = unsafe { + GetProcessTimes( + handle, + &mut creation_time, + &mut exit_time, + &mut kernel_time, + &mut user_time, + ) + }; + if ok == 0 { + return None; + } + + let total_100ns = filetime_100ns(kernel_time) + filetime_100ns(user_time); + Some(total_100ns as f64 / 10_000_000.0) +} + +#[cfg(windows)] +fn filetime_100ns(filetime: windows_sys::Win32::Foundation::FILETIME) -> u64 { + 
((filetime.dwHighDateTime as u64) << 32) | u64::from(filetime.dwLowDateTime) +} + #[cfg(windows)] trait WindowsProcessThreadCount { fn thread_count(self) -> Result; @@ -207,6 +323,8 @@ fn collect_process_metrics() -> Result { .map_err(|error| format!("read /proc/self/status failed: {error}"))?; let statm = std::fs::read_to_string("/proc/self/statm") .map_err(|error| format!("read /proc/self/statm failed: {error}"))?; + let stat = std::fs::read_to_string("/proc/self/stat") + .map_err(|error| format!("read /proc/self/stat failed: {error}"))?; let page_size = linux_page_size_bytes()?; let rss_bytes = parse_status_kb(&status, "VmRSS:") @@ -218,6 +336,7 @@ fn collect_process_metrics() -> Result { .or_else(|| parse_statm_pages(&statm, 0).map(|value| value * page_size)) .ok_or_else(|| "missing VmSize/statm size field".to_string())?; let private_bytes = parse_status_kb(&status, "VmData:").map(|value| value * 1024); + let cpu_time_seconds = linux_cpu_time_seconds(&stat)?; let thread_count = parse_status_u64(&status, "Threads:") .ok_or_else(|| "missing Threads field".to_string())?; @@ -225,12 +344,52 @@ fn collect_process_metrics() -> Result { rss_bytes, private_bytes, virtual_bytes: Some(virtual_bytes), + cpu_time_seconds: Some(cpu_time_seconds), thread_count, windows_handle_count: None, unix_fd_count: linux_fd_count(), }) } +#[cfg(target_os = "linux")] +fn linux_cpu_time_seconds(stat: &str) -> Result { + let cpu_ticks = parse_linux_proc_stat_cpu_ticks(stat) + .ok_or_else(|| "missing /proc/self/stat utime/stime fields".to_string())?; + let ticks_per_second = linux_clock_ticks_per_second()?; + Ok(cpu_ticks as f64 / ticks_per_second as f64) +} + +#[cfg(target_os = "linux")] +fn linux_clock_ticks_per_second() -> Result { + static CLOCK_TICKS_PER_SECOND: OnceLock> = OnceLock::new(); + + CLOCK_TICKS_PER_SECOND + .get_or_init(|| { + let output = std::process::Command::new("getconf") + .arg("CLK_TCK") + .output() + .map_err(|error| format!("getconf CLK_TCK failed: {error}"))?; + 
if !output.status.success() { + return Err(format!("getconf CLK_TCK exited with {}", output.status)); + } + let text = String::from_utf8(output.stdout) + .map_err(|error| format!("getconf CLK_TCK output is not utf8: {error}"))?; + text.trim() + .parse::() + .map_err(|error| format!("parse CLK_TCK failed: {error}")) + }) + .clone() +} + +#[cfg(target_os = "linux")] +fn parse_linux_proc_stat_cpu_ticks(stat: &str) -> Option { + let fields_after_comm = stat.rsplit_once(") ")?.1; + let mut fields = fields_after_comm.split_whitespace(); + let utime = fields.nth(11)?.parse::().ok()?; + let stime = fields.next()?.parse::().ok()?; + Some(utime + stime) +} + #[cfg(target_os = "linux")] fn linux_page_size_bytes() -> Result { let output = std::process::Command::new("getconf") @@ -282,8 +441,12 @@ fn collect_process_metrics() -> Result { #[cfg(test)] mod tests { + use super::cpu_usage_ratio_between_samples; + #[cfg(target_os = "linux")] - use super::{parse_statm_pages, parse_status_kb, parse_status_u64}; + use super::{ + parse_linux_proc_stat_cpu_ticks, parse_statm_pages, parse_status_kb, parse_status_u64, + }; #[cfg(target_os = "linux")] #[test] @@ -303,4 +466,28 @@ mod tests { assert_eq!(parse_statm_pages("100 20 0 0 0 0 0", 1), Some(20)); assert_eq!(parse_statm_pages("100 20", 7), None); } + + #[cfg(target_os = "linux")] + #[test] + fn parses_linux_proc_stat_cpu_ticks_with_space_in_process_name() { + let stat = "123 (api server) S 1 2 3 4 5 6 7 8 9 10 120 30 0 0 20 0 18 0 12345"; + + assert_eq!(parse_linux_proc_stat_cpu_ticks(stat), Some(150)); + } + + #[test] + fn cpu_usage_ratio_uses_cpu_time_delta_over_wall_time() { + assert_eq!( + cpu_usage_ratio_between_samples(10.0, 12.5, 100.0, 101.0), + Some(2.5) + ); + assert_eq!( + cpu_usage_ratio_between_samples(10.0, 9.0, 100.0, 101.0), + None + ); + assert_eq!( + cpu_usage_ratio_between_samples(10.0, 11.0, 100.0, 100.0), + None + ); + } } diff --git a/server-rs/crates/api-server/src/puzzle.rs 
b/server-rs/crates/api-server/src/puzzle.rs index 4619c613..da9b54d1 100644 --- a/server-rs/crates/api-server/src/puzzle.rs +++ b/server-rs/crates/api-server/src/puzzle.rs @@ -1534,6 +1534,13 @@ pub async fn list_puzzle_gallery( crate::telemetry::record_puzzle_gallery_cache_hit(); return Ok(puzzle_gallery_cached_json(&request_context, response)); } + + if let Some(response) = state.puzzle_gallery_cache().read_stale_response().await { + crate::telemetry::record_puzzle_gallery_cache_stale_hit(); + spawn_puzzle_gallery_cache_refresh(state.clone()); + return Ok(puzzle_gallery_cached_json(&request_context, response)); + } + crate::telemetry::record_puzzle_gallery_cache_miss(); let _rebuild_guard = state.puzzle_gallery_cache().acquire_rebuild_guard().await; if let Some(response) = state.puzzle_gallery_cache().read_fresh_response().await { @@ -1579,7 +1586,57 @@ pub async fn list_puzzle_gallery( cached_response.data_json_len(), ); - Ok(puzzle_gallery_cached_json(&request_context, cached_response)) + Ok(puzzle_gallery_cached_json( + &request_context, + cached_response, + )) +} + +fn spawn_puzzle_gallery_cache_refresh(state: AppState) { + let Some(rebuild_guard) = state + .puzzle_gallery_cache() + .try_acquire_owned_rebuild_guard() + else { + return; + }; + + crate::telemetry::record_puzzle_gallery_cache_refresh_started(); + tokio::spawn(async move { + let _rebuild_guard = rebuild_guard; + let rebuild_started_at = std::time::Instant::now(); + let refresh_result = async { + let items = state.spacetime_client().list_puzzle_gallery().await?; + let response = build_puzzle_gallery_window_response( + items + .into_iter() + .map(|item| map_puzzle_gallery_card_response(&state, item)) + .collect(), + ); + state + .puzzle_gallery_cache() + .store_response(response) + .await + .map_err(|error| SpacetimeClientError::Runtime(error.to_string())) + } + .await; + + match refresh_result { + Ok(cached_response) => { + crate::telemetry::record_puzzle_gallery_cache_rebuild( + 
rebuild_started_at.elapsed(), + cached_response.data_json_len(), + ); + } + Err(error) => { + crate::telemetry::record_puzzle_gallery_cache_refresh_failed(); + tracing::warn!( + provider = PUZZLE_GALLERY_PROVIDER, + error = %error, + "puzzle gallery cache background refresh failed" + ); + } + } + }); } pub async fn get_puzzle_gallery_detail( diff --git a/server-rs/crates/api-server/src/puzzle_gallery_cache.rs b/server-rs/crates/api-server/src/puzzle_gallery_cache.rs index a6b9eb7d..adb24caf 100644 --- a/server-rs/crates/api-server/src/puzzle_gallery_cache.rs +++ b/server-rs/crates/api-server/src/puzzle_gallery_cache.rs @@ -10,7 +10,7 @@ use shared_contracts::{ puzzle_works::PuzzleWorkSummaryResponse, }; use tokio::{ - sync::{Mutex, MutexGuard, RwLock}, + sync::{Mutex, MutexGuard, OwnedMutexGuard, RwLock}, time, }; @@ -69,6 +69,18 @@ impl PuzzleGalleryCache { }) } + pub async fn read_stale_response(&self) -> Option { + let guard = self.inner.read().await; + let entry = guard.as_ref()?; + Some(PuzzleGalleryCachedResponse { + data_json: entry.data_json.clone(), + }) + } + + pub fn try_acquire_owned_rebuild_guard(&self) -> Option> { + self.rebuild_lock.clone().try_lock_owned().ok() + } + pub async fn store_response( &self, response: PuzzleGalleryResponse, @@ -205,4 +217,36 @@ mod tests { assert!(!response.has_more); assert_eq!(response.next_cursor, None); } + + #[tokio::test] + async fn stale_response_remains_readable_after_fresh_ttl() { + let cache = PuzzleGalleryCache::new(); + let response = + build_puzzle_gallery_window_response((0..8).map(build_summary).collect::>()); + cache + .store_response(response) + .await + .expect("cache response should serialize"); + + { + let mut guard = cache.inner.write().await; + let entry = guard.as_mut().expect("cache entry should exist"); + entry.built_at = Instant::now() - PUZZLE_GALLERY_CACHE_TTL - Duration::from_secs(1); + } + + assert!(cache.read_fresh_response().await.is_none()); + 
assert!(cache.read_stale_response().await.is_some()); + } + + #[tokio::test] + async fn try_owned_rebuild_guard_allows_only_one_refresher() { + let cache = PuzzleGalleryCache::new(); + let first_guard = cache.try_acquire_owned_rebuild_guard(); + + assert!(first_guard.is_some()); + assert!(cache.try_acquire_owned_rebuild_guard().is_none()); + + drop(first_guard); + assert!(cache.try_acquire_owned_rebuild_guard().is_some()); + } } diff --git a/server-rs/crates/api-server/src/state.rs b/server-rs/crates/api-server/src/state.rs index 2e2e690e..5ae4244f 100644 --- a/server-rs/crates/api-server/src/state.rs +++ b/server-rs/crates/api-server/src/state.rs @@ -6,6 +6,7 @@ use std::{ time::{SystemTime, UNIX_EPOCH}, }; +use axum::extract::FromRef; use module_ai::{AiTaskService, InMemoryAiTaskStore}; use module_auth::{ AuthUserService, InMemoryAuthStore, PasswordEntryService, PhoneAuthService, @@ -39,13 +40,113 @@ const ADMIN_ROLE: &str = "admin"; pub type HttpRequestPermitPool = Semaphore; -// 当前阶段先保留最小共享状态壳,后续逐步接入配置、客户端与平台适配。 +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum HttpRequestPermitPoolKind { + Default, + Gallery, + Detail, + Admin, +} + +impl HttpRequestPermitPoolKind { + pub fn as_str(self) -> &'static str { + match self { + Self::Default => "default", + Self::Gallery => "gallery", + Self::Detail => "detail", + Self::Admin => "admin", + } + } +} + #[derive(Clone, Debug)] -pub struct AppState { +pub struct HttpRequestPermitPools { + default: Option>, + gallery: Option>, + detail: Option>, + admin: Option>, +} + +impl HttpRequestPermitPools { + fn from_config(config: &AppConfig) -> Self { + Self { + default: config + .max_concurrent_requests + .map(HttpRequestPermitPool::new) + .map(Arc::new), + gallery: config + .gallery_max_concurrent_requests + .map(HttpRequestPermitPool::new) + .map(Arc::new), + detail: config + .detail_max_concurrent_requests + .map(HttpRequestPermitPool::new) + .map(Arc::new), + admin: config + .admin_max_concurrent_requests + 
.map(HttpRequestPermitPool::new) + .map(Arc::new), + } + } + + pub fn pool( + &self, + kind: HttpRequestPermitPoolKind, + ) -> Option<(HttpRequestPermitPoolKind, Arc)> { + let selected = match kind { + HttpRequestPermitPoolKind::Default => self.default.clone(), + HttpRequestPermitPoolKind::Gallery => self.gallery.clone(), + HttpRequestPermitPoolKind::Detail => self.detail.clone(), + HttpRequestPermitPoolKind::Admin => self.admin.clone(), + }; + selected.map(|pool| (kind, pool)).or_else(|| { + self.default + .clone() + .map(|pool| (HttpRequestPermitPoolKind::Default, pool)) + }) + } +} + +#[derive(Clone, Debug)] +pub struct BackpressureState { + permit_pools: HttpRequestPermitPools, +} + +impl BackpressureState { + pub fn request_permit_pool( + &self, + kind: HttpRequestPermitPoolKind, + ) -> Option<(HttpRequestPermitPoolKind, Arc)> { + self.permit_pools.pool(kind) + } +} + +#[derive(Clone, Debug)] +pub struct AppState(Arc); + +impl std::ops::Deref for AppState { + type Target = AppStateInner; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl FromRef for BackpressureState { + fn from_ref(state: &AppState) -> Self { + Self { + permit_pools: state.http_request_permit_pools(), + } + } +} + +// Axum/Hyper 会在路由树和连接 service 上频繁 clone state;AppState 外层必须保持浅拷贝。 +#[derive(Debug)] +pub struct AppStateInner { // 配置会在后续中间件、路由和平台适配接入时逐步消费。 #[allow(dead_code)] pub config: AppConfig, - http_request_permit_pool: Option>, + http_request_permit_pools: HttpRequestPermitPools, auth_jwt_config: JwtConfig, admin_runtime: Option, refresh_cookie_config: RefreshCookieConfig, @@ -198,14 +299,11 @@ impl AppState { }); let llm_client = build_llm_client(&config)?; let creative_agent_gpt5_client = build_creative_agent_gpt5_client(&config)?; - let http_request_permit_pool = config - .max_concurrent_requests - .map(HttpRequestPermitPool::new) - .map(Arc::new); + let http_request_permit_pools = HttpRequestPermitPools::from_config(&config); - Ok(Self { + Ok(Self(Arc::new(AppStateInner 
{ config, - http_request_permit_pool, + http_request_permit_pools, auth_jwt_config, admin_runtime, refresh_cookie_config, @@ -232,7 +330,7 @@ impl AppState { creative_agent_sessions: Arc::new(Mutex::new(HashMap::new())), #[cfg(test)] test_runtime_snapshot_store: Arc::new(Mutex::new(HashMap::new())), - }) + }))) } pub fn auth_jwt_config(&self) -> &JwtConfig { @@ -247,8 +345,8 @@ impl AppState { &self.refresh_cookie_config } - pub fn http_request_permit_pool(&self) -> Option> { - self.http_request_permit_pool.clone() + pub fn http_request_permit_pools(&self) -> HttpRequestPermitPools { + self.http_request_permit_pools.clone() } pub async fn upsert_creation_entry_type_config( diff --git a/server-rs/crates/api-server/src/telemetry.rs b/server-rs/crates/api-server/src/telemetry.rs index 39643976..9c40fdd7 100644 --- a/server-rs/crates/api-server/src/telemetry.rs +++ b/server-rs/crates/api-server/src/telemetry.rs @@ -12,10 +12,14 @@ use std::sync::{ }; use tracing::{info, warn}; -use crate::{request_context::resolve_request_id, state::AppState}; +use crate::{ + request_context::resolve_request_id, + state::{AppState, HttpRequestPermitPoolKind}, +}; static HTTP_RESPONSE_BODY_IN_FLIGHT: AtomicI64 = AtomicI64::new(0); -static HTTP_REQUEST_PERMITS_AVAILABLE: OnceLock> = OnceLock::new(); +static HTTP_REQUEST_PERMITS_AVAILABLE: OnceLock = + OnceLock::new(); // 集中维护 api-server HTTP 观测,避免在 handler 中散落高基数字段或重复创建 instrument。 pub async fn record_http_observability( @@ -78,29 +82,42 @@ pub async fn record_http_observability( track_response_body_in_flight(response) } -pub(crate) fn update_http_request_permits_available(available: usize) { - let gauge = HTTP_REQUEST_PERMITS_AVAILABLE.get_or_init(|| { - let gauge = Arc::new(AtomicI64::new(0)); - register_http_request_permits_available_metric(gauge.clone()); - gauge - }); - gauge.store(available.min(i64::MAX as usize) as i64, Ordering::Relaxed); +pub(crate) fn update_http_request_permits_available( + pool: HttpRequestPermitPoolKind, + 
available: usize, +) { + HTTP_REQUEST_PERMITS_AVAILABLE + .get_or_init(register_http_request_permits_available_metric) + .store(pool, available); } pub(crate) fn record_puzzle_gallery_cache_hit() { puzzle_gallery_cache_metrics().hits.add(1, &[]); } +pub(crate) fn record_puzzle_gallery_cache_stale_hit() { + puzzle_gallery_cache_metrics().stale_hits.add(1, &[]); +} + pub(crate) fn record_puzzle_gallery_cache_miss() { puzzle_gallery_cache_metrics().misses.add(1, &[]); } -pub(crate) fn record_puzzle_gallery_cache_rebuild(duration: std::time::Duration, data_bytes: usize) { +pub(crate) fn record_puzzle_gallery_cache_refresh_started() { + puzzle_gallery_cache_metrics().refreshes_started.add(1, &[]); +} + +pub(crate) fn record_puzzle_gallery_cache_refresh_failed() { + puzzle_gallery_cache_metrics().refreshes_failed.add(1, &[]); +} + +pub(crate) fn record_puzzle_gallery_cache_rebuild( + duration: std::time::Duration, + data_bytes: usize, +) { let metrics = puzzle_gallery_cache_metrics(); metrics.rebuilds.add(1, &[]); - metrics - .rebuild_duration - .record(duration.as_secs_f64(), &[]); + metrics.rebuild_duration.record(duration.as_secs_f64(), &[]); metrics .data_json_bytes .record(data_bytes.min(u64::MAX as usize) as u64, &[]); @@ -125,12 +142,44 @@ struct HttpMetrics { struct PuzzleGalleryCacheMetrics { hits: Counter, + stale_hits: Counter, misses: Counter, + refreshes_started: Counter, + refreshes_failed: Counter, rebuilds: Counter, rebuild_duration: opentelemetry::metrics::Histogram, data_json_bytes: opentelemetry::metrics::Histogram, } +struct HttpRequestPermitsAvailableGauges { + default: Arc, + gallery: Arc, + detail: Arc, + admin: Arc, +} + +impl HttpRequestPermitsAvailableGauges { + fn new() -> Self { + Self { + default: Arc::new(AtomicI64::new(0)), + gallery: Arc::new(AtomicI64::new(0)), + detail: Arc::new(AtomicI64::new(0)), + admin: Arc::new(AtomicI64::new(0)), + } + } + + fn store(&self, pool: HttpRequestPermitPoolKind, available: usize) { + let value = 
available.min(i64::MAX as usize) as i64; + match pool { + HttpRequestPermitPoolKind::Default => &self.default, + HttpRequestPermitPoolKind::Gallery => &self.gallery, + HttpRequestPermitPoolKind::Detail => &self.detail, + HttpRequestPermitPoolKind::Admin => &self.admin, + } + .store(value, Ordering::Relaxed); + } +} + struct ResponseBodyInFlightGuard; impl Drop for ResponseBodyInFlightGuard { @@ -171,10 +220,22 @@ fn puzzle_gallery_cache_metrics() -> &'static PuzzleGalleryCacheMetrics { .u64_counter("genarrative.puzzle_gallery.cache.hits") .with_description("Puzzle gallery response cache hits") .build(), + stale_hits: meter + .u64_counter("genarrative.puzzle_gallery.cache.stale_hits") + .with_description("Puzzle gallery stale response cache hits") + .build(), misses: meter .u64_counter("genarrative.puzzle_gallery.cache.misses") .with_description("Puzzle gallery response cache misses") .build(), + refreshes_started: meter + .u64_counter("genarrative.puzzle_gallery.cache.refreshes_started") + .with_description("Puzzle gallery background refresh start count") + .build(), + refreshes_failed: meter + .u64_counter("genarrative.puzzle_gallery.cache.refreshes_failed") + .with_description("Puzzle gallery background refresh failure count") + .build(), rebuilds: meter .u64_counter("genarrative.puzzle_gallery.cache.rebuilds") .with_description("Puzzle gallery response cache rebuild count") @@ -193,16 +254,49 @@ fn puzzle_gallery_cache_metrics() -> &'static PuzzleGalleryCacheMetrics { }) } -fn register_http_request_permits_available_metric(gauge: Arc) { +fn register_http_request_permits_available_metric() -> HttpRequestPermitsAvailableGauges { + let gauges = HttpRequestPermitsAvailableGauges::new(); let meter = global::meter("genarrative-api"); + let default_gauge = gauges.default.clone(); + let gallery_gauge = gauges.gallery.clone(); + let detail_gauge = gauges.detail.clone(); + let admin_gauge = gauges.admin.clone(); meter 
.i64_observable_up_down_counter("genarrative.http.server.request_permits.available") .with_unit("{permit}") .with_description("Available api-server HTTP backpressure permits") .with_callback(move |observer| { - observer.observe(gauge.load(Ordering::Relaxed), &[]); + observer.observe( + default_gauge.load(Ordering::Relaxed), + &[KeyValue::new( + "pool", + HttpRequestPermitPoolKind::Default.as_str(), + )], + ); + observer.observe( + gallery_gauge.load(Ordering::Relaxed), + &[KeyValue::new( + "pool", + HttpRequestPermitPoolKind::Gallery.as_str(), + )], + ); + observer.observe( + detail_gauge.load(Ordering::Relaxed), + &[KeyValue::new( + "pool", + HttpRequestPermitPoolKind::Detail.as_str(), + )], + ); + observer.observe( + admin_gauge.load(Ordering::Relaxed), + &[KeyValue::new( + "pool", + HttpRequestPermitPoolKind::Admin.as_str(), + )], + ); }) .build(); + gauges } pub(crate) fn register_http_runtime_metrics() { @@ -284,19 +378,13 @@ mod tests { observability_route("/api/runtime/puzzle/runs/run-123/history"), "/api/*" ); - assert_eq!( - observability_route("/admin/api/debug/http"), - "/admin/api/*" - ); + assert_eq!(observability_route("/admin/api/debug/http"), "/admin/api/*"); } #[test] fn resolve_request_scheme_uses_forwarded_proto_first_value() { let mut headers = HeaderMap::new(); - headers.insert( - "x-forwarded-proto", - HeaderValue::from_static("https, http"), - ); + headers.insert("x-forwarded-proto", HeaderValue::from_static("https, http")); assert_eq!(resolve_request_scheme(&headers), "https"); } diff --git a/server-rs/crates/module-auth/src/lib.rs b/server-rs/crates/module-auth/src/lib.rs index 9aabbda4..7d1c3111 100644 --- a/server-rs/crates/module-auth/src/lib.rs +++ b/server-rs/crates/module-auth/src/lib.rs @@ -2060,6 +2060,7 @@ fn map_sms_provider_error_to_phone_error(error: SmsProviderError) -> PhoneAuthEr SmsProviderError::InvalidConfig(message) => { PhoneAuthError::SmsProviderInvalidConfig(message) } + SmsProviderError::InvalidVerifyCode => 
PhoneAuthError::InvalidVerifyCode, SmsProviderError::Upstream(message) => PhoneAuthError::SmsProviderUpstream(message), } } From 05a0f347224b4951d7d1b770cf9435aa7c4f5395 Mon Sep 17 00:00:00 2001 From: kdletters <61648117+kdletters@users.noreply.github.com> Date: Tue, 19 May 2026 01:47:13 +0800 Subject: [PATCH 3/6] perf(api-server): batch route tracking through local outbox --- .hermes/shared-memory/decision-log.md | 8 + .hermes/shared-memory/pitfalls.md | 8 + deploy/container/api-server.Dockerfile | 5 +- deploy/container/api-server.env.example | 5 + deploy/container/docker-compose.loadtest.yml | 2 + deploy/env/api-server.env.example | 5 + ...】server-rs与SpacetimeDB数据契约-2026-05-15.md | 2 + ...发运维】本地开发验证与生产运维-2026-05-15.md | 14 +- scripts/jenkins-server-provision.sh | 2 +- server-rs/crates/api-server/Cargo.toml | 2 +- server-rs/crates/api-server/src/config.rs | 59 ++ server-rs/crates/api-server/src/main.rs | 6 +- server-rs/crates/api-server/src/state.rs | 8 + server-rs/crates/api-server/src/telemetry.rs | 121 ++++ server-rs/crates/api-server/src/tracking.rs | 129 +++- .../crates/api-server/src/tracking_outbox.rs | 594 ++++++++++++++++++ server-rs/crates/module-runtime/src/domain.rs | 8 + .../crates/spacetime-client/src/mapper.rs | 27 + .../src/module_bindings/mod.rs | 4 + ...rd_tracking_events_and_return_procedure.rs | 59 ++ ...cking_event_batch_procedure_result_type.rs | 17 + .../crates/spacetime-client/src/runtime.rs | 29 + .../spacetime-module/src/runtime/profile.rs | 40 ++ 23 files changed, 1131 insertions(+), 23 deletions(-) create mode 100644 server-rs/crates/api-server/src/tracking_outbox.rs create mode 100644 server-rs/crates/spacetime-client/src/module_bindings/record_tracking_events_and_return_procedure.rs create mode 100644 server-rs/crates/spacetime-client/src/module_bindings/runtime_tracking_event_batch_procedure_result_type.rs diff --git a/.hermes/shared-memory/decision-log.md b/.hermes/shared-memory/decision-log.md index 060e73c7..0c062b55 100644 --- 
a/.hermes/shared-memory/decision-log.md +++ b/.hermes/shared-memory/decision-log.md @@ -551,3 +551,11 @@ - 影响范围:用户侧任务中心、后台任务配置、运营查询、埋点查询、钱包流水。 - 验证方式:非 `user` scope 的个人任务配置应被 API 和领域构造层拒绝;任务查询与埋点查询分别放在 `docs/operations/` 和 `docs/tracking/`。 - 关联文档:`PROFILE_TASK_AND_TRACKING_SYSTEM_2026-05-03.md`、`RUNTIME_PROFILE_TASK_SCOPE_2026-05-04.md`、`ANALYTICS_DATE_DIMENSION_IMPLEMENTATION_2026-05-04.md`。 + +## 普通 route tracking 先写本机 outbox 再批量入库 + +- 背景:公开作品列表压测中,成功响应后的全局 route tracking 会逐条调用 SpacetimeDB,导致数据库内存和事务压力先到边界。 +- 决策:普通 HTTP route tracking 先写入 `api-server` 本机 NDJSON outbox,后台按数量或时间阈值批量调用 SpacetimeDB;`daily_login`、`work_play_start`、支付、任务领奖、钱包等关键事件保持同步直写。 +- 默认阈值:每批 500 条或 1 秒 flush 一次;outbox 磁盘上限 256 MiB,超过后丢弃低价值 route 事件并记录指标 / 日志。 +- 影响范围:`api-server` tracking 中间件、SpacetimeDB tracking procedure、部署数据目录、OTLP 指标和运维排障。 +- 验证方式:数据库不可用时公开 route 请求不失败且 outbox 文件保留;恢复后批量写入成功并删除本地 sealed 文件;关键事件仍立即影响任务 / 统计。 diff --git a/.hermes/shared-memory/pitfalls.md b/.hermes/shared-memory/pitfalls.md index ccc90e04..f6a9e8d6 100644 --- a/.hermes/shared-memory/pitfalls.md +++ b/.hermes/shared-memory/pitfalls.md @@ -848,6 +848,14 @@ - 验证:宿主机 k6 打 `http://127.0.0.1:18080`,`PEAK_RPS=1000` 等价约 2000 HTTP req/s;320 档无 dropped iterations、无 5xx、无 OOM,200 请求 `request_time p95` 约 0.292s。336 / 352 档 p95 升到约 0.31s / 0.32s,SpacetimeDB 内存尾部可到约 `880MiB / 896MiB`。 - 关联:`deploy/container/nginx.conf`、`deploy/container/api-server.env.example`、`deploy/container/README.md`、`server-rs/crates/api-server/src/tracking.rs`。 +## tracking outbox 成功入库后删除 sealed 文件 + +- 现象:普通 route tracking 改为本机 outbox 后,容易误以为入库成功只需要清空文件内容。 +- 原因:清空文件会扩大崩溃窗口,进程在 truncate 和确认之间异常退出时可能丢失未确认事件。 +- 处理:当前 active NDJSON 达到数量或时间阈值后原子 rename 为 sealed 文件;后台批量 flush sealed 文件,SpacetimeDB 返回成功后直接删除该文件,失败则保留文件等待重试。sealed 文件如果出现无法解析的坏行,重命名为 `corrupt-*` 隔离并记录指标,避免阻塞后续批量入库。该路径是至少一次投递,重复事件由 `tracking_event.event_id` 幂等跳过。 +- 验证:模拟 SpacetimeDB 不可用时 sealed 文件保留;恢复后批量 procedure 成功,sealed 文件消失,`tracking_event` 与 `tracking_daily_stat` 均更新。 +- 
关联:`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`、`server-rs/crates/api-server/src/tracking.rs`、`server-rs/crates/spacetime-module/src/runtime/profile.rs`。 + ## 后台表查询展示 SpacetimeDB 枚举时不要套用 Option 解码 - 现象:后台“表查询”查看 `profile_recharge_order` 时,`kind` 和 `status` 显示为空数组 `[]`,例如充值订单原始行里 `points_60` 的类型和状态都不可读。 diff --git a/deploy/container/api-server.Dockerfile b/deploy/container/api-server.Dockerfile index 40897357..1a0c1eaa 100644 --- a/deploy/container/api-server.Dockerfile +++ b/deploy/container/api-server.Dockerfile @@ -15,7 +15,7 @@ RUN apt-get update && \ COPY --from=rust-builder /tmp/api-server /usr/local/bin/api-server -RUN mkdir -p /var/lib/genarrative/auth && \ +RUN mkdir -p /var/lib/genarrative/auth /var/lib/genarrative/tracking-outbox && \ chown -R genarrative:genarrative /srv/genarrative /var/lib/genarrative USER genarrative @@ -24,7 +24,8 @@ EXPOSE 8082 ENV GENARRATIVE_ENV=container \ GENARRATIVE_API_HOST=0.0.0.0 \ GENARRATIVE_API_PORT=8082 \ - GENARRATIVE_AUTH_STORE_PATH=/var/lib/genarrative/auth/auth-store.json + GENARRATIVE_AUTH_STORE_PATH=/var/lib/genarrative/auth/auth-store.json \ + GENARRATIVE_TRACKING_OUTBOX_DIR=/var/lib/genarrative/tracking-outbox CMD ["api-server"] diff --git a/deploy/container/api-server.env.example b/deploy/container/api-server.env.example index e2fad8c5..6c559c0e 100644 --- a/deploy/container/api-server.env.example +++ b/deploy/container/api-server.env.example @@ -12,6 +12,11 @@ GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512 GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=320 GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS=64 GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS=16 +GENARRATIVE_TRACKING_OUTBOX_ENABLED=true +GENARRATIVE_TRACKING_OUTBOX_DIR=/var/lib/genarrative/tracking-outbox +GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE=500 +GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS=1000 +GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES=268435456 GENARRATIVE_OTEL_ENABLED=false OTEL_SERVICE_NAME=genarrative-api diff --git 
a/deploy/container/docker-compose.loadtest.yml b/deploy/container/docker-compose.loadtest.yml index 29b6b73e..c7e00cbc 100644 --- a/deploy/container/docker-compose.loadtest.yml +++ b/deploy/container/docker-compose.loadtest.yml @@ -53,6 +53,7 @@ services: - "host.docker.internal:host-gateway" volumes: - api-auth-store:/var/lib/genarrative/auth + - api-tracking-outbox:/var/lib/genarrative/tracking-outbox ulimits: nofile: soft: 4096 @@ -138,4 +139,5 @@ services: volumes: spacetime-data: api-auth-store: + api-tracking-outbox: nginx-logs: diff --git a/deploy/env/api-server.env.example b/deploy/env/api-server.env.example index 373f142d..bd265993 100644 --- a/deploy/env/api-server.env.example +++ b/deploy/env/api-server.env.example @@ -11,6 +11,11 @@ GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512 GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=64 GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS=32 GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS=16 +GENARRATIVE_TRACKING_OUTBOX_ENABLED=true +GENARRATIVE_TRACKING_OUTBOX_DIR=/var/lib/genarrative/tracking-outbox +GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE=500 +GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS=1000 +GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES=268435456 GENARRATIVE_OTEL_ENABLED=false OTEL_SERVICE_NAME=genarrative-api OTEL_EXPORTER_OTLP_ENDPOINT=http://127.0.0.1:4318 diff --git a/docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md b/docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md index 2ac833a4..decb4f96 100644 --- a/docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md +++ b/docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md @@ -596,11 +596,13 @@ npm run check:server-rs-ddd - Rust 结构体:`TrackingDailyStat` - 源码:`server-rs/crates/spacetime-module/src/runtime/profile.rs` +- 写入:由单条或批量 tracking procedure 在同一事务中随 `tracking_event` 更新,作为运营查询和个人任务进度的聚合投影。 ### `tracking_event` - Rust 结构体:`TrackingEvent` - 源码:`server-rs/crates/spacetime-module/src/runtime/profile.rs` +- 写入:关键业务埋点同步调用单条 procedure;普通 HTTP route tracking 由 `api-server` 本机 outbox 
批量调用 `record_tracking_events_and_return`。`event_id` 必须稳定且全局唯一,批量重试时用唯一索引做幂等跳过。 ### `treasure_record` diff --git a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md index 6ef7fe0b..0e1e3ad1 100644 --- a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md +++ b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md @@ -233,7 +233,7 @@ cargo test -p platform-auth --manifest-path server-rs/Cargo.toml aliyun_send_sms ## 埋点与运营查询 -用户行为埋点原始事实写入 `tracking_event`,聚合投影写入 `tracking_daily_stat`。任务配置、进度、领奖、钱包流水分别写入: +用户行为埋点原始事实写入 `tracking_event`,聚合投影写入 `tracking_daily_stat`。高频 HTTP route tracking 不直接阻塞请求链路:`api-server` 将普通 route tracking 先写入本机 tracking outbox,再由后台 worker 按数量或时间阈值批量写入 SpacetimeDB;`daily_login`、作品游玩 `work_play_start`、付费、任务领奖和钱包相关关键事件继续同步直写数据库,避免用户任务进度、游玩统计或支付状态出现可感知延迟。任务配置、进度、领奖、钱包流水分别写入: - `profile_task_config` - `profile_task_progress` @@ -242,6 +242,18 @@ cargo test -p platform-auth --manifest-path server-rs/Cargo.toml aliyun_send_sms 个人任务首版 scope 仅支持 `user`。后台、RPG、大鱼吃小鱼、Visual Novel、Story、Combat 等特定链路按 tracking 中间件排除规则处理;作品游玩统一使用 `work_play_start`。 +tracking outbox 默认配置: + +```env +GENARRATIVE_TRACKING_OUTBOX_ENABLED=true +GENARRATIVE_TRACKING_OUTBOX_DIR=/var/lib/genarrative/tracking-outbox +GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE=500 +GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS=1000 +GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES=268435456 +``` + +outbox 采用 NDJSON 文件保存原始事件。达到 `BATCH_SIZE` 或 `FLUSH_INTERVAL_MS` 任一阈值后,当前 active 文件会被原子切换为 sealed 文件并进入批量 flush;SpacetimeDB 批量 procedure 返回成功后删除 sealed 文件,失败则保留文件并重试。`MAX_BYTES` 是磁盘保护阈值,不是 flush 阈值;超过后低价值 route tracking 可以被丢弃并记录日志 / 指标,关键同步事件不进入该丢弃路径。sealed 文件若出现无法解析的坏行,会重命名为 `corrupt-*` 隔离并记录 `genarrative.tracking_outbox.files.corrupt` 指标,避免一个坏文件阻塞后续批量入库。该机制提供至少一次投递语义,依赖 `tracking_event.event_id` 幂等跳过重复事件。 + 常用检查思路: ```sql diff --git a/scripts/jenkins-server-provision.sh b/scripts/jenkins-server-provision.sh index bbabf2bd..aeb0db57 100755 --- a/scripts/jenkins-server-provision.sh +++ b/scripts/jenkins-server-provision.sh @@ 
-569,7 +569,7 @@ echo "[server-provision] target=${DEPLOY_TARGET}, dry_run=${DRY_RUN}, nginx_conf run_cmd id install_build_dependencies install_sccache -run_cmd mkdir -p "${SPACETIME_ROOT}" "${RELEASE_ROOT}" "$(dirname "${CURRENT_LINK}")" "$(dirname "${WEB_LINK}")" /etc/genarrative /var/lib/genarrative/maintenance /var/lib/genarrative/auth +run_cmd mkdir -p "${SPACETIME_ROOT}" "${RELEASE_ROOT}" "$(dirname "${CURRENT_LINK}")" "$(dirname "${WEB_LINK}")" /etc/genarrative /var/lib/genarrative/maintenance /var/lib/genarrative/auth /var/lib/genarrative/tracking-outbox if ! id spacetimedb >/dev/null 2>&1; then run_cmd useradd --system --home-dir "${SPACETIME_ROOT}" --shell /usr/sbin/nologin spacetimedb diff --git a/server-rs/crates/api-server/Cargo.toml b/server-rs/crates/api-server/Cargo.toml index ce4ef1e6..b423be50 100644 --- a/server-rs/crates/api-server/Cargo.toml +++ b/server-rs/crates/api-server/Cargo.toml @@ -46,7 +46,7 @@ shared-kernel = { workspace = true } shared-logging = { workspace = true } socket2 = { workspace = true } spacetime-client = { workspace = true } -tokio = { workspace = true, features = ["macros", "rt-multi-thread", "net", "time", "sync"] } +tokio = { workspace = true, features = ["macros", "rt-multi-thread", "net", "time", "sync", "fs", "io-util"] } tokio-stream = { workspace = true } futures-util = { workspace = true } time = { workspace = true, features = ["formatting"] } diff --git a/server-rs/crates/api-server/src/config.rs b/server-rs/crates/api-server/src/config.rs index e2e965f6..041df9b5 100644 --- a/server-rs/crates/api-server/src/config.rs +++ b/server-rs/crates/api-server/src/config.rs @@ -26,6 +26,11 @@ pub struct AppConfig { pub gallery_max_concurrent_requests: Option, pub detail_max_concurrent_requests: Option, pub admin_max_concurrent_requests: Option, + pub tracking_outbox_enabled: bool, + pub tracking_outbox_dir: PathBuf, + pub tracking_outbox_batch_size: usize, + pub tracking_outbox_flush_interval: Duration, + pub 
tracking_outbox_max_bytes: u64, pub log_filter: String, pub otel_enabled: bool, pub admin_username: Option, @@ -160,6 +165,11 @@ impl Default for AppConfig { gallery_max_concurrent_requests: None, detail_max_concurrent_requests: None, admin_max_concurrent_requests: None, + tracking_outbox_enabled: true, + tracking_outbox_dir: PathBuf::from("server-rs/.data/tracking-outbox"), + tracking_outbox_batch_size: 500, + tracking_outbox_flush_interval: Duration::from_millis(1_000), + tracking_outbox_max_bytes: 256 * 1024 * 1024, log_filter: "info,tower_http=info".to_string(), otel_enabled: false, admin_username: None, @@ -343,6 +353,26 @@ impl AppConfig { { config.admin_max_concurrent_requests = Some(max_concurrent_requests); } + if let Some(enabled) = read_first_bool_env(&["GENARRATIVE_TRACKING_OUTBOX_ENABLED"]) { + config.tracking_outbox_enabled = enabled; + } + if let Some(dir) = read_first_non_empty_env(&["GENARRATIVE_TRACKING_OUTBOX_DIR"]) { + config.tracking_outbox_dir = PathBuf::from(dir); + } + if let Some(batch_size) = read_first_usize_env(&["GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE"]) + { + config.tracking_outbox_batch_size = batch_size; + } + if let Some(flush_interval_ms) = + read_first_positive_u64_env(&["GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS"]) + { + config.tracking_outbox_flush_interval = Duration::from_millis(flush_interval_ms); + } + if let Some(max_bytes) = + read_first_positive_u64_env(&["GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES"]) + { + config.tracking_outbox_max_bytes = max_bytes; + } if let Some(otel_enabled) = read_first_bool_env(&["GENARRATIVE_OTEL_ENABLED"]) { config.otel_enabled = otel_enabled; } @@ -1230,6 +1260,11 @@ mod tests { std::env::remove_var("GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS"); std::env::remove_var("GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS"); std::env::remove_var("GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS"); + std::env::remove_var("GENARRATIVE_TRACKING_OUTBOX_ENABLED"); + 
std::env::remove_var("GENARRATIVE_TRACKING_OUTBOX_DIR"); + std::env::remove_var("GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE"); + std::env::remove_var("GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS"); + std::env::remove_var("GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES"); std::env::remove_var("GENARRATIVE_OTEL_ENABLED"); std::env::set_var("GENARRATIVE_API_LISTEN_BACKLOG", "2048"); std::env::set_var("GENARRATIVE_API_WORKER_THREADS", "6"); @@ -1237,6 +1272,14 @@ mod tests { std::env::set_var("GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS", "64"); std::env::set_var("GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS", "32"); std::env::set_var("GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS", "16"); + std::env::set_var("GENARRATIVE_TRACKING_OUTBOX_ENABLED", "false"); + std::env::set_var( + "GENARRATIVE_TRACKING_OUTBOX_DIR", + "/tmp/genarrative-tracking-outbox", + ); + std::env::set_var("GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE", "250"); + std::env::set_var("GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS", "2000"); + std::env::set_var("GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES", "1048576"); std::env::set_var("GENARRATIVE_OTEL_ENABLED", "true"); } @@ -1247,6 +1290,17 @@ mod tests { assert_eq!(config.gallery_max_concurrent_requests, Some(64)); assert_eq!(config.detail_max_concurrent_requests, Some(32)); assert_eq!(config.admin_max_concurrent_requests, Some(16)); + assert!(!config.tracking_outbox_enabled); + assert_eq!( + config.tracking_outbox_dir, + std::path::PathBuf::from("/tmp/genarrative-tracking-outbox") + ); + assert_eq!(config.tracking_outbox_batch_size, 250); + assert_eq!( + config.tracking_outbox_flush_interval, + std::time::Duration::from_millis(2_000) + ); + assert_eq!(config.tracking_outbox_max_bytes, 1_048_576); assert!(config.otel_enabled); unsafe { @@ -1256,6 +1310,11 @@ mod tests { std::env::remove_var("GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS"); std::env::remove_var("GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS"); 
std::env::remove_var("GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS"); + std::env::remove_var("GENARRATIVE_TRACKING_OUTBOX_ENABLED"); + std::env::remove_var("GENARRATIVE_TRACKING_OUTBOX_DIR"); + std::env::remove_var("GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE"); + std::env::remove_var("GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS"); + std::env::remove_var("GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES"); std::env::remove_var("GENARRATIVE_OTEL_ENABLED"); } } diff --git a/server-rs/crates/api-server/src/main.rs b/server-rs/crates/api-server/src/main.rs index 665f3526..01ed6555 100644 --- a/server-rs/crates/api-server/src/main.rs +++ b/server-rs/crates/api-server/src/main.rs @@ -55,8 +55,8 @@ mod password_entry; mod password_management; mod phone_auth; mod platform_errors; -mod profile_identity; mod process_metrics; +mod profile_identity; mod prompt; mod puzzle; mod puzzle_agent_turn; @@ -80,6 +80,7 @@ mod story_battles; mod story_sessions; mod telemetry; mod tracking; +mod tracking_outbox; mod vector_engine_audio_generation; mod visual_novel; mod volcengine_speech; @@ -154,6 +155,9 @@ async fn run_server(config: AppConfig) -> Result<(), io::Error> { .await .map_err(|error| std::io::Error::other(format!("初始化应用状态失败:{error}")))?; state.puzzle_gallery_cache().spawn_cleanup_task(); + if let Some(outbox) = state.tracking_outbox() { + outbox.spawn_worker(); + } let router = build_router(state); info!( diff --git a/server-rs/crates/api-server/src/state.rs b/server-rs/crates/api-server/src/state.rs index 5ae4244f..9249e4e5 100644 --- a/server-rs/crates/api-server/src/state.rs +++ b/server-rs/crates/api-server/src/state.rs @@ -33,6 +33,7 @@ use tracing::{info, warn}; use crate::config::AppConfig; use crate::puzzle_gallery_cache::PuzzleGalleryCache; +use crate::tracking_outbox::TrackingOutbox; use crate::wechat_pay::{WechatPayClient, map_wechat_pay_init_error}; use crate::wechat_provider::build_wechat_provider; @@ -167,6 +168,7 @@ pub struct AppStateInner { ai_task_service: AiTaskService, 
spacetime_client: SpacetimeClient, puzzle_gallery_cache: PuzzleGalleryCache, + tracking_outbox: Option>, llm_client: Option, creative_agent_gpt5_client: Option, creative_agent_executor: Arc, @@ -297,6 +299,7 @@ impl AppState { pool_size: config.spacetime_pool_size, procedure_timeout: config.spacetime_procedure_timeout, }); + let tracking_outbox = TrackingOutbox::from_config(&config, spacetime_client.clone()); let llm_client = build_llm_client(&config)?; let creative_agent_gpt5_client = build_creative_agent_gpt5_client(&config)?; let http_request_permit_pools = HttpRequestPermitPools::from_config(&config); @@ -324,6 +327,7 @@ impl AppState { ai_task_service, spacetime_client, puzzle_gallery_cache: PuzzleGalleryCache::new(), + tracking_outbox, llm_client, creative_agent_gpt5_client, creative_agent_executor: Arc::new(MockLangChainRustAgentExecutor), @@ -582,6 +586,10 @@ impl AppState { &self.puzzle_gallery_cache } + pub fn tracking_outbox(&self) -> Option> { + self.tracking_outbox.clone() + } + pub fn llm_client(&self) -> Option<&LlmClient> { self.llm_client.as_ref() } diff --git a/server-rs/crates/api-server/src/telemetry.rs b/server-rs/crates/api-server/src/telemetry.rs index 9c40fdd7..8c217634 100644 --- a/server-rs/crates/api-server/src/telemetry.rs +++ b/server-rs/crates/api-server/src/telemetry.rs @@ -18,6 +18,8 @@ use crate::{ }; static HTTP_RESPONSE_BODY_IN_FLIGHT: AtomicI64 = AtomicI64::new(0); +static TRACKING_OUTBOX_PENDING_BYTES: AtomicI64 = AtomicI64::new(0); +static TRACKING_OUTBOX_PENDING_FILES: AtomicI64 = AtomicI64::new(0); static HTTP_REQUEST_PERMITS_AVAILABLE: OnceLock = OnceLock::new(); @@ -123,6 +125,53 @@ pub(crate) fn record_puzzle_gallery_cache_rebuild( .record(data_bytes.min(u64::MAX as usize) as u64, &[]); } +pub(crate) fn record_tracking_outbox_enqueued() { + tracking_outbox_metrics().enqueued.add(1, &[]); +} + +pub(crate) fn record_tracking_outbox_dropped(reason: &'static str) { + tracking_outbox_metrics() + .dropped + .add(1, 
&[KeyValue::new("reason", reason)]); +} + +pub(crate) fn record_tracking_outbox_sealed(reason: &'static str) { + tracking_outbox_metrics() + .sealed_files + .add(1, &[KeyValue::new("reason", reason)]); +} + +pub(crate) fn record_tracking_outbox_corrupt_file() { + tracking_outbox_metrics().corrupt_files.add(1, &[]); +} + +pub(crate) fn record_tracking_outbox_flush( + duration: std::time::Duration, + accepted_count: u32, + file_bytes: u64, + failed: bool, +) { + let status_class = if failed { "error" } else { "ok" }; + let labels = [KeyValue::new("status_class", status_class)]; + let metrics = tracking_outbox_metrics(); + metrics.flushes.add(1, &labels); + metrics + .flush_duration + .record(duration.as_secs_f64(), &labels); + metrics + .flushed_events + .add(u64::from(accepted_count), &labels); + metrics.flushed_bytes.add(file_bytes, &labels); +} + +pub(crate) fn update_tracking_outbox_pending_bytes(bytes: u64) { + TRACKING_OUTBOX_PENDING_BYTES.store(bytes.min(i64::MAX as u64) as i64, Ordering::Relaxed); +} + +pub(crate) fn update_tracking_outbox_pending_files(files: usize) { + TRACKING_OUTBOX_PENDING_FILES.store(files.min(i64::MAX as usize) as i64, Ordering::Relaxed); +} + fn track_response_body_in_flight(response: Response) -> Response { response.map(|body| { HTTP_RESPONSE_BODY_IN_FLIGHT.fetch_add(1, Ordering::Relaxed); @@ -151,6 +200,17 @@ struct PuzzleGalleryCacheMetrics { data_json_bytes: opentelemetry::metrics::Histogram, } +struct TrackingOutboxMetrics { + enqueued: Counter, + dropped: Counter, + sealed_files: Counter, + corrupt_files: Counter, + flushes: Counter, + flush_duration: opentelemetry::metrics::Histogram, + flushed_events: Counter, + flushed_bytes: Counter, +} + struct HttpRequestPermitsAvailableGauges { default: Arc, gallery: Arc, @@ -254,6 +314,51 @@ fn puzzle_gallery_cache_metrics() -> &'static PuzzleGalleryCacheMetrics { }) } +fn tracking_outbox_metrics() -> &'static TrackingOutboxMetrics { + static METRICS: std::sync::OnceLock = 
std::sync::OnceLock::new(); + METRICS.get_or_init(|| { + let meter = global::meter("genarrative-api"); + TrackingOutboxMetrics { + enqueued: meter + .u64_counter("genarrative.tracking_outbox.events.enqueued") + .with_description("Tracking events appended to the local outbox") + .build(), + dropped: meter + .u64_counter("genarrative.tracking_outbox.events.dropped") + .with_description("Tracking events dropped by local outbox protection") + .build(), + sealed_files: meter + .u64_counter("genarrative.tracking_outbox.files.sealed") + .with_description("Tracking outbox active files sealed for flushing") + .build(), + corrupt_files: meter + .u64_counter("genarrative.tracking_outbox.files.corrupt") + .with_description( + "Tracking outbox sealed files quarantined because they could not be parsed", + ) + .build(), + flushes: meter + .u64_counter("genarrative.tracking_outbox.flushes") + .with_description("Tracking outbox sealed file flush attempts") + .build(), + flush_duration: meter + .f64_histogram("genarrative.tracking_outbox.flush.duration") + .with_unit("s") + .with_description("Tracking outbox sealed file flush duration") + .build(), + flushed_events: meter + .u64_counter("genarrative.tracking_outbox.events.flushed") + .with_description("Tracking events accepted by SpacetimeDB batch procedure") + .build(), + flushed_bytes: meter + .u64_counter("genarrative.tracking_outbox.bytes.flushed") + .with_unit("By") + .with_description("Tracking outbox bytes removed after successful flush") + .build(), + } + }) +} + fn register_http_request_permits_available_metric() -> HttpRequestPermitsAvailableGauges { let gauges = HttpRequestPermitsAvailableGauges::new(); let meter = global::meter("genarrative-api"); @@ -311,6 +416,22 @@ pub(crate) fn register_http_runtime_metrics() { observer.observe(HTTP_RESPONSE_BODY_IN_FLIGHT.load(Ordering::Relaxed), &[]); }) .build(); + meter + .i64_observable_up_down_counter("genarrative.tracking_outbox.pending.bytes") + .with_unit("By") + 
.with_description("Tracking outbox bytes waiting on local disk") + .with_callback(|observer| { + observer.observe(TRACKING_OUTBOX_PENDING_BYTES.load(Ordering::Relaxed), &[]); + }) + .build(); + meter + .i64_observable_up_down_counter("genarrative.tracking_outbox.pending.files") + .with_unit("{file}") + .with_description("Tracking outbox sealed files waiting for flush") + .with_callback(|observer| { + observer.observe(TRACKING_OUTBOX_PENDING_FILES.load(Ordering::Relaxed), &[]); + }) + .build(); }); } diff --git a/server-rs/crates/api-server/src/tracking.rs b/server-rs/crates/api-server/src/tracking.rs index 0f3aad21..ad3b187c 100644 --- a/server-rs/crates/api-server/src/tracking.rs +++ b/server-rs/crates/api-server/src/tracking.rs @@ -85,7 +85,7 @@ pub async fn record_route_tracking_event_after_success( draft.owner_user_id = draft.user_id.clone(); } - record_tracking_event_after_success(state, request_context, draft).await; + record_route_tracking_event_via_outbox_after_success(state, request_context, draft).await; } fn resolve_route_tracking_spec(method: &Method, path: &str) -> Option { @@ -524,26 +524,101 @@ pub async fn record_tracking_event_after_success( request_context: &RequestContext, draft: TrackingEventDraft, ) { - let occurred_at_micros = OffsetDateTime::now_utc().unix_timestamp_nanos() / 1_000; - let event_id = build_tracking_event_id(&draft, occurred_at_micros); - let event_key = draft.event_key.to_string(); - let scope_kind = draft.scope_kind; - let scope_id = draft.scope_id; - let metadata_json = draft.metadata.to_string(); + record_tracking_event_input_after_success( + state, + request_context, + build_tracking_event_input(draft), + ) + .await; +} + +async fn record_route_tracking_event_via_outbox_after_success( + state: &AppState, + request_context: &RequestContext, + draft: TrackingEventDraft, +) { + let event = build_tracking_event_input(draft); + let event_key = event.event_key.clone(); + let scope_kind = event.scope_kind; + let scope_id = 
event.scope_id.clone(); + + if let Some(outbox) = state.tracking_outbox() { + match outbox.enqueue(event.clone()).await { + Ok(crate::tracking_outbox::TrackingOutboxEnqueueOutcome::Enqueued) => { + tracing::debug!( + request_id = request_context.request_id(), + operation = request_context.operation(), + event_key = %event_key, + scope_kind = %scope_kind.as_str(), + scope_id = %scope_id, + "后端 route 埋点已写入本机 outbox" + ); + return; + } + Ok(crate::tracking_outbox::TrackingOutboxEnqueueOutcome::Dropped { reason }) => { + tracing::warn!( + request_id = request_context.request_id(), + operation = request_context.operation(), + event_key = %event_key, + scope_kind = %scope_kind.as_str(), + scope_id = %scope_id, + reason, + "后端 route 埋点因 outbox 保护阈值被丢弃,主业务流程继续" + ); + return; + } + Err(error) => { + tracing::warn!( + request_id = request_context.request_id(), + operation = request_context.operation(), + event_key = %event_key, + scope_kind = %scope_kind.as_str(), + scope_id = %scope_id, + error = %error, + "后端 route 埋点写入 outbox 失败,回退同步直写 SpacetimeDB" + ); + } + } + } + + record_tracking_event_input_after_success(state, request_context, event).await; +} + +async fn record_tracking_event_input_after_success( + state: &AppState, + request_context: &RequestContext, + event: module_runtime::RuntimeTrackingEventInput, +) { + let event_key = event.event_key.clone(); + let log_scope_kind = event.scope_kind; + let scope_id = event.scope_id.clone(); + + let module_runtime::RuntimeTrackingEventInput { + event_id, + event_key: procedure_event_key, + scope_kind: procedure_scope_kind, + scope_id: procedure_scope_id, + user_id, + owner_user_id, + profile_id, + module_key, + metadata_json, + occurred_at_micros, + } = event; match state .spacetime_client() .record_tracking_event( event_id, - event_key.clone(), - scope_kind, - scope_id.clone(), - draft.user_id, - draft.owner_user_id, - draft.profile_id, - draft.module_key.map(str::to_string), + procedure_event_key, + procedure_scope_kind, + 
procedure_scope_id, + user_id, + owner_user_id, + profile_id, + module_key, metadata_json, - occurred_at_micros as i64, + occurred_at_micros, ) .await { @@ -551,7 +626,7 @@ pub async fn record_tracking_event_after_success( request_id = request_context.request_id(), operation = request_context.operation(), event_key = %event_key, - scope_kind = %scope_kind.as_str(), + scope_kind = %log_scope_kind.as_str(), scope_id = %scope_id, "后端埋点已记录" ), @@ -559,7 +634,7 @@ pub async fn record_tracking_event_after_success( request_id = request_context.request_id(), operation = request_context.operation(), event_key = %event_key, - scope_kind = %scope_kind.as_str(), + scope_kind = %log_scope_kind.as_str(), scope_id = %scope_id, error = %error, "后端埋点记录失败,主业务流程继续" @@ -567,6 +642,26 @@ pub async fn record_tracking_event_after_success( } } +fn build_tracking_event_input( + draft: TrackingEventDraft, +) -> module_runtime::RuntimeTrackingEventInput { + let occurred_at_micros = OffsetDateTime::now_utc().unix_timestamp_nanos() / 1_000; + let event_id = build_tracking_event_id(&draft, occurred_at_micros); + + module_runtime::RuntimeTrackingEventInput { + event_id, + event_key: draft.event_key.to_string(), + scope_kind: draft.scope_kind, + scope_id: draft.scope_id, + user_id: draft.user_id, + owner_user_id: draft.owner_user_id, + profile_id: draft.profile_id, + module_key: draft.module_key.map(str::to_string), + metadata_json: draft.metadata.to_string(), + occurred_at_micros: occurred_at_micros as i64, + } +} + fn build_tracking_event_id(draft: &TrackingEventDraft, occurred_at_micros: i128) -> String { if draft.event_key == "daily_login" && draft.scope_kind == RuntimeTrackingScopeKind::User diff --git a/server-rs/crates/api-server/src/tracking_outbox.rs b/server-rs/crates/api-server/src/tracking_outbox.rs new file mode 100644 index 00000000..19a61ed6 --- /dev/null +++ b/server-rs/crates/api-server/src/tracking_outbox.rs @@ -0,0 +1,594 @@ +use std::{ + fmt, + path::{Path, PathBuf}, + 
sync::Arc, + time::{Duration, Instant, SystemTime, UNIX_EPOCH}, +}; + +use module_runtime::RuntimeTrackingEventInput; +use serde::{Deserialize, Serialize}; +use spacetime_client::{SpacetimeClient, SpacetimeClientError}; +use tokio::{ + fs::{self, File, OpenOptions}, + io::{AsyncBufReadExt, AsyncWriteExt, BufReader}, + sync::Mutex, + time::sleep, +}; +use tracing::{debug, warn}; + +use crate::config::AppConfig; + +const ACTIVE_FILE_NAME: &str = "active.ndjson"; +const SEALED_FILE_PREFIX: &str = "sealed-"; +const CORRUPT_FILE_PREFIX: &str = "corrupt-"; +const SEALED_FILE_EXTENSION: &str = ".ndjson"; + +#[derive(Clone)] +pub struct TrackingOutbox { + dir: PathBuf, + batch_size: usize, + flush_interval: Duration, + max_bytes: u64, + spacetime_client: SpacetimeClient, + inner: Arc>, +} + +struct TrackingOutboxInner { + initialized: bool, + active_file: Option, + active_count: usize, + active_bytes: u64, + total_bytes: u64, + last_sealed_at: Instant, +} + +#[derive(Debug)] +pub enum TrackingOutboxEnqueueOutcome { + Enqueued, + Dropped { reason: &'static str }, +} + +#[derive(Debug)] +pub enum TrackingOutboxError { + Io(std::io::Error), + Json(serde_json::Error), + Spacetime(SpacetimeClientError), +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct TrackingOutboxRecord { + event: RuntimeTrackingEventInput, +} + +impl TrackingOutbox { + pub fn from_config(config: &AppConfig, spacetime_client: SpacetimeClient) -> Option> { + if !config.tracking_outbox_enabled { + return None; + } + + let total_bytes = directory_size_if_exists(&config.tracking_outbox_dir).unwrap_or(0); + let outbox = Self { + dir: config.tracking_outbox_dir.clone(), + batch_size: config.tracking_outbox_batch_size.max(1), + flush_interval: config.tracking_outbox_flush_interval, + max_bytes: config.tracking_outbox_max_bytes, + spacetime_client, + inner: Arc::new(Mutex::new(TrackingOutboxInner { + initialized: false, + active_file: None, + active_count: 0, + active_bytes: 0, + total_bytes, + 
last_sealed_at: Instant::now(), + })), + }; + crate::telemetry::update_tracking_outbox_pending_bytes(total_bytes); + Some(Arc::new(outbox)) + } + + pub async fn enqueue( + &self, + event: RuntimeTrackingEventInput, + ) -> Result { + let record = TrackingOutboxRecord { event }; + let mut line = serde_json::to_vec(&record)?; + line.push(b'\n'); + let line_bytes = line.len().min(u64::MAX as usize) as u64; + + let mut inner = self.inner.lock().await; + self.ensure_initialized_locked(&mut inner).await?; + + if inner.total_bytes.saturating_add(line_bytes) > self.max_bytes { + crate::telemetry::record_tracking_outbox_dropped("max_bytes"); + return Ok(TrackingOutboxEnqueueOutcome::Dropped { + reason: "max_bytes", + }); + } + + let active_path = self.active_path(); + if inner.active_file.is_none() { + inner.active_file = Some( + OpenOptions::new() + .create(true) + .append(true) + .open(&active_path) + .await?, + ); + } + + let file = inner + .active_file + .as_mut() + .expect("active file should be open before append"); + file.write_all(&line).await?; + inner.active_count = inner.active_count.saturating_add(1); + inner.active_bytes = inner.active_bytes.saturating_add(line_bytes); + inner.total_bytes = inner.total_bytes.saturating_add(line_bytes); + crate::telemetry::record_tracking_outbox_enqueued(); + crate::telemetry::update_tracking_outbox_pending_bytes(inner.total_bytes); + + if inner.active_count >= self.batch_size { + self.seal_active_locked(&mut inner, "batch_size").await?; + } + + Ok(TrackingOutboxEnqueueOutcome::Enqueued) + } + + pub fn spawn_worker(self: Arc) { + tokio::spawn(async move { + loop { + sleep(self.flush_interval).await; + if let Err(error) = self.seal_active_if_due().await { + warn!(error = %error, "tracking outbox 定时封存 active 文件失败"); + } + if let Err(error) = self.flush_sealed_files_once().await { + warn!(error = %error, "tracking outbox 批量写入 SpacetimeDB 失败,将保留 sealed 文件等待重试"); + } + } + }); + } + + async fn seal_active_if_due(&self) -> Result<(), 
TrackingOutboxError> { + let mut inner = self.inner.lock().await; + self.ensure_initialized_locked(&mut inner).await?; + if inner.active_count == 0 || inner.last_sealed_at.elapsed() < self.flush_interval { + return Ok(()); + } + + self.seal_active_locked(&mut inner, "flush_interval").await + } + + async fn flush_sealed_files_once(&self) -> Result<(), TrackingOutboxError> { + self.ensure_initialized().await?; + + let sealed_files = self.list_sealed_files().await?; + crate::telemetry::update_tracking_outbox_pending_files(sealed_files.len()); + for path in sealed_files { + let started_at = Instant::now(); + let metadata = fs::metadata(&path).await?; + let file_bytes = metadata.len(); + let events = match read_outbox_events(&path).await { + Ok(events) => events, + Err(error) if error.is_data_corruption() => { + let corrupt_path = self.corrupt_path_for(&path); + fs::rename(&path, &corrupt_path).await?; + self.subtract_total_bytes(file_bytes).await; + crate::telemetry::record_tracking_outbox_corrupt_file(); + warn!( + error = %error, + source = %path.display(), + target = %corrupt_path.display(), + "tracking outbox sealed 文件含无法解析的记录,已隔离并继续处理后续文件" + ); + continue; + } + Err(error) => return Err(error), + }; + if events.is_empty() { + fs::remove_file(&path).await?; + self.subtract_total_bytes(file_bytes).await; + continue; + } + + match self.spacetime_client.record_tracking_events(events).await { + Ok(accepted_count) => { + fs::remove_file(&path).await?; + self.subtract_total_bytes(file_bytes).await; + crate::telemetry::record_tracking_outbox_flush( + started_at.elapsed(), + accepted_count, + file_bytes, + false, + ); + debug!( + accepted_count, + file_bytes, + path = %path.display(), + "tracking outbox sealed 文件已批量入库并删除" + ); + } + Err(error) => { + crate::telemetry::record_tracking_outbox_flush( + started_at.elapsed(), + 0, + file_bytes, + true, + ); + return Err(TrackingOutboxError::Spacetime(error)); + } + } + } + + Ok(()) + } + + async fn ensure_initialized(&self) -> 
Result<(), TrackingOutboxError> { + let mut inner = self.inner.lock().await; + self.ensure_initialized_locked(&mut inner).await + } + + async fn ensure_initialized_locked( + &self, + inner: &mut TrackingOutboxInner, + ) -> Result<(), TrackingOutboxError> { + if inner.initialized { + return Ok(()); + } + + fs::create_dir_all(&self.dir).await?; + self.seal_existing_active_file().await?; + inner.total_bytes = directory_size(&self.dir).await?; + inner.initialized = true; + inner.last_sealed_at = Instant::now(); + crate::telemetry::update_tracking_outbox_pending_bytes(inner.total_bytes); + Ok(()) + } + + async fn seal_active_locked( + &self, + inner: &mut TrackingOutboxInner, + reason: &'static str, + ) -> Result<(), TrackingOutboxError> { + if inner.active_count == 0 && inner.active_bytes == 0 { + return Ok(()); + } + + if let Some(mut file) = inner.active_file.take() { + file.flush().await?; + file.sync_data().await?; + drop(file); + } + + let active_path = self.active_path(); + match fs::metadata(&active_path).await { + Ok(metadata) if metadata.len() > 0 => { + let sealed_path = self.next_sealed_path(); + fs::rename(&active_path, &sealed_path).await?; + crate::telemetry::record_tracking_outbox_sealed(reason); + debug!( + reason, + event_count = inner.active_count, + file_bytes = metadata.len(), + path = %sealed_path.display(), + "tracking outbox active 文件已封存" + ); + } + Ok(_) => { + let _ = fs::remove_file(&active_path).await; + } + Err(error) if error.kind() == std::io::ErrorKind::NotFound => {} + Err(error) => return Err(error.into()), + } + + inner.active_count = 0; + inner.active_bytes = 0; + inner.last_sealed_at = Instant::now(); + Ok(()) + } + + async fn seal_existing_active_file(&self) -> Result<(), TrackingOutboxError> { + let active_path = self.active_path(); + match fs::metadata(&active_path).await { + Ok(metadata) if metadata.len() > 0 => { + fs::rename(&active_path, self.next_sealed_path()).await?; + 
crate::telemetry::record_tracking_outbox_sealed("startup"); + } + Ok(_) => { + let _ = fs::remove_file(&active_path).await; + } + Err(error) if error.kind() == std::io::ErrorKind::NotFound => {} + Err(error) => return Err(error.into()), + } + Ok(()) + } + + async fn list_sealed_files(&self) -> Result, TrackingOutboxError> { + let mut entries = fs::read_dir(&self.dir).await?; + let mut files = Vec::new(); + while let Some(entry) = entries.next_entry().await? { + let path = entry.path(); + let Some(name) = path.file_name().and_then(|value| value.to_str()) else { + continue; + }; + if name.starts_with(SEALED_FILE_PREFIX) && name.ends_with(SEALED_FILE_EXTENSION) { + files.push(path); + } + } + files.sort(); + Ok(files) + } + + async fn subtract_total_bytes(&self, bytes: u64) { + let mut inner = self.inner.lock().await; + inner.total_bytes = inner.total_bytes.saturating_sub(bytes); + crate::telemetry::update_tracking_outbox_pending_bytes(inner.total_bytes); + } + + fn active_path(&self) -> PathBuf { + self.dir.join(ACTIVE_FILE_NAME) + } + + fn next_sealed_path(&self) -> PathBuf { + self.dir.join(format!( + "{SEALED_FILE_PREFIX}{}-{uuid}{SEALED_FILE_EXTENSION}", + current_unix_micros(), + uuid = uuid::Uuid::new_v4() + )) + } + + fn corrupt_path_for(&self, path: &Path) -> PathBuf { + let name = path + .file_name() + .and_then(|value| value.to_str()) + .unwrap_or("unknown.ndjson"); + self.dir.join(format!( + "{CORRUPT_FILE_PREFIX}{}-{uuid}-{name}", + current_unix_micros(), + uuid = uuid::Uuid::new_v4() + )) + } +} + +impl fmt::Debug for TrackingOutbox { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("TrackingOutbox") + .field("dir", &self.dir) + .field("batch_size", &self.batch_size) + .field("flush_interval", &self.flush_interval) + .field("max_bytes", &self.max_bytes) + .finish() + } +} + +impl fmt::Display for TrackingOutboxError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Io(error) => write!(f, 
"{error}"), + Self::Json(error) => write!(f, "{error}"), + Self::Spacetime(error) => write!(f, "{error}"), + } + } +} + +impl From for TrackingOutboxError { + fn from(value: std::io::Error) -> Self { + Self::Io(value) + } +} + +impl From for TrackingOutboxError { + fn from(value: serde_json::Error) -> Self { + Self::Json(value) + } +} + +impl TrackingOutboxError { + fn is_data_corruption(&self) -> bool { + matches!(self, Self::Json(_)) + } +} + +async fn read_outbox_events( + path: &Path, +) -> Result, TrackingOutboxError> { + let file = File::open(path).await?; + let mut lines = BufReader::new(file).lines(); + let mut events = Vec::new(); + while let Some(line) = lines.next_line().await? { + if line.trim().is_empty() { + continue; + } + let record = serde_json::from_str::(&line)?; + events.push(record.event); + } + Ok(events) +} + +async fn directory_size(path: &Path) -> Result { + let mut total = 0u64; + let mut entries = fs::read_dir(path).await?; + while let Some(entry) = entries.next_entry().await? { + if !is_pending_outbox_file_name(&entry.file_name()) { + continue; + } + let metadata = entry.metadata().await?; + if metadata.is_file() { + total = total.saturating_add(metadata.len()); + } + } + Ok(total) +} + +fn directory_size_if_exists(path: &Path) -> Result { + if !path.is_dir() { + return Ok(0); + } + + let mut total = 0u64; + for entry in std::fs::read_dir(path)? 
{ + let entry = entry?; + if !is_pending_outbox_file_name(&entry.file_name()) { + continue; + } + let metadata = entry.metadata()?; + if metadata.is_file() { + total = total.saturating_add(metadata.len()); + } + } + Ok(total) +} + +fn current_unix_micros() -> u128 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_micros() +} + +fn is_pending_outbox_file_name(name: &std::ffi::OsStr) -> bool { + name.to_str().is_some_and(|value| { + value == ACTIVE_FILE_NAME + || (value.starts_with(SEALED_FILE_PREFIX) && value.ends_with(SEALED_FILE_EXTENSION)) + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sample_event(event_id: &str) -> RuntimeTrackingEventInput { + RuntimeTrackingEventInput { + event_id: event_id.to_string(), + event_key: "puzzle_route_success".to_string(), + scope_kind: module_runtime::RuntimeTrackingScopeKind::Site, + scope_id: "site".to_string(), + user_id: None, + owner_user_id: None, + profile_id: None, + module_key: Some("puzzle".to_string()), + metadata_json: "{}".to_string(), + occurred_at_micros: 1_713_680_000_000_000, + } + } + + fn test_dir(name: &str) -> PathBuf { + let dir = std::env::temp_dir().join(format!( + "genarrative-tracking-outbox-{name}-{}", + current_unix_micros() + )); + let _ = std::fs::remove_dir_all(&dir); + dir + } + + fn test_outbox(dir: PathBuf, batch_size: usize, max_bytes: u64) -> Arc { + let config = AppConfig { + tracking_outbox_dir: dir, + tracking_outbox_batch_size: batch_size, + tracking_outbox_max_bytes: max_bytes, + tracking_outbox_flush_interval: Duration::from_secs(60), + ..AppConfig::default() + }; + TrackingOutbox::from_config( + &config, + SpacetimeClient::new(spacetime_client::SpacetimeClientConfig { + server_url: "http://127.0.0.1:1".to_string(), + database: "missing".to_string(), + token: None, + pool_size: 1, + procedure_timeout: Duration::from_millis(10), + }), + ) + .expect("outbox should be enabled") + } + + #[tokio::test] + async fn 
enqueue_seals_active_file_when_batch_size_reached() { + let dir = test_dir("batch"); + let outbox = test_outbox(dir.clone(), 2, 1024 * 1024); + + outbox.enqueue(sample_event("event-1")).await.unwrap(); + outbox.enqueue(sample_event("event-2")).await.unwrap(); + + assert!(!dir.join(ACTIVE_FILE_NAME).exists()); + let sealed_count = std::fs::read_dir(&dir) + .unwrap() + .filter_map(Result::ok) + .filter(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with(SEALED_FILE_PREFIX)) + }) + .count(); + assert_eq!(sealed_count, 1); + + let _ = std::fs::remove_dir_all(dir); + } + + #[tokio::test] + async fn enqueue_drops_when_outbox_exceeds_max_bytes() { + let dir = test_dir("max-bytes"); + let outbox = test_outbox(dir.clone(), 500, 1); + + let outcome = outbox.enqueue(sample_event("event-1")).await.unwrap(); + + assert!(matches!( + outcome, + TrackingOutboxEnqueueOutcome::Dropped { + reason: "max_bytes" + } + )); + assert!(!dir.join(ACTIVE_FILE_NAME).exists()); + + let _ = std::fs::remove_dir_all(dir); + } + + #[tokio::test] + async fn flush_quarantines_corrupt_sealed_file() { + let dir = test_dir("corrupt"); + std::fs::create_dir_all(&dir).unwrap(); + let sealed_path = dir.join(format!("{SEALED_FILE_PREFIX}bad{SEALED_FILE_EXTENSION}")); + std::fs::write(&sealed_path, b"{not-json}\n").unwrap(); + let outbox = test_outbox(dir.clone(), 500, 1024 * 1024); + + outbox.flush_sealed_files_once().await.unwrap(); + + assert!(!sealed_path.exists()); + let corrupt_count = std::fs::read_dir(&dir) + .unwrap() + .filter_map(Result::ok) + .filter(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with(CORRUPT_FILE_PREFIX)) + }) + .count(); + assert_eq!(corrupt_count, 1); + + let _ = std::fs::remove_dir_all(dir); + } + + #[test] + fn directory_size_excludes_quarantined_corrupt_files() { + let dir = test_dir("directory-size"); + std::fs::create_dir_all(&dir).unwrap(); + std::fs::write(dir.join(ACTIVE_FILE_NAME), b"active").unwrap(); + 
std::fs::write( + dir.join(format!("{SEALED_FILE_PREFIX}one{SEALED_FILE_EXTENSION}")), + b"sealed", + ) + .unwrap(); + std::fs::write( + dir.join(format!("{CORRUPT_FILE_PREFIX}one{SEALED_FILE_EXTENSION}")), + b"corrupt", + ) + .unwrap(); + + let total = directory_size_if_exists(&dir).unwrap(); + + assert_eq!(total, 12); + + let _ = std::fs::remove_dir_all(dir); + } +} diff --git a/server-rs/crates/module-runtime/src/domain.rs b/server-rs/crates/module-runtime/src/domain.rs index a10f0cc2..4d1da0bc 100644 --- a/server-rs/crates/module-runtime/src/domain.rs +++ b/server-rs/crates/module-runtime/src/domain.rs @@ -706,6 +706,14 @@ pub struct RuntimeTrackingEventProcedureResult { pub error_message: Option, } +#[cfg_attr(feature = "spacetime-types", derive(SpacetimeType))] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct RuntimeTrackingEventBatchProcedureResult { + pub ok: bool, + pub accepted_count: u32, + pub error_message: Option, +} + #[cfg_attr(feature = "spacetime-types", derive(SpacetimeType))] #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct RuntimeProfileTaskConfigSnapshot { diff --git a/server-rs/crates/spacetime-client/src/mapper.rs b/server-rs/crates/spacetime-client/src/mapper.rs index 6ee3b1ca..79b17304 100644 --- a/server-rs/crates/spacetime-client/src/mapper.rs +++ b/server-rs/crates/spacetime-client/src/mapper.rs @@ -246,6 +246,23 @@ impl From for RuntimeProfileTa } } +impl From for RuntimeTrackingEventInput { + fn from(input: module_runtime::RuntimeTrackingEventInput) -> Self { + Self { + event_id: input.event_id, + event_key: input.event_key, + scope_kind: map_runtime_tracking_scope_kind(input.scope_kind), + scope_id: input.scope_id, + user_id: input.user_id, + owner_user_id: input.owner_user_id, + profile_id: input.profile_id, + module_key: input.module_key, + metadata_json: input.metadata_json, + occurred_at_micros: input.occurred_at_micros, + } + } +} + impl From for AnalyticsMetricQueryInput { fn 
from(input: module_runtime::AnalyticsMetricQueryInput) -> Self { Self { @@ -1211,6 +1228,16 @@ pub(crate) fn map_runtime_tracking_event_procedure_result( Ok(()) } +pub(crate) fn map_runtime_tracking_event_batch_procedure_result( + result: RuntimeTrackingEventBatchProcedureResult, +) -> Result { + if !result.ok { + return Err(SpacetimeClientError::procedure_failed(result.error_message)); + } + + Ok(result.accepted_count) +} + pub(crate) fn map_runtime_profile_task_center_procedure_result( result: RuntimeProfileTaskCenterProcedureResult, ) -> Result { diff --git a/server-rs/crates/spacetime-client/src/module_bindings/mod.rs b/server-rs/crates/spacetime-client/src/module_bindings/mod.rs index 984ccd36..6a53dc72 100644 --- a/server-rs/crates/spacetime-client/src/module_bindings/mod.rs +++ b/server-rs/crates/spacetime-client/src/module_bindings/mod.rs @@ -624,6 +624,7 @@ pub mod record_custom_world_profile_play_procedure; pub mod record_daily_login_tracking_event_and_return_procedure; pub mod record_puzzle_work_like_procedure; pub mod record_tracking_event_and_return_procedure; +pub mod record_tracking_events_and_return_procedure; pub mod record_visual_novel_runtime_event_procedure; pub mod redeem_profile_referral_invite_code_procedure; pub mod redeem_profile_reward_code_procedure; @@ -764,6 +765,7 @@ pub mod runtime_snapshot_row_type; pub mod runtime_snapshot_table; pub mod runtime_snapshot_type; pub mod runtime_snapshot_upsert_input_type; +pub mod runtime_tracking_event_batch_procedure_result_type; pub mod runtime_tracking_event_input_type; pub mod runtime_tracking_event_procedure_result_type; pub mod runtime_tracking_scope_kind_type; @@ -1548,6 +1550,7 @@ pub use record_custom_world_profile_play_procedure::record_custom_world_profile_ pub use record_daily_login_tracking_event_and_return_procedure::record_daily_login_tracking_event_and_return; pub use record_puzzle_work_like_procedure::record_puzzle_work_like; pub use 
record_tracking_event_and_return_procedure::record_tracking_event_and_return; +pub use record_tracking_events_and_return_procedure::record_tracking_events_and_return; pub use record_visual_novel_runtime_event_procedure::record_visual_novel_runtime_event; pub use redeem_profile_referral_invite_code_procedure::redeem_profile_referral_invite_code; pub use redeem_profile_reward_code_procedure::redeem_profile_reward_code; @@ -1688,6 +1691,7 @@ pub use runtime_snapshot_row_type::RuntimeSnapshotRow; pub use runtime_snapshot_table::*; pub use runtime_snapshot_type::RuntimeSnapshot; pub use runtime_snapshot_upsert_input_type::RuntimeSnapshotUpsertInput; +pub use runtime_tracking_event_batch_procedure_result_type::RuntimeTrackingEventBatchProcedureResult; pub use runtime_tracking_event_input_type::RuntimeTrackingEventInput; pub use runtime_tracking_event_procedure_result_type::RuntimeTrackingEventProcedureResult; pub use runtime_tracking_scope_kind_type::RuntimeTrackingScopeKind; diff --git a/server-rs/crates/spacetime-client/src/module_bindings/record_tracking_events_and_return_procedure.rs b/server-rs/crates/spacetime-client/src/module_bindings/record_tracking_events_and_return_procedure.rs new file mode 100644 index 00000000..428e378f --- /dev/null +++ b/server-rs/crates/spacetime-client/src/module_bindings/record_tracking_events_and_return_procedure.rs @@ -0,0 +1,59 @@ +// THIS FILE IS AUTOMATICALLY GENERATED BY SPACETIMEDB. EDITS TO THIS FILE +// WILL NOT BE SAVED. MODIFY TABLES IN YOUR MODULE SOURCE CODE INSTEAD. 
+ +#![allow(unused, clippy::all)] +use spacetimedb_sdk::__codegen::{self as __sdk, __lib, __sats, __ws}; + +use super::runtime_tracking_event_batch_procedure_result_type::RuntimeTrackingEventBatchProcedureResult; +use super::runtime_tracking_event_input_type::RuntimeTrackingEventInput; + +#[derive(__lib::ser::Serialize, __lib::de::Deserialize, Clone, PartialEq, Debug)] +#[sats(crate = __lib)] +struct RecordTrackingEventsAndReturnArgs { + pub inputs: Vec, +} + +impl __sdk::InModule for RecordTrackingEventsAndReturnArgs { + type Module = super::RemoteModule; +} + +#[allow(non_camel_case_types)] +/// Extension trait for access to the procedure `record_tracking_events_and_return`. +/// +/// Implemented for [`super::RemoteProcedures`]. +pub trait record_tracking_events_and_return { + fn record_tracking_events_and_return(&self, inputs: Vec) { + self.record_tracking_events_and_return_then(inputs, |_, _| {}); + } + + fn record_tracking_events_and_return_then( + &self, + inputs: Vec, + + __callback: impl FnOnce( + &super::ProcedureEventContext, + Result, + ) + Send + + 'static, + ); +} + +impl record_tracking_events_and_return for super::RemoteProcedures { + fn record_tracking_events_and_return_then( + &self, + inputs: Vec, + + __callback: impl FnOnce( + &super::ProcedureEventContext, + Result, + ) + Send + + 'static, + ) { + self.imp + .invoke_procedure_with_callback::<_, RuntimeTrackingEventBatchProcedureResult>( + "record_tracking_events_and_return", + RecordTrackingEventsAndReturnArgs { inputs }, + __callback, + ); + } +} diff --git a/server-rs/crates/spacetime-client/src/module_bindings/runtime_tracking_event_batch_procedure_result_type.rs b/server-rs/crates/spacetime-client/src/module_bindings/runtime_tracking_event_batch_procedure_result_type.rs new file mode 100644 index 00000000..1d4d72d2 --- /dev/null +++ b/server-rs/crates/spacetime-client/src/module_bindings/runtime_tracking_event_batch_procedure_result_type.rs @@ -0,0 +1,17 @@ +// THIS FILE IS AUTOMATICALLY 
GENERATED BY SPACETIMEDB. EDITS TO THIS FILE +// WILL NOT BE SAVED. MODIFY TABLES IN YOUR MODULE SOURCE CODE INSTEAD. + +#![allow(unused, clippy::all)] +use spacetimedb_sdk::__codegen::{self as __sdk, __lib, __sats, __ws}; + +#[derive(__lib::ser::Serialize, __lib::de::Deserialize, Clone, PartialEq, Debug)] +#[sats(crate = __lib)] +pub struct RuntimeTrackingEventBatchProcedureResult { + pub ok: bool, + pub accepted_count: u32, + pub error_message: Option, +} + +impl __sdk::InModule for RuntimeTrackingEventBatchProcedureResult { + type Module = super::RemoteModule; +} diff --git a/server-rs/crates/spacetime-client/src/runtime.rs b/server-rs/crates/spacetime-client/src/runtime.rs index baac3495..1b9429a7 100644 --- a/server-rs/crates/spacetime-client/src/runtime.rs +++ b/server-rs/crates/spacetime-client/src/runtime.rs @@ -585,6 +585,35 @@ impl SpacetimeClient { .await } + pub async fn record_tracking_events( + &self, + events: Vec, + ) -> Result { + if events.is_empty() { + return Ok(0); + } + + let procedure_inputs = events + .into_iter() + .map(crate::module_bindings::RuntimeTrackingEventInput::from) + .collect::>(); + + self.call_after_connect( + "record_tracking_events_and_return", + move |connection, sender| { + connection + .procedures() + .record_tracking_events_and_return_then(procedure_inputs, move |_, result| { + let mapped = result + .map_err(SpacetimeClientError::from_sdk_error) + .and_then(map_runtime_tracking_event_batch_procedure_result); + send_once(&sender, mapped); + }); + }, + ) + .await + } + pub async fn get_profile_task_center( &self, user_id: String, diff --git a/server-rs/crates/spacetime-module/src/runtime/profile.rs b/server-rs/crates/spacetime-module/src/runtime/profile.rs index 10f3c59e..d1bbb3c3 100644 --- a/server-rs/crates/spacetime-module/src/runtime/profile.rs +++ b/server-rs/crates/spacetime-module/src/runtime/profile.rs @@ -558,6 +558,33 @@ pub fn record_tracking_event_and_return( } } +// 高频 route tracking 由 api-server 本机 outbox 
批量写入,减少公开列表热路径上的 procedure 调用次数。 +#[spacetimedb::procedure] +pub fn record_tracking_events_and_return( + ctx: &mut ProcedureContext, + inputs: Vec, +) -> RuntimeTrackingEventBatchProcedureResult { + match ctx.try_with_tx(|tx| { + let mut accepted_count = 0u32; + for input in &inputs { + record_tracking_event(tx, input.clone())?; + accepted_count = accepted_count.saturating_add(1); + } + Ok(accepted_count) + }) { + Ok(accepted_count) => RuntimeTrackingEventBatchProcedureResult { + ok: true, + accepted_count, + error_message: None, + }, + Err(message) => RuntimeTrackingEventBatchProcedureResult { + ok: false, + accepted_count: 0, + error_message: Some(message), + }, + } +} + // 登录成功埋点由认证链路主动调用;任务中心只负责读取和刷新任务进度。 #[spacetimedb::procedure] pub fn record_daily_login_tracking_event_and_return( @@ -1539,6 +1566,19 @@ mod tests { assert!(!should_skip_existing_tracking_event_id(false)); } + #[test] + fn tracking_batch_result_reports_accepted_count() { + let result = RuntimeTrackingEventBatchProcedureResult { + ok: true, + accepted_count: 2, + error_message: None, + }; + + assert!(result.ok); + assert_eq!(result.accepted_count, 2); + assert!(result.error_message.is_none()); + } + #[test] fn recent_public_work_play_counts_group_requested_profiles_in_window() { let now_micros = PUBLIC_WORK_PLAY_DAY_MICROS * 10; From fa43410c8c50064fb1ddcb279196a21c63ca441e Mon Sep 17 00:00:00 2001 From: kdletters <61648117+kdletters@users.noreply.github.com> Date: Tue, 19 May 2026 05:03:23 +0800 Subject: [PATCH 4/6] perf(deploy): tune gallery load shedding for release --- .hermes/shared-memory/decision-log.md | 8 ++++++++ deploy/container/README.md | 2 +- deploy/container/nginx.conf | 6 +++--- deploy/env/api-server.env.example | 4 ++-- deploy/nginx/genarrative-dev-http.conf | 10 +++++----- deploy/nginx/genarrative.conf | 10 +++++----- docs/【开发运维】本地开发验证与生产运维-2026-05-15.md | 6 +++--- 7 files changed, 27 insertions(+), 19 deletions(-) diff --git a/.hermes/shared-memory/decision-log.md 
b/.hermes/shared-memory/decision-log.md index 0c062b55..257f8405 100644 --- a/.hermes/shared-memory/decision-log.md +++ b/.hermes/shared-memory/decision-log.md @@ -35,6 +35,14 @@ - 验证方式:Jenkins 构建机可完成工具包准备,release 部署 agent 只消费工作区文件;目标机不再依赖 GitHub 外网下载。 - 关联文档:`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。 +## 2026-05-19 公开 gallery 入口发布限流以快拒绝保护后端 + +- 背景:容器 2C / 2G 压测中,公开作品列表在约 5000 HTTP req/s 目标下可以保持 200 请求低延迟,但 SpacetimeDB 内存会随 api-server 重连和高压请求累积到容器上限附近。 +- 决策:发布配置采用公开 gallery list 专用入口限流:Nginx `genarrative_gallery_rps rate=5000r/s`、`burst=4096`、gallery list `limit_conn=320`;api-server 对应 `GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=320`,公开详情维持更低的 `GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS=64`。超过容量时接受明确 `429`,不继续扩大入口并发。 +- 影响范围:`deploy/nginx/` 发布模板、`deploy/env/api-server.env.example`、`deploy/container/` 隔离压测模板和生产运维文档。 +- 验证方式:容器连续 10 轮不重启 SpacetimeDB 压测,`PEAK_RPS=2500` 等价约 5000 HTTP req/s,平均实际吞吐约 `4219 HTTP req/s`,总计 `0` 个 5xx,200 请求平均 `p95=123ms`、`p99=234ms`;同时观察 SpacetimeDB 内存高水位,后续优化先处理连接 / 订阅 / tracking 下游状态。 +- 关联文档:`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`、`deploy/container/README.md`。 + ## 2026-05-16 公开作品列表短期由 BFF 订阅读模型缓存 - 背景:作品列表压测和实时性讨论中,曾考虑让浏览器前端直接订阅公开作品列表,减少 HTTP 拉取和 BFF 压力。 diff --git a/deploy/container/README.md b/deploy/container/README.md index 31ce88a3..3fa60fdf 100644 --- a/deploy/container/README.md +++ b/deploy/container/README.md @@ -132,7 +132,7 @@ spacetime publish genarrative-loadtest --server http://127.0.0.1:13101 --module- 如果要压 1000 HTTP req/s,把 `PEAK_RPS` 调到 `500`;如果要压 5000 HTTP req/s,把 `PEAK_RPS` 调到 `2500`,并同时提高 `PREALLOCATED_VUS` / `MAX_VUS`,观察是否先被带宽、Nginx `limit_conn` / `limit_req` 或 api-server 分组背压限制。当前容器 Nginx 对公开 gallery list 使用 `genarrative_gallery_rps`,公开详情和普通 API 使用 `genarrative_api_rps`,后台 API 使用 `genarrative_admin_rps`;api-server 侧对应 `GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS`、`GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS` 和 `GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS`。 -2026-05-19 的 2C / 2G 容器压测结论:公开 gallery list 
的 `limit_conn=320` 与 `GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=320` 是当前较稳的上限。用宿主机 k6 打 `http://127.0.0.1:18080`,`PEAK_RPS=1000` 等价于约 2000 HTTP req/s 的两接口组合压测;320 档无 dropped iterations、无 5xx、无 OOM,约 `151710` 个 200 与 `34310` 个 429,200 请求 `request_time p95=0.292s`。继续抬到 336 / 352 不会有效吃满 api-server CPU,反而让 200 数量减少、p95 升到约 0.31s / 0.32s,SpacetimeDB 内存尾部逼近 `880MiB / 896MiB`,下游内存先到危险区。当前不要为了降低“剩余 CPU”继续抬公开列表并发;下一步应减少成功列表请求后的 SpacetimeDB tracking 写入或优化下游状态,而不是放大入口并发。 +2026-05-19 的 2C / 2G 容器压测结论:公开 gallery list 的 `limit_conn=320`、`limit_req rate=5000r/s burst=4096` 与 `GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=320` 是当前发布口径。用宿主机 k6 打 `http://127.0.0.1:18080`,`PEAK_RPS=2500` 等价于约 5000 HTTP req/s 的两接口组合压测;连续 10 轮不重启 SpacetimeDB 的平均实际吞吐约 `4219 HTTP req/s`,总计 `1,897,357` 个 200、`212,542` 个 429、`0` 个 5xx,200 请求平均 `p95=123ms`、`p99=234ms`。该档会让 SpacetimeDB 内存从约 `366MiB` 累积到约 `885MiB / 896MiB`,下游内存先到危险区。当前不要为了降低“剩余 CPU”继续抬公开列表并发;下一步应减少成功列表请求后的 SpacetimeDB tracking 写入或优化下游连接 / 订阅状态,而不是放大入口并发。 ### 内存采样 diff --git a/deploy/container/nginx.conf b/deploy/container/nginx.conf index d6f19c9c..2799af16 100644 --- a/deploy/container/nginx.conf +++ b/deploy/container/nginx.conf @@ -21,7 +21,7 @@ http { } limit_conn_zone $binary_remote_addr zone=genarrative_api_conn:10m; - limit_req_zone $binary_remote_addr zone=genarrative_gallery_rps:10m rate=2400r/s; + limit_req_zone $binary_remote_addr zone=genarrative_gallery_rps:10m rate=5000r/s; limit_req_zone $binary_remote_addr zone=genarrative_api_rps:10m rate=300r/s; limit_req_zone $binary_remote_addr zone=genarrative_admin_rps:10m rate=30r/s; @@ -91,7 +91,7 @@ http { location = /api/runtime/puzzle/gallery { default_type application/json; limit_conn genarrative_api_conn 320; - limit_req zone=genarrative_gallery_rps burst=256 nodelay; + limit_req zone=genarrative_gallery_rps burst=4096 nodelay; proxy_pass http://genarrative_api; proxy_http_version 1.1; @@ -111,7 +111,7 @@ http { location = /api/runtime/custom-world-gallery { default_type 
application/json; limit_conn genarrative_api_conn 320; - limit_req zone=genarrative_gallery_rps burst=256 nodelay; + limit_req zone=genarrative_gallery_rps burst=4096 nodelay; proxy_pass http://genarrative_api; proxy_http_version 1.1; diff --git a/deploy/env/api-server.env.example b/deploy/env/api-server.env.example index bd265993..d2c835b9 100644 --- a/deploy/env/api-server.env.example +++ b/deploy/env/api-server.env.example @@ -8,8 +8,8 @@ GENARRATIVE_API_LOG=info,tower_http=info GENARRATIVE_API_LISTEN_BACKLOG=1024 GENARRATIVE_API_WORKER_THREADS=4 GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512 -GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=64 -GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS=32 +GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=320 +GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS=64 GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS=16 GENARRATIVE_TRACKING_OUTBOX_ENABLED=true GENARRATIVE_TRACKING_OUTBOX_DIR=/var/lib/genarrative/tracking-outbox diff --git a/deploy/nginx/genarrative-dev-http.conf b/deploy/nginx/genarrative-dev-http.conf index ed5ca13e..63234e30 100644 --- a/deploy/nginx/genarrative-dev-http.conf +++ b/deploy/nginx/genarrative-dev-http.conf @@ -14,7 +14,7 @@ upstream genarrative_api { } limit_conn_zone $binary_remote_addr zone=genarrative_api_conn:10m; -limit_req_zone $binary_remote_addr zone=genarrative_gallery_rps:10m rate=650r/s; +limit_req_zone $binary_remote_addr zone=genarrative_gallery_rps:10m rate=5000r/s; limit_req_zone $binary_remote_addr zone=genarrative_api_rps:10m rate=300r/s; limit_req_zone $binary_remote_addr zone=genarrative_admin_rps:10m rate=30r/s; @@ -93,8 +93,8 @@ server { location = /api/runtime/puzzle/gallery { default_type application/json; - limit_conn genarrative_api_conn 64; - limit_req zone=genarrative_gallery_rps burst=64 nodelay; + limit_conn genarrative_api_conn 320; + limit_req zone=genarrative_gallery_rps burst=4096 nodelay; if ($genarrative_maintenance) { return 503 
'{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; @@ -117,8 +117,8 @@ server { location = /api/runtime/custom-world-gallery { default_type application/json; - limit_conn genarrative_api_conn 64; - limit_req zone=genarrative_gallery_rps burst=64 nodelay; + limit_conn genarrative_api_conn 320; + limit_req zone=genarrative_gallery_rps burst=4096 nodelay; if ($genarrative_maintenance) { return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; diff --git a/deploy/nginx/genarrative.conf b/deploy/nginx/genarrative.conf index 788a1e0d..023a96f8 100644 --- a/deploy/nginx/genarrative.conf +++ b/deploy/nginx/genarrative.conf @@ -12,7 +12,7 @@ upstream genarrative_api { } limit_conn_zone $binary_remote_addr zone=genarrative_api_conn:10m; -limit_req_zone $binary_remote_addr zone=genarrative_gallery_rps:10m rate=650r/s; +limit_req_zone $binary_remote_addr zone=genarrative_gallery_rps:10m rate=5000r/s; limit_req_zone $binary_remote_addr zone=genarrative_api_rps:10m rate=300r/s; limit_req_zone $binary_remote_addr zone=genarrative_admin_rps:10m rate=30r/s; @@ -113,8 +113,8 @@ server { location = /api/runtime/puzzle/gallery { default_type application/json; - limit_conn genarrative_api_conn 64; - limit_req zone=genarrative_gallery_rps burst=64 nodelay; + limit_conn genarrative_api_conn 320; + limit_req zone=genarrative_gallery_rps burst=4096 nodelay; if ($genarrative_maintenance) { return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; @@ -137,8 +137,8 @@ server { location = /api/runtime/custom-world-gallery { default_type application/json; - limit_conn genarrative_api_conn 64; - limit_req zone=genarrative_gallery_rps burst=64 nodelay; + limit_conn genarrative_api_conn 320; + limit_req zone=genarrative_gallery_rps burst=4096 nodelay; if ($genarrative_maintenance) { return 503 '{"ok":false,"error":{"code":"MAINTENANCE","message":"服务维护中"}}'; diff --git a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md 
index 0e1e3ad1..80f523da 100644 --- a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md +++ b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md @@ -154,14 +154,14 @@ Jenkins 按 web / api / Spacetime module / build / deploy / publish 拆分 50 HTTP req/s 首版压测优化口径: - `api-server` 生产模板默认 `GENARRATIVE_API_LISTEN_BACKLOG=1024`、`GENARRATIVE_API_WORKER_THREADS=4`;本地未设置 worker threads 时继续使用 Tokio 默认值。 -- `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512` 开启应用内 HTTP 并发背压;`GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=64`、`GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS=32`、`GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS=16` 分别限制公开列表、公开详情和后台 API 热路径。超过许可时直接返回 `429 Too Many Requests` 和 `Retry-After: 1`,`/healthz` 不受该限制。这些值不是 RPS 限速;如果压测中 429 上升但内存和 p95 收敛,说明背压正在保护进程。直连 `api-server` 的极高 RPS 压测若出现 `connection refused`,通常已经打到 TCP 监听 / accept 层,应同时检查 backlog、Nginx upstream keepalive 和前置限流。 +- `GENARRATIVE_API_MAX_CONCURRENT_REQUESTS=512` 开启应用内 HTTP 并发背压;`GENARRATIVE_API_GALLERY_MAX_CONCURRENT_REQUESTS=320`、`GENARRATIVE_API_DETAIL_MAX_CONCURRENT_REQUESTS=64`、`GENARRATIVE_API_ADMIN_MAX_CONCURRENT_REQUESTS=16` 分别限制公开列表、公开详情和后台 API 热路径。超过许可时直接返回 `429 Too Many Requests` 和 `Retry-After: 1`,`/healthz` 不受该限制。这些值不是 RPS 限速;如果压测中 429 上升但内存和 p95 收敛,说明背压正在保护进程。直连 `api-server` 的极高 RPS 压测若出现 `connection refused`,通常已经打到 TCP 监听 / accept 层,应同时检查 backlog、Nginx upstream keepalive 和前置限流。 - `genarrative-api.service` 设置 `LimitNOFILE=65535`、`TasksMax=2048`;上线后用 `systemctl show genarrative-api.service -p LimitNOFILE -p TasksMax` 和 `cat /proc/$(pidof api-server)/limits` 核对。 - Server provision 不在目标机下载 SpacetimeDB 或 `otelcol-contrib`。Jenkins 的 `Prepare Provision Tools` 阶段在 `linux && genarrative-build` 构建机执行 `scripts/prepare-server-provision-tools.sh`,通过官方 SpacetimeDB 安装入口 `https://install.spacetimedb.com` 和 OpenTelemetry release 包生成 `provision-tools/`,再通过 `stash/unstash` 上传到 release 部署 agent。目标机上的 `scripts/jenkins-server-provision.sh` 只从该工作区工具包安装 `/stdb/spacetime`、`/stdb/bin/current/*` 和 `/usr/local/bin/otelcol-contrib`。 - 
`otelcol-contrib.service` 作为可选系统服务加入 provision,默认监听 `127.0.0.1:4317/4318` 并使用 `deploy/otelcol/genarrative-debug.yaml`。api-server 是否发送 OTLP 仍由 `GENARRATIVE_OTEL_ENABLED` 控制,服务 unit 见 `deploy/systemd/otelcol-contrib.service`。 -- Nginx `/api/` 与 `/admin/api/` 通过 `genarrative_api` upstream 代理到 `127.0.0.1:8082`,upstream keepalive 为 64;`limit_conn` 负责连接 / 并发保护,`limit_req` 负责入口 RPS 快拒绝。当前模板把公开 gallery list 单独放到 `genarrative_gallery_rps`,公开详情和普通 API 放到 `genarrative_api_rps`,后台 API 放到 `genarrative_admin_rps`;压测时看 `/var/log/nginx/genarrative.access.log` 中的 `request_time`、`upstream_connect_time`、`upstream_header_time`、`upstream_response_time`、`upstream_status`、`request_id`。 +- Nginx `/api/` 与 `/admin/api/` 通过 `genarrative_api` upstream 代理到 `127.0.0.1:8082`,upstream keepalive 为 64;`limit_conn` 负责连接 / 并发保护,`limit_req` 负责入口 RPS 快拒绝。当前模板把公开 gallery list 单独放到 `genarrative_gallery_rps`,默认 `rate=5000r/s`、`burst=4096`、`limit_conn=320`;公开详情和普通 API 放到 `genarrative_api_rps`,后台 API 放到 `genarrative_admin_rps`。压测时看 `/var/log/nginx/genarrative.access.log` 中的 `request_time`、`upstream_connect_time`、`upstream_header_time`、`upstream_response_time`、`upstream_status`、`request_id`。 - 作品列表 K6 脚本一次 iteration 默认请求两个公开接口,因此约 50 HTTP req/s 的目标命令使用 `SCENARIO=spike START_RPS=5 PEAK_RPS=25 HOLD=60s END_RPS=5 DETAIL_RATIO=0 npm run loadtest:k6:works`。 - 作品列表短期继续由 `api-server` / BFF 订阅 SpacetimeDB 公开 read model 后读本地 cache,不让浏览器前端直接订阅完整列表;未来如新增 `public_work_gallery_entry` 等专用公开作品列表 read model,前端只可订阅稳定、低基数、公开的专用投影,禁止订阅 `puzzle_work_profile`、`custom_world_profile` 等玩法源表后自行 join、聚合或判断权限。前端直订阅落地前必须先补齐权限、字段契约、排序 / 分页、埋点和 BFF 回退策略。 -- 50 HTTP req/s 验收目标为 `http_req_failed < 1%`、`p95 < 2s`、`dropped_iterations = 0`,同时压测窗口内 Nginx 无新增 502。 +- 50 HTTP req/s 验收目标为 `http_req_failed < 1%`、`p95 < 2s`、`dropped_iterations = 0`,同时压测窗口内 Nginx 无新增 502。2026-05-19 容器 2C / 2G 连续 10 轮不重启 SpacetimeDB 压测:`PEAK_RPS=2500` 等价约 5000 HTTP req/s,平均实际吞吐约 `4219 HTTP req/s`,10 轮总计 `1,897,357` 个 200、`212,542` 个 429、`0` 个 5xx,200 请求平均 
`p95=123ms`、`p99=234ms`;该档会把 SpacetimeDB 容器内存从约 `366MiB` 推到约 `885MiB / 896MiB`,因此当前不要继续抬公开 gallery 入口并发,应优先处理 SpacetimeDB 侧连接 / 订阅 / tracking 写入后的内存高水位。 容器化压测与隔离部署方案单独放在 `deploy/container/`,用于本机或预发模拟 Linux release + Nginx + OTLP Collector 拓扑,不替换当前生产 `systemd + Nginx + Jenkins` 发布路径。当前容器模拟参数按 `genarrative-release` 采样值收口为 2 vCPU / 2 GiB RAM / `nofile=4096` / `worker_connections=768`,并在 compose 里落实到 `spacetimedb cpus=1.0 mem_limit=768m`、`api-server cpus=2.0 mem_limit=1g`、`nginx cpus=0.25 mem_limit=128m`、`otelcol cpus=0.25 mem_limit=128m`、`k6 cpus=0.5 mem_limit=512m`。容器 `api-server` 默认 `GENARRATIVE_API_WORKER_THREADS=4`,只增加 Tokio worker 调度并发,不突破 `api-server cpus=2.0` 的 CPU 配额: From f6292c3ad52ec31aa5268b8ea768c6abd820c6ff Mon Sep 17 00:00:00 2001 From: kdletters <61648117+kdletters@users.noreply.github.com> Date: Tue, 19 May 2026 07:33:44 +0800 Subject: [PATCH 5/6] feat(api-server): default otlp and async tracking outbox --- .hermes/shared-memory/decision-log.md | 16 +++++++ .hermes/shared-memory/pitfalls.md | 16 +++++++ deploy/container/api-server.env.example | 2 +- deploy/env/api-server.env.example | 2 +- ...】server-rs与SpacetimeDB数据契约-2026-05-15.md | 2 +- ...发运维】本地开发验证与生产运维-2026-05-15.md | 12 +++--- .../Jenkinsfile.production-server-provision | 2 +- scripts/loadtest/README.md | 2 +- .../crates/api-server/src/tracking_outbox.rs | 43 +++++++++++++++---- 9 files changed, 78 insertions(+), 19 deletions(-) diff --git a/.hermes/shared-memory/decision-log.md b/.hermes/shared-memory/decision-log.md index 257f8405..8e0f9296 100644 --- a/.hermes/shared-memory/decision-log.md +++ b/.hermes/shared-memory/decision-log.md @@ -16,6 +16,22 @@ --- +## 2026-05-19 tracking outbox 改为 rotate 后异步 flush + +- 背景:普通 route tracking 写入压力上来后,不能让 HTTP 请求线程等待 SpacetimeDB 批量入库。 +- 决策:`api-server` tracking outbox 达到 `BATCH_SIZE` 时立即封存当前 active 文件并切新 active,sealed 文件交给后台 worker 异步 flush;`FLUSH_INTERVAL_MS` 只做长时间未满批的兜底封存;`MAX_BYTES` 只做磁盘保护阈值;成功后删除 sealed,失败保留重试,坏文件隔离为 `corrupt-*`。 +- 
影响范围:`api-server` tracking outbox、埋点文档、压测口径和后续排障记忆。 +- 验证方式:HTTP route 请求在 SpacetimeDB 短暂不可用时仍可返回;恢复后 sealed 文件会被批量写入并清理。 +- 关联文档:`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`、`docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md`。 + +## 2026-05-19 OTLP 默认开启但日志本地输出保留 + +- 背景:生产和容器环境需要默认把 OTLP 接到本机 Collector,但压测或排障时也要能显式关闭。 +- 决策:生产与容器 `api-server` env 模板默认 `GENARRATIVE_OTEL_ENABLED=true`;生产 endpoint 用 `http://127.0.0.1:4318`,容器 endpoint 用 `http://otelcol:4318`;`OTEL_EXPORTER_OTLP_ENDPOINT` 只填 Collector HTTP base endpoint,不填 gRPC `4317` 或 Rider 端口;本地日志、Nginx 日志和 `GENARRATIVE_API_LOG` / `RUST_LOG` 仍保留。 +- 影响范围:`deploy/env/api-server.env.example`、`deploy/container/api-server.env.example`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`、`scripts/loadtest/README.md`。 +- 验证方式:检查 env 模板默认值与端点口径;压测若要关闭 OTLP,必须显式设置 `GENARRATIVE_OTEL_ENABLED=false`。 +- 关联文档:`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`、`scripts/run-otelcol.mjs`。 + ## 2026-05-17 容器化方案只作为隔离压测与预发模拟路径 - 背景:Windows 本机直连极高 VU 压测会放大本地连接与发送缓冲行为,和线上 Linux + Nginx + systemd 拓扑不一致;需要一个更接近生产网络层的模拟方案,但不能扰动当前生产发布链路。 diff --git a/.hermes/shared-memory/pitfalls.md b/.hermes/shared-memory/pitfalls.md index f6a9e8d6..1b5e20e8 100644 --- a/.hermes/shared-memory/pitfalls.md +++ b/.hermes/shared-memory/pitfalls.md @@ -22,6 +22,22 @@ - 验证:拼图入口测试仍可通过,且新组件可通过不同页面复用而不需要复制上传卡实现。 - 关联:`src/components/common/CreativeImageInputPanel.tsx`、`src/components/puzzle-agent/PuzzleAgentWorkspace.tsx`。 +## OTLP 端点只填 Collector HTTP base endpoint + +- 现象:生产或容器 env 里把 `OTEL_EXPORTER_OTLP_ENDPOINT` 填成 `4317`、Rider 端口或别的非 HTTP base endpoint 后,api-server 发不出 OTLP,或者链路被错误转发。 +- 原因:api-server 当前走 OTLP HTTP,不是 gRPC;Collector 才是接收和转发边界。 +- 处理:生产模板用 `http://127.0.0.1:4318`,容器模板用 `http://otelcol:4318`;需要关闭时显式设 `GENARRATIVE_OTEL_ENABLED=false`,不要通过改 endpoint 绕开 Collector 语义。 +- 验证:检查 env 模板和运行态配置都指向 Collector HTTP base endpoint,日志仍通过 `journalctl` / 文件日志保留。 +- 关联:`deploy/env/api-server.env.example`、`deploy/container/api-server.env.example`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。 + +## 
tracking outbox 到批量阈值后先封存再异步 flush + +- 现象:route tracking 高峰时如果主请求线程要等 SpacetimeDB 批量入库,接口延迟会被 outbox 写入链路拖长。 +- 原因:outbox 的职责是把普通 HTTP route tracking 从请求线程切走,不能把 flush 结果回写成同步阻塞。 +- 处理:达到 `BATCH_SIZE` 立即封存 active 文件并切新 active,`FLUSH_INTERVAL_MS` 只做兜底封存,后台 worker 异步 flush sealed 文件;成功删文件,失败保留重试,坏文件隔离为 `corrupt-*`,`MAX_BYTES` 只做磁盘保护。 +- 验证:普通 route 请求在 SpacetimeDB 不可用时仍能返回,恢复后 sealed 文件会继续被清理。 +- 关联:`server-rs/crates/api-server/src/tracking_outbox.rs`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。 + ## 汪汪声浪入口不要再回到独立配置阶段 - 现象:汪汪声浪入口如果继续切换到独立配置阶段,会和拼图、抓大鹅的创作页内嵌结构不一致,用户会感觉入口跳页。 diff --git a/deploy/container/api-server.env.example b/deploy/container/api-server.env.example index 6c559c0e..a3e0dd33 100644 --- a/deploy/container/api-server.env.example +++ b/deploy/container/api-server.env.example @@ -18,7 +18,7 @@ GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE=500 GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS=1000 GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES=268435456 -GENARRATIVE_OTEL_ENABLED=false +GENARRATIVE_OTEL_ENABLED=true OTEL_SERVICE_NAME=genarrative-api OTEL_EXPORTER_OTLP_ENDPOINT=http://otelcol:4318 OTEL_RESOURCE_ATTRIBUTES=deployment.environment=container,service.namespace=genarrative diff --git a/deploy/env/api-server.env.example b/deploy/env/api-server.env.example index d2c835b9..c7a85bee 100644 --- a/deploy/env/api-server.env.example +++ b/deploy/env/api-server.env.example @@ -16,7 +16,7 @@ GENARRATIVE_TRACKING_OUTBOX_DIR=/var/lib/genarrative/tracking-outbox GENARRATIVE_TRACKING_OUTBOX_BATCH_SIZE=500 GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS=1000 GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES=268435456 -GENARRATIVE_OTEL_ENABLED=false +GENARRATIVE_OTEL_ENABLED=true OTEL_SERVICE_NAME=genarrative-api OTEL_EXPORTER_OTLP_ENDPOINT=http://127.0.0.1:4318 OTEL_RESOURCE_ATTRIBUTES=deployment.environment=production,service.namespace=genarrative diff --git a/docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md b/docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md index decb4f96..60afea3e 100644 --- 
a/docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md +++ b/docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md @@ -602,7 +602,7 @@ npm run check:server-rs-ddd - Rust 结构体:`TrackingEvent` - 源码:`server-rs/crates/spacetime-module/src/runtime/profile.rs` -- 写入:关键业务埋点同步调用单条 procedure;普通 HTTP route tracking 由 `api-server` 本机 outbox 批量调用 `record_tracking_events_and_return`。`event_id` 必须稳定且全局唯一,批量重试时用唯一索引做幂等跳过。 +- 写入:关键业务埋点同步调用单条 procedure;普通 HTTP route tracking 由 `api-server` 本机 outbox 批量调用 `record_tracking_events_and_return`。outbox 到达批量阈值时先封存 active 文件并切新 active,后台 worker 异步 flush sealed 文件,HTTP 请求线程不等待 SpacetimeDB。`FLUSH_INTERVAL_MS` 只负责兜底封存长时间未满批的 active 文件,`MAX_BYTES` 只做磁盘保护阈值。`event_id` 必须稳定且全局唯一,批量重试时用唯一索引做幂等跳过。 ### `treasure_record` diff --git a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md index 80f523da..ef94f558 100644 --- a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md +++ b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md @@ -177,12 +177,12 @@ npm run container:down 容器方案默认暴露 `http://127.0.0.1:18080`,`api-server` 在容器内监听 `0.0.0.0:8082`,Nginx 通过 `api-server:8082` upstream 反代 `/api/` 和 `/admin/api/`。SpacetimeDB 也纳入 compose,容器内由 `spacetimedb:3101` 提供服务,宿主机通过 `http://127.0.0.1:13101` 进行模块发布;Collector 镜像使用 `otel/opentelemetry-collector-contrib:0.151.0`。生产 provision 侧则通过 Jenkins 构建机准备的 `provision-tools/otelcol-contrib` 安装本机 `otelcol-contrib.service`,真实库名、token 和外部服务密钥只写本地 `deploy/container/api-server.env`,不提交 Git。完整拓扑、端口、k6 参数和 OTLP debug exporter 使用方法见 `deploy/container/README.md`。 `npm run container:config` 默认只做 quiet 校验,避免把本地 env 中的 token 展开到终端;确需排查完整 compose 时再传 `-- --print`。 -OpenTelemetry 现阶段可选 OTLP traces / metrics / logs,但本地日志与 Nginx 文件日志仍保留: +OpenTelemetry 现阶段默认开启 OTLP traces / metrics / logs,但本地日志与 Nginx 文件日志仍保留: -- 默认 `GENARRATIVE_OTEL_ENABLED=false`,未开启时 api-server 不依赖 Collector。 -- Collector 使用官方 `otelcol-contrib`,只监听 `127.0.0.1:4317/4318`;本地用 `npm run otel:debug` 启动 debug exporter,用 `npm run otel:rider` 转发到 Rider,再接 Jaeger、Tempo、Prometheus、Grafana 
或托管平台。 -- api-server 开启时使用 `OTEL_SERVICE_NAME=genarrative-api`、`OTEL_EXPORTER_OTLP_ENDPOINT=http://127.0.0.1:4318`。 -- api-server 当前发 OTLP HTTP,`OTEL_EXPORTER_OTLP_ENDPOINT` 指向 Collector HTTP base endpoint;不要改到 gRPC `4317` 或 Rider 端口,Rider 由 Collector 通过 `RIDER_OTLP_GRPC_ENDPOINT` 转发。 +- 生产与容器 `api-server` env 模板默认 `GENARRATIVE_OTEL_ENABLED=true`;压测、排障或短期要关闭 OTLP 时,必须显式设置 `GENARRATIVE_OTEL_ENABLED=false`。 +- Collector 使用官方 `otelcol-contrib`,安装与启用仍由 `ENABLE_OTELCOL` / provision 控制,只监听 `127.0.0.1:4317/4318`;本地用 `npm run otel:debug` 启动 debug exporter,用 `npm run otel:rider` 转发到 Rider,再接 Jaeger、Tempo、Prometheus、Grafana 或托管平台。 +- api-server 发送 OTLP HTTP 时,生产模板使用 `OTEL_SERVICE_NAME=genarrative-api`、`OTEL_EXPORTER_OTLP_ENDPOINT=http://127.0.0.1:4318`,容器模板使用 `OTEL_EXPORTER_OTLP_ENDPOINT=http://otelcol:4318`。 +- `OTEL_EXPORTER_OTLP_ENDPOINT` 必须指向 Collector 的 HTTP base endpoint;不要填 gRPC `4317`,也不要直接填 Rider 端口,Rider 由 Collector 通过 `RIDER_OTLP_GRPC_ENDPOINT` 转发。 - 应用日志仍通过 `journalctl -u genarrative-api.service` 查看,Nginx 日志仍写文件;日志等级继续用 `GENARRATIVE_API_LOG` / `RUST_LOG` 控制,例如 `info,tower_http=info,spacetime_client=info`。 - debug exporter / Rider 转发都会同时接收 traces、metrics 和 logs。 - api-server 会随 metrics 发送进程级指标:`process.memory.usage`、`process.memory.virtual`、`process.cpu.time`、`genarrative.process.cpu.usage_percent`、`process.thread.count`、`genarrative.process.memory.private`;Windows 额外发送 `process.windows.handle.count`,Linux 额外发送 `process.unix.file_descriptor.count`。这些指标只描述当前进程,不携带请求、用户或作品 label。 @@ -252,7 +252,7 @@ GENARRATIVE_TRACKING_OUTBOX_FLUSH_INTERVAL_MS=1000 GENARRATIVE_TRACKING_OUTBOX_MAX_BYTES=268435456 ``` -outbox 采用 NDJSON 文件保存原始事件。达到 `BATCH_SIZE` 或 `FLUSH_INTERVAL_MS` 任一阈值后,当前 active 文件会被原子切换为 sealed 文件并进入批量 flush;SpacetimeDB 批量 procedure 返回成功后删除 sealed 文件,失败则保留文件并重试。`MAX_BYTES` 是磁盘保护阈值,不是 flush 阈值;超过后低价值 route tracking 可以被丢弃并记录日志 / 指标,关键同步事件不进入该丢弃路径。sealed 文件若出现无法解析的坏行,会重命名为 `corrupt-*` 隔离并记录 `genarrative.tracking_outbox.files.corrupt` 
指标,避免一个坏文件阻塞后续批量入库。该机制提供至少一次投递语义,依赖 `tracking_event.event_id` 幂等跳过重复事件。 +outbox 采用 NDJSON 文件保存原始事件。达到 `BATCH_SIZE` 时会立刻把当前 active 文件原子封存为 sealed 文件,并马上切到新的 active 继续写入;后台 worker 异步 flush sealed 文件,HTTP 请求线程不等待 SpacetimeDB。`FLUSH_INTERVAL_MS` 只负责兜底封存长时间未满批的 active 文件。SpacetimeDB 批量 procedure 返回成功后删除 sealed 文件,失败则保留文件并重试。`MAX_BYTES` 是磁盘保护阈值,不是 flush 阈值;超过后低价值 route tracking 可以被丢弃并记录日志 / 指标,关键同步事件不进入该丢弃路径。sealed 文件若出现无法解析的坏行,会重命名为 `corrupt-*` 隔离并记录 `genarrative.tracking_outbox.files.corrupt` 指标,避免一个坏文件阻塞后续批量入库。该机制提供至少一次投递语义,依赖 `tracking_event.event_id` 幂等跳过重复事件。 常用检查思路: diff --git a/jenkins/Jenkinsfile.production-server-provision b/jenkins/Jenkinsfile.production-server-provision index 0b8a5e2d..00de7272 100644 --- a/jenkins/Jenkinsfile.production-server-provision +++ b/jenkins/Jenkinsfile.production-server-provision @@ -32,7 +32,7 @@ pipeline { string(name: 'API_PORT', defaultValue: '8082', description: 'api-server 本机监听端口') choice(name: 'NGINX_CONFIG_MODE', choices: ['none', 'production-https', 'development-http'], description: 'Nginx 配置模式;开发服无域名时选 development-http,release 正式入口选 production-https') booleanParam(name: 'ENABLE_SERVICES', defaultValue: true, description: '启用并启动 spacetimedb 与 api-server systemd 服务') - booleanParam(name: 'ENABLE_OTELCOL', defaultValue: true, description: '安装并启用本机 OpenTelemetry Collector;api-server 是否发送 OTLP 仍由环境变量控制') + booleanParam(name: 'ENABLE_OTELCOL', defaultValue: true, description: '安装并启用本机 OpenTelemetry Collector;api-server 模板默认开启 OTLP,如需关闭请在 API_ENV_FILE 中将 GENARRATIVE_OTEL_ENABLED 改为 false') string(name: 'OTELCOL_VERSION', defaultValue: '0.151.0', description: 'otelcol-contrib 版本') } diff --git a/scripts/loadtest/README.md b/scripts/loadtest/README.md index cb2d38f1..2f071e8d 100644 --- a/scripts/loadtest/README.md +++ b/scripts/loadtest/README.md @@ -247,7 +247,7 @@ sudo journalctl -u genarrative-api.service -f sudo journalctl -u spacetimedb.service -f ``` -api-server 的 OpenTelemetry 默认关闭。需要验证 OTLP traces / metrics / logs 
时,先在服务器本机启动只监听 `127.0.0.1` 的 `otelcol-contrib` debug exporter: +api-server 的 OpenTelemetry 在生产与容器模板里默认开启。需要临时关闭时,显式把 `GENARRATIVE_OTEL_ENABLED=false`;需要验证 OTLP traces / metrics / logs 时,先在服务器本机启动只监听 `127.0.0.1` 的 `otelcol-contrib` debug exporter: ```bash npm run otel:debug diff --git a/server-rs/crates/api-server/src/tracking_outbox.rs b/server-rs/crates/api-server/src/tracking_outbox.rs index 19a61ed6..cf2b4a97 100644 --- a/server-rs/crates/api-server/src/tracking_outbox.rs +++ b/server-rs/crates/api-server/src/tracking_outbox.rs @@ -11,7 +11,7 @@ use spacetime_client::{SpacetimeClient, SpacetimeClientError}; use tokio::{ fs::{self, File, OpenOptions}, io::{AsyncBufReadExt, AsyncWriteExt, BufReader}, - sync::Mutex, + sync::{Mutex, Notify}, time::sleep, }; use tracing::{debug, warn}; @@ -31,6 +31,7 @@ pub struct TrackingOutbox { max_bytes: u64, spacetime_client: SpacetimeClient, inner: Arc>, + flush_notify: Arc, } struct TrackingOutboxInner { @@ -81,6 +82,7 @@ impl TrackingOutbox { total_bytes, last_sealed_at: Instant::now(), })), + flush_notify: Arc::new(Notify::new()), }; crate::telemetry::update_tracking_outbox_pending_bytes(total_bytes); Some(Arc::new(outbox)) @@ -129,6 +131,7 @@ impl TrackingOutbox { if inner.active_count >= self.batch_size { self.seal_active_locked(&mut inner, "batch_size").await?; + self.flush_notify.notify_one(); } Ok(TrackingOutboxEnqueueOutcome::Enqueued) @@ -137,12 +140,20 @@ impl TrackingOutbox { pub fn spawn_worker(self: Arc) { tokio::spawn(async move { loop { - sleep(self.flush_interval).await; - if let Err(error) = self.seal_active_if_due().await { - warn!(error = %error, "tracking outbox 定时封存 active 文件失败"); - } - if let Err(error) = self.flush_sealed_files_once().await { - warn!(error = %error, "tracking outbox 批量写入 SpacetimeDB 失败,将保留 sealed 文件等待重试"); + tokio::select! 
{ + _ = sleep(self.flush_interval) => { + if let Err(error) = self.seal_active_if_due().await { + warn!(error = %error, "tracking outbox 定时封存 active 文件失败"); + } + if let Err(error) = self.flush_sealed_files_once().await { + warn!(error = %error, "tracking outbox 批量写入 SpacetimeDB 失败,将保留 sealed 文件等待重试"); + } + } + _ = self.flush_notify.notified() => { + if let Err(error) = self.flush_sealed_files_once().await { + warn!(error = %error, "tracking outbox 批量写入 SpacetimeDB 失败,将保留 sealed 文件等待重试"); + } + } } } }); @@ -502,7 +513,7 @@ mod tests { } #[tokio::test] - async fn enqueue_seals_active_file_when_batch_size_reached() { + async fn enqueue_seals_active_file_when_batch_size_reached_and_rotates_active() { let dir = test_dir("batch"); let outbox = test_outbox(dir.clone(), 2, 1024 * 1024); @@ -522,6 +533,22 @@ mod tests { .count(); assert_eq!(sealed_count, 1); + outbox.enqueue(sample_event("event-3")).await.unwrap(); + + let active_contents = std::fs::read_to_string(dir.join(ACTIVE_FILE_NAME)).unwrap(); + assert!(active_contents.contains("event-3")); + let sealed_count_after_rotate = std::fs::read_dir(&dir) + .unwrap() + .filter_map(Result::ok) + .filter(|entry| { + entry + .file_name() + .to_str() + .is_some_and(|name| name.starts_with(SEALED_FILE_PREFIX)) + }) + .count(); + assert_eq!(sealed_count_after_rotate, 1); + let _ = std::fs::remove_dir_all(dir); } From f557bc3f06f29a443bffb6d324d68b0e58a2c2b1 Mon Sep 17 00:00:00 2001 From: kdletters <61648117+kdletters@users.noreply.github.com> Date: Tue, 19 May 2026 08:45:59 +0800 Subject: [PATCH 6/6] feat: add grafana cloud collector switch for container loadtest --- .hermes/shared-memory/decision-log.md | 8 +++++ deploy/container/README.md | 12 +++++++ deploy/container/docker-compose.loadtest.yml | 6 +++- deploy/container/otelcol.grafana.yaml | 36 ++++++++++++++++++++ 4 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 deploy/container/otelcol.grafana.yaml diff --git a/.hermes/shared-memory/decision-log.md 
b/.hermes/shared-memory/decision-log.md index 8e0f9296..68b114af 100644 --- a/.hermes/shared-memory/decision-log.md +++ b/.hermes/shared-memory/decision-log.md @@ -32,6 +32,14 @@ - 验证方式:检查 env 模板默认值与端点口径;压测若要关闭 OTLP,必须显式设置 `GENARRATIVE_OTEL_ENABLED=false`。 - 关联文档:`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`、`scripts/run-otelcol.mjs`。 +## 2026-05-19 容器 collector 可切 Grafana Cloud + +- 背景:容器隔离压测时除了本地 debug exporter,还需要临时把 traces / metrics / logs 转发到 Grafana Cloud 做可视化验证。 +- 决策:`deploy/container/docker-compose.loadtest.yml` 里的 `otelcol` 支持通过 `GENARRATIVE_CONTAINER_OTELCOL_CONFIG=./otelcol.grafana.yaml` 切换配置;`deploy/container/otelcol.grafana.yaml` 同时保留 debug exporter,并通过 `GRAFANA_CLOUD_OTLP_ENDPOINT` 和 `GRAFANA_CLOUD_BASIC_AUTH_HEADER` 转发到 Grafana Cloud。 +- 影响范围:`deploy/container/docker-compose.loadtest.yml`、`deploy/container/otelcol.grafana.yaml`、`deploy/container/README.md`。 +- 验证方式:容器 `otelcol` 启动日志应能看到 OTLP receiver ready,debug exporter 仍可输出本地链路;Grafana Cloud 转发凭据只通过当前 shell 环境变量传入,不写入 Git。 +- 关联文档:`deploy/container/README.md`、`scripts/loadtest/README.md`。 + ## 2026-05-17 容器化方案只作为隔离压测与预发模拟路径 - 背景:Windows 本机直连极高 VU 压测会放大本地连接与发送缓冲行为,和线上 Linux + Nginx + systemd 拓扑不一致;需要一个更接近生产网络层的模拟方案,但不能扰动当前生产发布链路。 diff --git a/deploy/container/README.md b/deploy/container/README.md index 3fa60fdf..b9338457 100644 --- a/deploy/container/README.md +++ b/deploy/container/README.md @@ -162,6 +162,18 @@ npm run container:logs -- otelcol Collector 日志会输出 traces / metrics / logs。接 Rider、Jaeger、Tempo、Prometheus、Grafana 或托管平台时,另建独立 Collector 配置,不直接改生产 systemd 或 Nginx 模板。 +容器内需要临时转发到 Grafana Cloud 时,切换 Collector 配置并从当前 shell 传入 Grafana Cloud 凭据;真实 token 不写入仓库文件: + +```powershell +$env:GENARRATIVE_CONTAINER_OTELCOL_CONFIG="./otelcol.grafana.yaml" +$env:GRAFANA_CLOUD_OTLP_ENDPOINT="https://..." +$env:GRAFANA_CLOUD_BASIC_AUTH_HEADER="Basic ..." 
+npm run container:up +npm run container:logs -- otelcol +``` + +`deploy/container/otelcol.grafana.yaml` 会同时保留本地 debug exporter,并通过 `otlp_http/grafana` 把 traces / metrics / logs 发到 Grafana Cloud。 + ## 隔离边界 - 不改生产 systemd 单元。 diff --git a/deploy/container/docker-compose.loadtest.yml b/deploy/container/docker-compose.loadtest.yml index c7e00cbc..afac4962 100644 --- a/deploy/container/docker-compose.loadtest.yml +++ b/deploy/container/docker-compose.loadtest.yml @@ -104,8 +104,12 @@ services: command: ["--config=/etc/otelcol/config.yaml"] cpus: "0.25" mem_limit: 128m + environment: + GRAFANA_CLOUD_OTLP_ENDPOINT: ${GRAFANA_CLOUD_OTLP_ENDPOINT:-} + GRAFANA_CLOUD_BASIC_AUTH_HEADER: ${GRAFANA_CLOUD_BASIC_AUTH_HEADER:-} + HOSTNAME: ${HOSTNAME:-genarrative-container-loadtest} volumes: - - ./otelcol.yaml:/etc/otelcol/config.yaml:ro + - ${GENARRATIVE_CONTAINER_OTELCOL_CONFIG:-./otelcol.yaml}:/etc/otelcol/config.yaml:ro ports: - "${GENARRATIVE_CONTAINER_OTLP_GRPC_PORT:-4317}:4317" - "${GENARRATIVE_CONTAINER_OTLP_HTTP_PORT:-4318}:4318" diff --git a/deploy/container/otelcol.grafana.yaml b/deploy/container/otelcol.grafana.yaml new file mode 100644 index 00000000..ae0af6f4 --- /dev/null +++ b/deploy/container/otelcol.grafana.yaml @@ -0,0 +1,36 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 5s + send_batch_size: 512 + send_batch_max_size: 1024 + +exporters: + debug: + verbosity: basic + otlp_http/grafana: + endpoint: ${env:GRAFANA_CLOUD_OTLP_ENDPOINT} + headers: + Authorization: ${env:GRAFANA_CLOUD_BASIC_AUTH_HEADER} + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [debug, otlp_http/grafana] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [debug, otlp_http/grafana] + logs: + receivers: [otlp] + processors: [batch] + exporters: [debug, otlp_http/grafana]