feat(api-server): audit external api failures

This commit is contained in:
kdletters
2026-05-21 16:33:13 +08:00
parent 487efff9c4
commit cc23b6020d
19 changed files with 2266 additions and 56 deletions

16
.codegraph/.gitignore vendored Normal file
View File

@@ -0,0 +1,16 @@
# CodeGraph data files
# These are local to each machine and should not be committed
# Database
*.db
*.db-wal
*.db-shm
# Cache
cache/
# Logs
*.log
# Hook markers
.dirty

143
.codegraph/config.json Normal file
View File

@@ -0,0 +1,143 @@
{
"version": 1,
"include": [
"**/*.ts",
"**/*.tsx",
"**/*.js",
"**/*.jsx",
"**/*.py",
"**/*.go",
"**/*.rs",
"**/*.java",
"**/*.c",
"**/*.h",
"**/*.cpp",
"**/*.hpp",
"**/*.cc",
"**/*.cxx",
"**/*.cs",
"**/*.php",
"**/*.rb",
"**/*.swift",
"**/*.kt",
"**/*.kts",
"**/*.dart",
"**/*.svelte",
"**/*.vue",
"**/*.liquid",
"**/*.pas",
"**/*.dpr",
"**/*.dpk",
"**/*.lpr",
"**/*.dfm",
"**/*.fmx",
"**/*.scala",
"**/*.sc"
],
"exclude": [
"**/.git/**",
"**/node_modules/**",
"**/vendor/**",
"**/Pods/**",
"**/dist/**",
"**/build/**",
"**/out/**",
"**/bin/**",
"**/obj/**",
"**/target/**",
"**/*.min.js",
"**/*.bundle.js",
"**/.next/**",
"**/.nuxt/**",
"**/.svelte-kit/**",
"**/.output/**",
"**/.turbo/**",
"**/.cache/**",
"**/.parcel-cache/**",
"**/.vite/**",
"**/.astro/**",
"**/.docusaurus/**",
"**/.gatsby/**",
"**/.webpack/**",
"**/.nx/**",
"**/.yarn/cache/**",
"**/.pnpm-store/**",
"**/storybook-static/**",
"**/.expo/**",
"**/web-build/**",
"**/ios/Pods/**",
"**/ios/build/**",
"**/android/build/**",
"**/android/.gradle/**",
"**/__pycache__/**",
"**/.venv/**",
"**/venv/**",
"**/site-packages/**",
"**/dist-packages/**",
"**/.pytest_cache/**",
"**/.mypy_cache/**",
"**/.ruff_cache/**",
"**/.tox/**",
"**/.nox/**",
"**/*.egg-info/**",
"**/.eggs/**",
"**/go/pkg/mod/**",
"**/target/debug/**",
"**/target/release/**",
"**/.gradle/**",
"**/.m2/**",
"**/generated-sources/**",
"**/.kotlin/**",
"**/.dart_tool/**",
"**/.vs/**",
"**/.nuget/**",
"**/artifacts/**",
"**/publish/**",
"**/cmake-build-*/**",
"**/CMakeFiles/**",
"**/bazel-*/**",
"**/vcpkg_installed/**",
"**/.conan/**",
"**/Debug/**",
"**/Release/**",
"**/x64/**",
"**/.pio/**",
"**/release/**",
"**/*.app/**",
"**/*.asar",
"**/DerivedData/**",
"**/.build/**",
"**/.swiftpm/**",
"**/xcuserdata/**",
"**/Carthage/Build/**",
"**/SourcePackages/**",
"**/__history/**",
"**/__recovery/**",
"**/*.dcu",
"**/.composer/**",
"**/storage/framework/**",
"**/bootstrap/cache/**",
"**/.bundle/**",
"**/tmp/cache/**",
"**/public/assets/**",
"**/public/packs/**",
"**/.yardoc/**",
"**/coverage/**",
"**/htmlcov/**",
"**/.nyc_output/**",
"**/test-results/**",
"**/.coverage/**",
"**/.idea/**",
"**/logs/**",
"**/tmp/**",
"**/temp/**",
"**/_build/**",
"**/docs/_build/**",
"**/site/**"
],
"languages": [],
"frameworks": [],
"maxFileSize": 1048576,
"extractDocstrings": true,
"trackCallSites": true
}

View File

@@ -16,6 +16,23 @@
---
## 2026-05-21 外部 API 失败必须 OTLP 上报并落库
- 背景:图片生成等外部供应商调用失败时,仅返回 502/504 或普通日志无法支持后续按 provider、阶段和重试属性聚合排障。
- 决策:外部 API 调用未成功时,`api-server` 必须同时发送 OTLP 失败观测并写入 `tracking_event`。当前通用 VectorEngine `gpt-image-2-all` 图片生成 / 编辑适配器记录 `external_api_call_failure`,`scope_kind = module`,`scope_id = provider`,`module_key = external-api`,metadata 包含 endpoint、operation、failureStage、statusCode、statusClass、timeout、retryable、errorMessage、latencyMs、promptChars、referenceImageCount、imageModel 和 rawExcerpt。
- 落库方式:优先复用 tracking outbox 异步批量写入;outbox 不可写或因保护阈值拒绝时回退同步直写 SpacetimeDB。不新增 SpacetimeDB 表,不让 reducer 做外部 I/O。
- 影响范围:`server-rs/crates/api-server/src/external_api_audit.rs`、`server-rs/crates/api-server/src/openai_image_generation.rs`、`server-rs/crates/api-server/src/telemetry.rs`、tracking outbox、后端架构文档和开发运维文档。
- 验证方式:执行 `cargo test -p api-server external_api_audit --manifest-path server-rs/Cargo.toml -- --nocapture`、`cargo test -p api-server openai_image_generation --manifest-path server-rs/Cargo.toml -- --nocapture`、`cargo check -p api-server --manifest-path server-rs/Cargo.toml`、`npm run check:encoding`。
- 关联文档:`docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。
## 2026-05-21 Nginx 通用 API 入口放行创作参考图请求体
- 背景release 上拼图结果页重绘动作携带参考图 Data URL 时Nginx access log 出现 `413``request_time=0.000``upstream_status=-`,说明请求被反代层默认 1 MiB 上限拦截,未进入 `api-server`
- 决策:发布、开发服和容器 Nginx 模板的通用 `location ~ ^/api(?:/|$)` 统一设置 `client_max_body_size 64m`。该值只作为反代放行上限,具体业务请求体和图片字节上限继续由 `api-server` 路由 `DefaultBodyLimit` 与业务校验控制,不能替代接口级限制。
- 影响范围:`deploy/nginx/genarrative.conf``deploy/nginx/genarrative-dev-http.conf``deploy/container/nginx.conf`、Nginx README、生产运维文档和 release 排障口径。
- 验证方式:目标机 `nginx -T 2>/dev/null | grep client_max_body_size` 应看到 `client_max_body_size 64m;`;大于 1 MiB 的参考图请求不再在 Nginx 层直接 413access log 应出现有效 `upstream_status`
- 关联文档:`deploy/nginx/README.md``docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`
## 2026-05-18 Rust 手写模块入口统一不用 mod.rs

View File

@@ -112,6 +112,17 @@ SpacetimeDB bindings 生成:
npm run spacetime:generate
```
CodeGraph 本地语义索引:
```bash
npm run codegraph:init
npm run codegraph:status
npm run codegraph:sync
npm run codegraph:index
```
`.codegraph/config.json` 可随仓库共享;`.codegraph/codegraph.db`、缓存和日志为本机生成物不提交。Codex CLI / Cursor / Claude Code 等 MCP 客户端配置属于个人环境;需要时由成员本机执行 `codegraph install` 或查看 `codegraph install --print-config codex`,不要提交个人全局配置。
## 常用检查命令
- 后端通用用户行为埋点统一通过 `record_tracking_event_and_return` procedure、`SpacetimeRuntimeClient::record_tracking_event(...)` 与 api-server `tracking` 中间件写入 `tracking_event` / `tracking_daily_stat`后台、RPG、大鱼吃小鱼、Visual Novel、Story、Combat 默认排除;作品级游玩埋点统一使用 `work_play_start`,详细事件清单见 `docs/technical/BACKEND_TRACKING_EVENT_COVERAGE_2026-05-09.md`

View File

@@ -54,6 +54,22 @@
- 验证:`tr '\0' '\n' < /proc/$(systemctl show genarrative-api.service -p MainPID --value)/environ | grep GENARRATIVE_TRACKING_OUTBOX_DIR` 应指向 `/var/lib/genarrative/tracking-outbox`;重启后当前 PID 不再出现 `Permission denied (os error 13)`
- 关联:`scripts/deploy/production-api-deploy.sh``scripts/jenkins-server-provision.sh``docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`
## 外部 API 失败没法追溯先查 external_api_call_failure
- 现象:VectorEngine 图片生成 / 编辑接口对前端只表现为 `502` / `504` 或“上游服务请求失败”,但难以区分是请求发送失败、上游 429/5xx、响应解析失败、未返回图片还是下载图片失败。
- 原因:外部 API 失败如果只靠普通日志,不一定能和 OTLP 指标、trace 与 SpacetimeDB 历史查询稳定关联;重启后也容易丢失上下文。
- 处理:先查 OTLP 指标 `genarrative.external_api.failures{provider,failure_stage,status_class,retryable}`,再查 `tracking_event``event_key = 'external_api_call_failure'``metadata_json`。当前通用 VectorEngine `gpt-image-2-all` 适配器会记录 provider、endpoint、operation、failureStage、statusCode、statusClass、timeout、retryable、errorMessage、latencyMs、promptChars、referenceImageCount、imageModel 和 rawExcerpt。
- 验证:`SELECT event_id, scope_id AS provider, metadata_json, occurred_at FROM tracking_event WHERE event_key = 'external_api_call_failure' ORDER BY occurred_at DESC LIMIT 50;`;如果查不到,同时看 tracking outbox 目录权限和 sealed 文件是否堆积。
- 关联:`server-rs/crates/api-server/src/external_api_audit.rs`、`server-rs/crates/api-server/src/openai_image_generation.rs`、`docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。
## release 创作接口 413 先查 Nginx 请求体上限
- 现象release 上 `POST /api/runtime/puzzle/agent/sessions/{session_id}/actions` 携带参考图 Data URL 时返回 `413 Request Entity Too Large`access log 显示 `request_time=0.000``upstream_status=-`
- 原因Nginx 默认 `client_max_body_size` 只有 1 MiB请求在反代层被拒绝根本没有到达 `api-server`,即使 Rust 路由已通过 `DefaultBodyLimit` 放宽到更大的参考图请求体也不会生效。
- 处理:在 release、development-http 和容器 Nginx 模板的通用 `/api` location 设置 `client_max_body_size 64m`;该值只负责放行到 `api-server`,真实业务上限继续由路由 `DefaultBodyLimit` 和解码后字节校验承担。发布后运行 `nginx -t && nginx -s reload`
- 验证:`nginx -T 2>/dev/null | grep client_max_body_size` 应能看到 `client_max_body_size 64m;`;再次提交大于 1 MiB 的参考图请求时access log 应出现正常 `upstream_status`,不再是 Nginx 直接 413。
- 关联:`deploy/nginx/genarrative.conf``deploy/nginx/genarrative-dev-http.conf``deploy/container/nginx.conf``deploy/nginx/README.md``docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`
## 汪汪声浪入口不要再回到独立配置阶段
- 现象:汪汪声浪入口如果继续切换到独立配置阶段,会和拼图、抓大鹅的创作页内嵌结构不一致,用户会感觉入口跳页。

View File

@@ -170,6 +170,8 @@ http {
location ~ ^/api(?:/|$) {
default_type application/json;
# 中文注释:创作接口会携带参考图 Data URL;Nginx 只放行到 api-server,真实大小限制仍由路由 DefaultBodyLimit 和业务字节校验负责。
client_max_body_size 64m;
limit_conn genarrative_api_conn 64;
limit_req zone=genarrative_api_rps burst=64 nodelay;

View File

@@ -2,6 +2,12 @@
本配置片段由 `scripts/jenkins-server-provision.sh` 在安装 Nginx 站点配置时展开。
## 请求体大小
- 生产、开发服和容器模板都在通用 `location ~ ^/api(?:/|$)` 内设置 `client_max_body_size 64m`
- 该值只用于让携带参考图 Data URL 的创作接口抵达 `api-server`不要把它当作业务上传上限。Rust 路由仍通过 `DefaultBodyLimit` 和解码后字节校验限制具体接口,例如拼图参考图路由只放宽到 12 MiB 请求体,图片字节继续按业务规则拒绝。
- 若线上看到 `413 Request Entity Too Large`,并且 access log 里 `request_time=0.000 upstream_status=-`,通常是 Nginx 没有加载该模板或未 reload;先执行 `nginx -T | grep client_max_body_size` 和 `nginx -t`,再检查 `api-server`。
## gzip
- `deploy/nginx/genarrative.conf``deploy/nginx/genarrative-dev-http.conf` 默认开启 gzip。

View File

@@ -190,6 +190,8 @@ server {
# 临时兼容主站仍在使用的 /api/* HTTP facade前端完成 SpacetimeDB SDK 迁移后删除。
location ~ ^/api(?:/|$) {
default_type application/json;
# 中文注释:创作接口会携带参考图 Data URL;Nginx 只放行到 api-server,真实大小限制仍由路由 DefaultBodyLimit 和业务字节校验负责。
client_max_body_size 64m;
limit_conn genarrative_api_conn 64;
limit_req zone=genarrative_api_rps burst=64 nodelay;

View File

@@ -210,6 +210,8 @@ server {
# 临时兼容主站仍在使用的 /api/* HTTP facade前端完成 SpacetimeDB SDK 迁移后删除。
location ~ ^/api(?:/|$) {
default_type application/json;
# 中文注释:创作接口会携带参考图 Data URL;Nginx 只放行到 api-server,真实大小限制仍由路由 DefaultBodyLimit 和业务字节校验负责。
client_max_body_size 64m;
limit_conn genarrative_api_conn 64;
limit_req zone=genarrative_api_rps burst=64 nodelay;

View File

@@ -158,6 +158,7 @@ npm run check:server-rs-ddd
- Hyper3D / Rodin只保留后端安全代理和旧数据兼容新 Match3D 草稿和批量新增不再生成 GLB。
- 音频:视觉小说专用音频路由保留;拼图和抓大鹅生成入口暂时关闭,通用 `/api/creation/audio/*` 对相关目标返回 `410 Gone`
- OSS私有 generated legacy path 进入浏览器前必须通过 `/api/assets/read-url` 换签;不要裸请求 `/generated-*`
- 外部 API 失败审计:外部供应商调用未成功时,`api-server` 必须发送 OTLP 失败事件并写入 `tracking_event`。当前通用 VectorEngine `gpt-image-2-all` 图片生成 / 编辑适配器在 `request_send``response_body``upstream_status``response_parse``missing_image``image_download` 阶段失败时记录 `external_api_call_failure``scope_kind = module``scope_id = provider``module_key = external-api`metadata 固定包含 provider、endpoint、operation、failureStage、statusCode、statusClass、timeout、retryable、errorMessage、latencyMs、promptChars、referenceImageCount 和 imageModel。入库优先复用 tracking outboxoutbox 不可写或保护阈值拒绝时回退同步写 SpacetimeDB不得新增前端兜底或在 SpacetimeDB reducer 内做外部 I/O。
## SpacetimeDB 表目录
@@ -672,6 +673,7 @@ npm run check:server-rs-ddd
- Rust 结构体:`TrackingEvent`
- 源码:`server-rs/crates/spacetime-module/src/runtime/profile.rs`
- 写入:关键业务埋点同步调用单条 procedure普通 HTTP route tracking 由 `api-server` 本机 outbox 批量调用 `record_tracking_events_and_return`。outbox 到达批量阈值时先封存 active 文件并切新 active后台 worker 异步 flush sealed 文件HTTP 请求线程不等待 SpacetimeDB。`FLUSH_INTERVAL_MS` 只负责兜底封存长时间未满批的 active 文件,`MAX_BYTES` 只做磁盘保护阈值。`event_id` 必须稳定且全局唯一,批量重试时用唯一索引做幂等跳过。
- 外部 API 失败:`event_key = external_api_call_failure` 使用同一张表落库;它是供应商失败审计事实,不新增 SpacetimeDB 表,查询时按 `module_key = 'external-api'``scope_kind = module AND scope_id = '<provider>'` 过滤。
### `treasure_record`

View File

@@ -94,6 +94,27 @@ SpacetimeDB bindings
npm run spacetime:generate
```
## CodeGraph 本地代码索引
项目已安装 `@colbymchenry/codegraph` 作为开发期依赖,用于在本地生成语义代码索引,辅助 AI / IDE 做符号搜索、调用关系和影响范围分析。索引目录为 `.codegraph/`,其中 `config.json` 可提交,数据库、缓存和日志由 `.codegraph/.gitignore` 保持本机私有。
首次拉取或需要重建索引时:
```bash
npm install
npm run codegraph:init
```
日常使用:
```bash
npm run codegraph:status
npm run codegraph:sync
npm run codegraph:index
```
若要把 CodeGraph 接到 Codex CLI / Cursor / Claude Code 等 MCP 客户端,按本机 agent 配置执行 `codegraph install` 或参考 `codegraph install --print-config codex` 输出;不要把个人全局 agent 配置、token 或本机绝对路径提交到仓库。Codex CLI 当前没有项目级 MCP 配置,需由使用者在个人 `~/.codex/config.toml` 中配置。
## 后端改动验收
后端代码修改后,按变更范围选择:
@@ -164,7 +185,7 @@ Windows Stdb module 构建流水线运行在 Jenkins `windows` 节点上。该
- Windows 下载阶段如果出现 `curl: (18)` 或响应体截断,流水线会保留同名 `.download` 临时文件并用 `curl -C -` 断点续传;只有完整返回但 SHA256 digest 仍不匹配时才删除临时文件后重新下载。目标 Linux 节点仍只接收 `stash/unstash` 带过去的本地下载件,不回退外网下载。
- Windows 下载阶段如果走代理,在 `Genarrative-Server-Provision` 参数 `PROVISION_DOWNLOAD_PROXY` 填写 Windows Jenkins 节点可访问的 HTTP 代理,例如 `http://127.0.0.1:7890`;不要填写目标 release 机器视角的 `127.0.0.1`,除非代理确实运行在该 Windows 节点本机。Linux 目标机阶段会强制要求使用本地下载件,缺少文件直接失败,不再回退到外网下载。
- `otelcol-contrib.service` 作为可选系统服务加入 provision默认监听 `127.0.0.1:4317/4318` 并使用 `deploy/otelcol/genarrative-debug.yaml`。api-server 是否发送 OTLP 仍由 `GENARRATIVE_OTEL_ENABLED` 控制,服务 unit 见 `deploy/systemd/otelcol-contrib.service`
- Nginx `/api/``/admin/api/` 通过 `genarrative_api` upstream 代理到 `127.0.0.1:8082`upstream keepalive 为 64`limit_conn` 负责连接 / 并发保护,`limit_req` 负责入口 RPS 快拒绝。当前模板把公开 gallery list 单独放到 `genarrative_gallery_rps`,默认 `rate=5000r/s``burst=4096``limit_conn=320`;公开详情和普通 API 放到 `genarrative_api_rps`,后台 API 放到 `genarrative_admin_rps``limit_conn_status 429``limit_req_status 429` 必须在 HTTP 与 HTTPS server 中同时生效;若线上压测看到 `limiting connections by zone "genarrative_api_conn"` 却返回 503优先检查 `nginx -T` 里 HTTPS server 是否缺少这些状态码,以及 `/api/runtime/puzzle/gallery` 是否误落到通用 `location ~ ^/api``limit_conn=64`。压测时看 `/var/log/nginx/genarrative.access.log` 中的 `request_time``upstream_connect_time``upstream_header_time``upstream_response_time``upstream_status``request_id`
- Nginx `/api/``/admin/api/` 通过 `genarrative_api` upstream 代理到 `127.0.0.1:8082`upstream keepalive 为 64`limit_conn` 负责连接 / 并发保护,`limit_req` 负责入口 RPS 快拒绝。当前模板把公开 gallery list 单独放到 `genarrative_gallery_rps`,默认 `rate=5000r/s``burst=4096``limit_conn=320`;公开详情和普通 API 放到 `genarrative_api_rps`,后台 API 放到 `genarrative_admin_rps`通用 `/api` location 设置 `client_max_body_size 64m`只负责允许拼图、抓大鹅、Hyper3D 等创作接口携带参考图 Data URL 抵达 `api-server`;真实业务上限仍由 Rust 路由 `DefaultBodyLimit` 与解码后字节校验控制。若线上出现 `413 Request Entity Too Large` 且 access log 中 `request_time=0.000``upstream_status=-`,说明请求在 Nginx 层被拦截,先用 `nginx -T | grep client_max_body_size` 检查 release 模板是否已渲染并 reload。`limit_conn_status 429``limit_req_status 429` 必须在 HTTP 与 HTTPS server 中同时生效;若线上压测看到 `limiting connections by zone "genarrative_api_conn"` 却返回 503优先检查 `nginx -T` 里 HTTPS server 是否缺少这些状态码,以及 `/api/runtime/puzzle/gallery` 是否误落到通用 `location ~ ^/api``limit_conn=64`。压测时看 `/var/log/nginx/genarrative.access.log` 中的 `request_time``upstream_connect_time``upstream_header_time``upstream_response_time``upstream_status``request_id`
- 作品列表 K6 脚本一次 iteration 默认请求两个公开接口,因此约 50 HTTP req/s 的目标命令使用 `SCENARIO=spike START_RPS=5 PEAK_RPS=25 HOLD=60s END_RPS=5 DETAIL_RATIO=0 npm run loadtest:k6:works`
- 作品列表短期继续由 `api-server` / BFF 订阅 SpacetimeDB 公开 read model 后读本地 cache不让浏览器前端直接订阅完整列表未来如新增 `public_work_gallery_entry` 等专用公开作品列表 read model前端只可订阅稳定、低基数、公开的专用投影禁止订阅 `puzzle_work_profile``custom_world_profile` 等玩法源表后自行 join、聚合或判断权限。前端直订阅落地前必须先补齐权限、字段契约、排序 / 分页、埋点和 BFF 回退策略。
- 50 HTTP req/s 验收目标为 `http_req_failed < 1%``p95 < 2s``dropped_iterations = 0`,同时压测窗口内 Nginx 无新增 502。2026-05-19 容器 2C / 2G 连续 10 轮不重启 SpacetimeDB 压测:`PEAK_RPS=2500` 等价约 5000 HTTP req/s平均实际吞吐约 `4219 HTTP req/s`10 轮总计 `1,897,357` 个 200、`212,542` 个 429、`0` 个 5xx200 请求平均 `p95=123ms``p99=234ms`;该档会把 SpacetimeDB 容器内存从约 `366MiB` 推到约 `885MiB / 896MiB`,因此当前不要继续抬公开 gallery 入口并发,应优先处理 SpacetimeDB 侧连接 / 订阅 / tracking 写入后的内存高水位。
@@ -193,6 +214,7 @@ OpenTelemetry 现阶段默认开启 OTLP traces / metrics / logs但本地日
- debug exporter / Rider 转发都会同时接收 traces、metrics 和 logs。
- api-server 会随 metrics 发送进程级指标:`process.memory.usage``process.memory.virtual``process.cpu.time``genarrative.process.cpu.usage_percent``process.thread.count``genarrative.process.memory.private`Windows 额外发送 `process.windows.handle.count`Linux 额外发送 `process.unix.file_descriptor.count`。这些指标只描述当前进程,不携带请求、用户或作品 label。
- HTTP 运行态补充发送 `genarrative.http.server.response_bodies.in_flight``genarrative.http.server.request_permits.available`,后者带低基数 `pool=default|gallery|detail|admin` label用于区分业务 handler / 背压 permit 是否仍被占用;拼图广场热点缓存补充发送 `genarrative.puzzle_gallery.cache.*` 指标,记录 fresh hit、stale hit、未命中、后台刷新开始 / 失败、重建耗时和预序列化 data JSON 字节数。
- 外部 API 失败统一发送 OTLP 并落库。当前 VectorEngine `gpt-image-2-all` 图片生成 / 编辑失败会输出 `外部 API 调用失败` trace/log并记录指标 `genarrative.external_api.failures{provider,failure_stage,status_class,retryable}`;同时写入 `tracking_event``event_key = external_api_call_failure``module_key = external-api``scope_kind = module``scope_id = provider`。排障时先按 provider / failureStage 聚合,再结合 request 日志和上游响应 excerpt 判断是限流、超时、解析失败还是未返回图片。
- SpacetimeDB 观测分为两类procedure / reducer 调用继续用 `genarrative.spacetime.procedure.*`,订阅本地 cache 读使用 `genarrative.spacetime.read.*``read=list_puzzle_gallery` 表示拼图广场当前从 `puzzle_gallery_card_view` 本地 cache 读取,不再每个 HTTP 请求调用 `list_puzzle_gallery` procedure。
- 本地 Windows 直连压测的内存高水位要结合 K6 VU / 连接数解释。250 RPS 下过高 `PREALLOCATED_VUS` 可能让 300 个本地 Established 连接把 `api-server` private memory 瞬时推到 GB 级,且 `/healthz` 小响应也能复现;若压测结束后回落、`response_bodies.in_flight` 和背压 permit 未显示业务积压,应优先按连接 / 发送链路高水位处理,而不是判断为 SpacetimeDB 或 JSON 缓存泄漏。
- Rider 的 Logs 面板只展示 log event 自身字段,不会自动展开父 span 的全部 attributes请求完成日志会直接带 `request_id``http.request.method``http.route``url.scheme``url.path``http.response.status_code``status_class``latency_ms``slow_request`,完整链路继续到 Traces 面板按 trace/span 查看。
@@ -248,6 +270,16 @@ cargo test -p platform-auth --manifest-path server-rs/Cargo.toml aliyun_send_sms
个人任务首版 scope 仅支持 `user`。后台、RPG、大鱼吃小鱼、Visual Novel、Story、Combat 等特定链路按 tracking 中间件排除规则处理;作品游玩统一使用 `work_play_start`
外部 API 失败审计复用 `tracking_event`,不新增表。失败事件优先写入本机 tracking outbox,再由后台 worker 批量落库;如果 outbox 因权限、磁盘或保护阈值不可写,会回退同步直写 SpacetimeDB。`metadata_json` 包含 endpoint、operation、failureStage、statusCode、statusClass、timeout、retryable、errorMessage、latencyMs、promptChars、referenceImageCount、imageModel 和 rawExcerpt。常用查询:
```sql
SELECT event_id, scope_id AS provider, metadata_json, occurred_at
FROM tracking_event
WHERE event_key = 'external_api_call_failure'
ORDER BY occurred_at DESC
LIMIT 50;
```
tracking outbox 默认配置:
```env

1061
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -57,7 +57,11 @@
"check:data": "node scripts/run-tsx.cjs scripts/validate-content.ts",
"check:overrides": "node scripts/run-tsx.cjs scripts/validate-overrides.ts",
"check:smoke": "node scripts/run-tsx.cjs scripts/smoke-content.ts",
"check:content": "npm run check:data && npm run check:overrides && npm run check:smoke"
"check:content": "npm run check:data && npm run check:overrides && npm run check:smoke",
"codegraph:init": "codegraph init -i .",
"codegraph:index": "codegraph index .",
"codegraph:sync": "codegraph sync .",
"codegraph:status": "codegraph status ."
},
"dependencies": {
"@tailwindcss/vite": "^4.1.14",
@@ -73,6 +77,7 @@
"vite": "^6.2.0"
},
"devDependencies": {
"@colbymchenry/codegraph": "^0.8.0",
"@testing-library/react": "^16.3.2",
"@testing-library/user-event": "^14.6.1",
"@types/node": "^22.14.0",

View File

@@ -1049,6 +1049,7 @@ mod tests {
base_url: "https://vector.example".to_string(),
api_key: "secret".to_string(),
request_timeout_ms: 180_000,
external_api_audit_state: None,
});
assert_eq!(

View File

@@ -0,0 +1,372 @@
use axum::http::StatusCode;
use module_runtime::RuntimeTrackingScopeKind;
use serde_json::{Value, json};
use time::OffsetDateTime;
use uuid::Uuid;
use crate::{state::AppState, tracking::TrackingEventDraft};
pub(crate) const EXTERNAL_API_FAILURE_EVENT_KEY: &str = "external_api_call_failure";
pub(crate) const EXTERNAL_API_AUDIT_MODULE_KEY: &str = "external-api";
/// Description of one failed external (third-party) API call.
///
/// A draft is turned into an OTLP failure record and into a `tracking_event`
/// row by the functions in this module. Optional fields that are `None` are
/// left out of the persisted metadata.
#[derive(Clone, Debug)]
pub(crate) struct ExternalApiFailureDraft {
/// Provider identifier; used as the tracking event `scope_id`, the
/// `provider` metadata value and the provider argument of the OTLP record.
pub(crate) provider: &'static str,
/// Endpoint that was called; stored as `endpoint` in the metadata.
pub(crate) endpoint: String,
/// Caller-supplied description of the operation; stored as `operation`.
pub(crate) operation: String,
/// Stage at which the call failed; stored as `failureStage`.
pub(crate) failure_stage: &'static str,
/// HTTP status code, when one was received.
pub(crate) status_code: Option<u16>,
/// Explicit status class. When `None`, the class is derived from
/// `status_code` (a missing status code maps to `"transport"`).
pub(crate) status_class: Option<&'static str>,
/// Whether the failure was a timeout.
pub(crate) timeout: bool,
/// Whether the failure is considered retryable.
pub(crate) retryable: bool,
/// Error text; truncated to 1000 characters when written to metadata.
pub(crate) error_message: String,
/// Underlying error source text; trimmed, skipped when blank and truncated
/// to 1000 characters when written to metadata as `errorSource`.
pub(crate) error_source: Option<String>,
/// Excerpt of the raw response; trimmed, skipped when blank and truncated
/// to 800 characters when written to metadata as `rawExcerpt`.
pub(crate) raw_excerpt: Option<String>,
/// Latency in milliseconds; stored as `latencyMs` when present.
pub(crate) latency_ms: Option<u64>,
/// Prompt length in characters; stored as `promptChars` when present.
pub(crate) prompt_chars: Option<usize>,
/// Number of reference images; stored as `referenceImageCount` when present.
pub(crate) reference_image_count: Option<usize>,
/// Image model name; stored as `imageModel` when present.
pub(crate) image_model: Option<&'static str>,
}
impl ExternalApiFailureDraft {
    /// Creates a draft with the mandatory fields set.
    ///
    /// Every optional field starts as `None`, and `timeout` / `retryable`
    /// start as `false`; use the `with_*` builders to fill them in.
    pub(crate) fn new(
        provider: &'static str,
        endpoint: impl Into<String>,
        operation: impl Into<String>,
        failure_stage: &'static str,
        error_message: impl Into<String>,
    ) -> Self {
        let endpoint = endpoint.into();
        let operation = operation.into();
        let error_message = error_message.into();
        Self {
            provider,
            endpoint,
            operation,
            failure_stage,
            error_message,
            status_code: None,
            status_class: None,
            timeout: false,
            retryable: false,
            error_source: None,
            raw_excerpt: None,
            latency_ms: None,
            prompt_chars: None,
            reference_image_count: None,
            image_model: None,
        }
    }

    /// Sets the HTTP status code, if any.
    pub(crate) fn with_status_code(self, status_code: Option<u16>) -> Self {
        Self {
            status_code,
            ..self
        }
    }

    /// Sets an explicit status class that takes precedence over the class
    /// derived from the status code.
    pub(crate) fn with_optional_status_class(self, status_class: Option<&'static str>) -> Self {
        Self {
            status_class,
            ..self
        }
    }

    /// Marks whether the failure was a timeout.
    pub(crate) fn with_timeout(self, timeout: bool) -> Self {
        Self { timeout, ..self }
    }

    /// Marks whether the failure is retryable.
    pub(crate) fn with_retryable(self, retryable: bool) -> Self {
        Self { retryable, ..self }
    }

    /// Sets the underlying error source text.
    pub(crate) fn with_error_source(self, error_source: Option<String>) -> Self {
        Self {
            error_source,
            ..self
        }
    }

    /// Sets the raw response excerpt.
    pub(crate) fn with_raw_excerpt(self, raw_excerpt: Option<String>) -> Self {
        Self {
            raw_excerpt,
            ..self
        }
    }

    /// Sets the latency in milliseconds.
    pub(crate) fn with_latency_ms(self, latency_ms: Option<u64>) -> Self {
        Self { latency_ms, ..self }
    }

    /// Sets the prompt length in characters.
    pub(crate) fn with_prompt_chars(self, prompt_chars: Option<usize>) -> Self {
        Self {
            prompt_chars,
            ..self
        }
    }

    /// Sets the number of reference images.
    pub(crate) fn with_reference_image_count(self, reference_image_count: Option<usize>) -> Self {
        Self {
            reference_image_count,
            ..self
        }
    }

    /// Sets the image model name.
    pub(crate) fn with_image_model(self, image_model: Option<&'static str>) -> Self {
        Self {
            image_model,
            ..self
        }
    }
}
/// Classifies an application-side HTTP status explicitly (for failures such as
/// image download or OSS read/write that carry a non-standard status), so the
/// low-cardinality OTLP label does not fall back to `transport` by mistake.
pub(crate) fn app_error_status_class(status_code: StatusCode) -> &'static str {
    let numeric_status = status_code.as_u16();
    status_class(Some(numeric_status))
}
/// 中文注释:外部供应商失败同时进入 OTLP 和 tracking_event失败审计不能反向阻断主业务错误返回。
pub(crate) async fn record_external_api_failure(state: &AppState, draft: ExternalApiFailureDraft) {
record_external_api_failure_otlp(&draft);
let tracking_event = build_external_api_failure_tracking_draft(&draft);
if let Some(outbox) = state.tracking_outbox() {
match outbox
.enqueue(crate::tracking::build_tracking_event_input(
tracking_event.clone(),
))
.await
{
Ok(crate::tracking_outbox::TrackingOutboxEnqueueOutcome::Enqueued) => {}
Ok(crate::tracking_outbox::TrackingOutboxEnqueueOutcome::Dropped { reason }) => {
tracing::warn!(
provider = draft.provider,
endpoint = %draft.endpoint,
operation = %draft.operation,
failure_stage = draft.failure_stage,
reason,
"外部 API 失败审计写入 outbox 被保护阈值拒绝,回退同步直写 SpacetimeDB"
);
crate::tracking::record_tracking_event_after_success(
state,
&audit_request_context(),
tracking_event,
)
.await;
}
Err(error) => {
tracing::warn!(
provider = draft.provider,
endpoint = %draft.endpoint,
operation = %draft.operation,
failure_stage = draft.failure_stage,
error = %error,
"外部 API 失败审计写入 outbox 失败,回退同步直写 SpacetimeDB"
);
crate::tracking::record_tracking_event_after_success(
state,
&audit_request_context(),
tracking_event,
)
.await;
}
}
return;
}
crate::tracking::record_tracking_event_after_success(
state,
&audit_request_context(),
tracking_event,
)
.await;
}
/// Converts a failure draft into a tracking event draft.
///
/// The event uses the `external_api_call_failure` key under the
/// `external-api` module, is scoped to `Module` with the provider as scope id,
/// and carries the failure details as metadata.
pub(crate) fn build_external_api_failure_tracking_draft(
    failure: &ExternalApiFailureDraft,
) -> TrackingEventDraft {
    let metadata = build_external_api_failure_metadata(failure);
    let mut tracking = TrackingEventDraft::new(
        EXTERNAL_API_FAILURE_EVENT_KEY,
        EXTERNAL_API_AUDIT_MODULE_KEY,
    );
    tracking.scope_kind = RuntimeTrackingScopeKind::Module;
    tracking.scope_id = failure.provider.to_owned();
    tracking.metadata = metadata;
    tracking
}
fn build_external_api_failure_metadata(failure: &ExternalApiFailureDraft) -> Value {
let mut metadata = json!({
"provider": failure.provider,
"endpoint": failure.endpoint,
"operation": failure.operation,
"failureStage": failure.failure_stage,
"statusCode": failure.status_code,
"statusClass": failure.status_class.unwrap_or_else(|| status_class(failure.status_code)),
"timeout": failure.timeout,
"retryable": failure.retryable,
"errorMessage": truncate_field(failure.error_message.as_str(), 1_000),
"occurredAt": current_utc_iso_text(),
});
if let Some(latency_ms) = failure.latency_ms {
metadata["latencyMs"] = json!(latency_ms);
}
if let Some(prompt_chars) = failure.prompt_chars {
metadata["promptChars"] = json!(prompt_chars);
}
if let Some(reference_image_count) = failure.reference_image_count {
metadata["referenceImageCount"] = json!(reference_image_count);
}
if let Some(image_model) = failure.image_model {
metadata["imageModel"] = json!(image_model);
}
if let Some(source) = failure
.error_source
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
{
metadata["errorSource"] = json!(truncate_field(source, 1_000));
}
if let Some(excerpt) = failure
.raw_excerpt
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
{
metadata["rawExcerpt"] = json!(truncate_field(excerpt, 800));
}
metadata
}
/// Decides whether a failed external call is worth retrying.
///
/// Timeouts and connection failures are always retryable. Otherwise the
/// status code decides: 429, 408 and anything from 500 upwards are retryable;
/// every other code, or no code at all, is not.
pub(crate) fn is_retryable_external_api_failure(
    status_code: Option<u16>,
    timeout: bool,
    connect: bool,
) -> bool {
    if timeout || connect {
        return true;
    }
    match status_code {
        Some(status) => {
            status >= 500
                || status == StatusCode::TOO_MANY_REQUESTS.as_u16()
                || status == StatusCode::REQUEST_TIMEOUT.as_u16()
        }
        None => false,
    }
}
/// Emits the telemetry side of a failure: one call to
/// `telemetry::record_external_api_failure` followed by an error-level log
/// event carrying the draft's fields.
fn record_external_api_failure_otlp(failure: &ExternalApiFailureDraft) {
    // Resolve the class once: an explicit class on the draft wins, otherwise it
    // is derived from the status code.
    let status_label = failure
        .status_class
        .unwrap_or_else(|| status_class(failure.status_code));

    crate::telemetry::record_external_api_failure(
        failure.provider,
        failure.failure_stage,
        status_label,
        failure.retryable,
    );

    tracing::error!(
        provider = failure.provider,
        endpoint = %failure.endpoint,
        operation = %failure.operation,
        failure_stage = failure.failure_stage,
        status_code = failure.status_code,
        status_class = status_label,
        timeout = failure.timeout,
        retryable = failure.retryable,
        latency_ms = failure.latency_ms,
        prompt_chars = failure.prompt_chars,
        reference_image_count = failure.reference_image_count,
        image_model = failure.image_model,
        error = %failure.error_message,
        "外部 API 调用失败"
    );
}
/// Maps an optional HTTP status code to a low-cardinality class label.
///
/// Returns `"1xx"` to `"5xx"` for codes 100-599, `"unknown"` for any other
/// code, and `"transport"` when no status code is available.
fn status_class(status_code: Option<u16>) -> &'static str {
    // Integer division by 100 yields the leading digit for codes 100-599;
    // anything below 100 or above 599 lands in the catch-all arm.
    match status_code.map(|code| code / 100) {
        None => "transport",
        Some(1) => "1xx",
        Some(2) => "2xx",
        Some(3) => "3xx",
        Some(4) => "4xx",
        Some(5) => "5xx",
        Some(_) => "unknown",
    }
}
fn audit_request_context() -> crate::request_context::RequestContext {
crate::request_context::RequestContext::new(
format!("external-api-audit-{}", Uuid::new_v4()),
"external-api audit".to_string(),
std::time::Duration::ZERO,
false,
)
}
/// Returns at most the first `max_chars` characters of `value`.
///
/// The limit counts Unicode scalar values, not bytes, so multi-byte text is
/// never cut in the middle of a character.
fn truncate_field(value: &str, max_chars: usize) -> String {
    // The byte offset of the first character past the limit is the cut point;
    // if there is no such character, the whole string already fits.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
/// Formats the current UTC time with `shared_kernel::format_rfc3339`, falling
/// back to the Unix epoch text when formatting fails.
fn current_utc_iso_text() -> String {
    let now = OffsetDateTime::now_utc();
    match shared_kernel::format_rfc3339(now) {
        Ok(text) => text,
        Err(_) => "1970-01-01T00:00:00Z".to_string(),
    }
}
#[cfg(test)]
mod tests {
    use serde_json::Value;

    use super::*;

    /// A fully populated draft maps to a module-scoped tracking event whose
    /// metadata mirrors the draft.
    #[test]
    fn external_api_failure_tracking_draft_uses_module_scope_and_safe_metadata() {
        let draft = build_external_api_failure_tracking_draft(
            &ExternalApiFailureDraft::new(
                "vector-engine",
                "https://vector.example/v1/images/generations",
                "拼图 UI 背景图生成失败",
                "upstream_status",
                "上游 429",
            )
            .with_status_code(Some(429))
            .with_retryable(true)
            .with_latency_ms(Some(1234))
            .with_prompt_chars(Some(88))
            .with_reference_image_count(Some(2))
            .with_image_model(Some("gpt-image-2-all")),
        );

        assert_eq!(draft.event_key, EXTERNAL_API_FAILURE_EVENT_KEY);
        assert_eq!(draft.scope_kind, RuntimeTrackingScopeKind::Module);
        assert_eq!(draft.scope_id, "vector-engine");
        assert_eq!(draft.module_key, Some(EXTERNAL_API_AUDIT_MODULE_KEY));

        let metadata = draft.metadata;
        assert_eq!(metadata["provider"], "vector-engine");
        assert_eq!(metadata["statusCode"], 429);
        assert_eq!(metadata["statusClass"], "4xx");
        assert_eq!(metadata["retryable"], true);
        assert_eq!(metadata["latencyMs"], 1234);
        assert_eq!(metadata["promptChars"], 88);
        assert_eq!(metadata["referenceImageCount"], 2);
        assert_eq!(metadata["imageModel"], "gpt-image-2-all");
        assert!(matches!(metadata["occurredAt"], Value::String(_)));
    }

    /// Transport-level and overload failures are retryable; client errors are not.
    #[test]
    fn retryable_classification_keeps_transport_and_overload_failures_actionable() {
        assert!(is_retryable_external_api_failure(None, true, false));
        assert!(is_retryable_external_api_failure(None, false, true));
        assert!(is_retryable_external_api_failure(Some(429), false, false));
        assert!(is_retryable_external_api_failure(Some(502), false, false));
        assert!(!is_retryable_external_api_failure(Some(400), false, false));
    }

    /// Covers the 408 branch, the lower 5xx boundary and the non-retryable
    /// cases that the test above leaves out.
    #[test]
    fn retryable_classification_covers_request_timeout_and_plain_failures() {
        assert!(is_retryable_external_api_failure(Some(408), false, false));
        assert!(is_retryable_external_api_failure(Some(500), false, false));
        assert!(!is_retryable_external_api_failure(Some(404), false, false));
        assert!(!is_retryable_external_api_failure(None, false, false));
    }

    /// An explicit status class wins over the class derived from the status code.
    #[test]
    fn app_error_status_class_can_override_successful_upstream_status() {
        let draft = build_external_api_failure_tracking_draft(
            &ExternalApiFailureDraft::new(
                "vector-engine",
                "https://cdn.example/generated.png",
                "下载生成图片",
                "image_download",
                "下载生成图片失败",
            )
            .with_status_code(Some(200))
            .with_optional_status_class(Some(app_error_status_class(StatusCode::BAD_GATEWAY))),
        );

        assert_eq!(draft.metadata["statusCode"], 200);
        assert_eq!(draft.metadata["statusClass"], "5xx");
    }

    /// A missing status code is a transport failure; out-of-range codes are unknown.
    #[test]
    fn status_class_falls_back_to_transport_or_unknown() {
        assert_eq!(status_class(None), "transport");
        assert_eq!(status_class(Some(99)), "unknown");
        assert_eq!(status_class(Some(600)), "unknown");
        assert_eq!(status_class(Some(302)), "3xx");
    }

    /// Truncation counts characters, so multi-byte text is never split.
    #[test]
    fn truncate_field_counts_characters_not_bytes() {
        assert_eq!(truncate_field("外部API失败", 2), "外部");
        assert_eq!(truncate_field("short", 10), "short");
        assert_eq!(truncate_field("short", 0), "");
    }

    /// Unset or blank optional fields are left out of the metadata and long
    /// text is capped at the documented lengths.
    #[test]
    fn metadata_omits_unset_optionals_and_truncates_long_text() {
        let draft = build_external_api_failure_tracking_draft(
            &ExternalApiFailureDraft::new(
                "vector-engine",
                "https://vector.example/v1/images/generations",
                "request",
                "request_send",
                "错".repeat(1_500),
            )
            .with_error_source(Some("   ".to_string()))
            .with_raw_excerpt(Some("x".repeat(900))),
        );

        let metadata = draft.metadata;
        assert!(metadata["statusCode"].is_null());
        assert_eq!(metadata["statusClass"], "transport");
        assert_eq!(
            metadata["errorMessage"]
                .as_str()
                .map(|text| text.chars().count()),
            Some(1_000)
        );
        assert_eq!(metadata["rawExcerpt"].as_str().map(str::len), Some(800));
        assert!(metadata.get("errorSource").is_none());
        assert!(metadata.get("latencyMs").is_none());
        assert!(metadata.get("promptChars").is_none());
        assert!(metadata.get("referenceImageCount").is_none());
        assert!(metadata.get("imageModel").is_none());
    }
}

View File

@@ -39,6 +39,7 @@ mod custom_world_rpg_draft_prompts;
mod edutainment_baby_drawing;
mod edutainment_baby_object;
mod error_middleware;
mod external_api_audit;
pub(crate) mod generated_asset_sheets;
mod generated_image_assets;
mod health;

View File

@@ -1,21 +1,44 @@
use std::time::Duration;
use std::{error::Error, time::Duration};
use axum::http::StatusCode;
use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64_STANDARD};
use reqwest::header;
use serde_json::{Map, Value, json};
use crate::{http_error::AppError, state::AppState};
use crate::{
external_api_audit::{
ExternalApiFailureDraft, app_error_status_class, is_retryable_external_api_failure,
record_external_api_failure,
},
http_error::AppError,
state::AppState,
};
pub(crate) const GPT_IMAGE_2_MODEL: &str = "gpt-image-2";
pub(crate) const VECTOR_ENGINE_GPT_IMAGE_2_MODEL: &str = "gpt-image-2-all";
const VECTOR_ENGINE_PROVIDER: &str = "vector-engine";
#[derive(Clone, Debug)]
#[derive(Clone)]
pub(crate) struct OpenAiImageSettings {
pub base_url: String,
pub api_key: String,
pub request_timeout_ms: u64,
pub external_api_audit_state: Option<AppState>,
}
impl std::fmt::Debug for OpenAiImageSettings {
    /// Hand-written `Debug` output: the API key is always printed as
    /// `<redacted>`, and the audit state is reduced to a boolean that says
    /// whether one is attached.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let audit_enabled = self.external_api_audit_state.is_some();

        let mut output = formatter.debug_struct("OpenAiImageSettings");
        output.field("base_url", &self.base_url);
        output.field("api_key", &"<redacted>");
        output.field("request_timeout_ms", &self.request_timeout_ms);
        output.field("external_api_audit_enabled", &audit_enabled);
        output.finish()
    }
}
#[derive(Clone, Debug)]
@@ -74,6 +97,7 @@ pub(crate) fn require_openai_image_settings(
base_url: base_url.to_string(),
api_key: api_key.to_string(),
request_timeout_ms: state.config.vector_engine_image_request_timeout_ms.max(1),
external_api_audit_state: Some(state.clone()),
})
}
@@ -103,15 +127,18 @@ pub(crate) async fn create_openai_image_generation(
reference_images: &[String],
failure_context: &str,
) -> Result<OpenAiGeneratedImages, AppError> {
let request_url = vector_engine_images_generation_url(settings);
let normalized_size = normalize_image_size(size);
let request_body = build_openai_image_request_body(
prompt,
negative_prompt,
size,
normalized_size.as_str(),
candidate_count,
reference_images,
);
let response = http_client
.post(vector_engine_images_generation_url(settings))
let started_at = std::time::Instant::now();
let response = match http_client
.post(request_url.as_str())
.header(
header::AUTHORIZATION,
format!("Bearer {}", settings.api_key),
@@ -121,16 +148,106 @@ pub(crate) async fn create_openai_image_generation(
.json(&request_body)
.send()
.await
.map_err(|error| {
map_openai_image_request_error(format!(
"{failure_context}:创建图片生成任务失败:{error}"
))
})?;
{
Ok(response) => response,
Err(error) => {
let latency_ms = started_at.elapsed().as_millis() as u64;
let timeout = error.is_timeout();
let connect = error.is_connect();
let source = error.source().map(ToString::to_string);
let message = format!("{failure_context}:创建图片生成任务失败:{error}");
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"request_send",
None,
None,
timeout,
connect,
message.as_str(),
source,
None,
Some(latency_ms),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
return Err(map_openai_image_reqwest_error(
format!("{failure_context}:创建图片生成任务失败").as_str(),
request_url.as_str(),
error,
));
}
};
let response_status = response.status();
let response_text = response.text().await.map_err(|error| {
map_openai_image_request_error(format!("{failure_context}:读取图片生成响应失败:{error}"))
})?;
tracing::info!(
provider = VECTOR_ENGINE_PROVIDER,
endpoint = %request_url,
status = response_status.as_u16(),
prompt_chars = prompt.chars().count(),
size = %normalized_size,
reference_image_count = reference_images.len(),
elapsed_ms = started_at.elapsed().as_millis() as u64,
failure_context,
"VectorEngine 图片生成 HTTP 返回"
);
let response_text = match response.text().await {
Ok(response_text) => response_text,
Err(error) => {
let latency_ms = started_at.elapsed().as_millis() as u64;
let timeout = error.is_timeout();
let connect = error.is_connect();
let source = error.source().map(ToString::to_string);
let message = format!("{failure_context}:读取图片生成响应失败:{error}");
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"response_body",
Some(response_status.as_u16()),
None,
timeout,
connect,
message.as_str(),
source,
None,
Some(latency_ms),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
return Err(map_openai_image_reqwest_error(
format!("{failure_context}:读取图片生成响应失败").as_str(),
request_url.as_str(),
error,
));
}
};
if !response_status.is_success() {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"upstream_status",
Some(response_status.as_u16()),
None,
false,
false,
parse_api_error_message(response_text.as_str(), failure_context).as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
return Err(map_openai_image_upstream_error(
response_status.as_u16(),
response_text.as_str(),
@@ -138,26 +255,114 @@ pub(crate) async fn create_openai_image_generation(
));
}
let response_json = parse_json_payload(response_text.as_str(), failure_context)?;
let response_json = match parse_json_payload(response_text.as_str(), failure_context) {
Ok(response_json) => response_json,
Err(error) => {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"response_parse",
Some(response_status.as_u16()),
None,
false,
false,
error.body_text().as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
return Err(error);
}
};
let generation_id = extract_generation_id(&response_json.payload)
.unwrap_or_else(|| format!("vector-engine-{}", current_utc_micros()));
let actual_prompt = find_first_string_by_key(&response_json.payload, "revised_prompt")
.or_else(|| find_first_string_by_key(&response_json.payload, "actual_prompt"));
let image_urls = extract_image_urls(&response_json.payload);
if !image_urls.is_empty() {
let mut generated =
download_images_from_urls(http_client, generation_id, image_urls, candidate_count)
.await?;
let download_started_at = std::time::Instant::now();
let mut generated = match download_images_from_urls(
http_client,
generation_id,
image_urls,
candidate_count,
)
.await
{
Ok(generated) => generated,
Err(error) => {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"image_download",
Some(response_status.as_u16()),
Some(app_error_status_class(error.status_code())),
false,
false,
error.body_text().as_str(),
None,
None,
Some(download_started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
return Err(error);
}
};
generated.actual_prompt = actual_prompt;
tracing::info!(
provider = VECTOR_ENGINE_PROVIDER,
endpoint = %request_url,
image_count = generated.images.len(),
elapsed_ms = download_started_at.elapsed().as_millis() as u64,
failure_context,
"VectorEngine 图片下载完成"
);
return Ok(generated);
}
let b64_images = extract_b64_images(&response_json.payload);
if !b64_images.is_empty() {
let mut generated = images_from_base64(generation_id, b64_images, candidate_count);
generated.actual_prompt = actual_prompt;
tracing::info!(
provider = VECTOR_ENGINE_PROVIDER,
endpoint = %request_url,
image_count = generated.images.len(),
failure_context,
"VectorEngine 图片 base64 解码完成"
);
return Ok(generated);
}
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"missing_image",
Some(response_status.as_u16()),
None,
false,
false,
format!("{failure_context}VectorEngine 未返回图片地址").as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
Err(
AppError::from_status(StatusCode::BAD_GATEWAY).with_details(json!({
"provider": VECTOR_ENGINE_PROVIDER,
@@ -176,6 +381,8 @@ pub(crate) async fn create_openai_image_edit(
failure_context: &str,
) -> Result<OpenAiGeneratedImages, AppError> {
let task_id = format!("vector-engine-edit-{}", current_utc_micros());
let request_url = vector_engine_images_edit_url(settings);
let normalized_size = normalize_image_size(size);
let image_part = reqwest::multipart::Part::bytes(reference_image.bytes.clone())
.file_name(reference_image.file_name.clone())
.mime_str(reference_image.mime_type.as_str())
@@ -190,9 +397,10 @@ pub(crate) async fn create_openai_image_edit(
build_prompt_with_negative(prompt, negative_prompt),
)
.text("n", "1")
.text("size", normalize_image_size(size));
let response = http_client
.post(vector_engine_images_edit_url(settings).as_str())
.text("size", normalized_size.clone());
let started_at = std::time::Instant::now();
let response = match http_client
.post(request_url.as_str())
.header(
header::AUTHORIZATION,
format!("Bearer {}", settings.api_key),
@@ -201,16 +409,106 @@ pub(crate) async fn create_openai_image_edit(
.multipart(form)
.send()
.await
.map_err(|error| {
map_openai_image_request_error(format!(
"{failure_context}:创建图片编辑任务失败:{error}"
))
})?;
{
Ok(response) => response,
Err(error) => {
let latency_ms = started_at.elapsed().as_millis() as u64;
let timeout = error.is_timeout();
let connect = error.is_connect();
let source = error.source().map(ToString::to_string);
let message = format!("{failure_context}:创建图片编辑任务失败:{error}");
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"request_send",
None,
None,
timeout,
connect,
message.as_str(),
source,
None,
Some(latency_ms),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
return Err(map_openai_image_reqwest_error(
format!("{failure_context}:创建图片编辑任务失败").as_str(),
request_url.as_str(),
error,
));
}
};
let response_status = response.status();
let response_text = response.text().await.map_err(|error| {
map_openai_image_request_error(format!("{failure_context}:读取图片编辑响应失败:{error}"))
})?;
tracing::info!(
provider = VECTOR_ENGINE_PROVIDER,
endpoint = %request_url,
status = response_status.as_u16(),
prompt_chars = prompt.chars().count(),
size = %normalized_size,
reference_image_count = 1usize,
elapsed_ms = started_at.elapsed().as_millis() as u64,
failure_context,
"VectorEngine 图片编辑 HTTP 返回"
);
let response_text = match response.text().await {
Ok(response_text) => response_text,
Err(error) => {
let latency_ms = started_at.elapsed().as_millis() as u64;
let timeout = error.is_timeout();
let connect = error.is_connect();
let source = error.source().map(ToString::to_string);
let message = format!("{failure_context}:读取图片编辑响应失败:{error}");
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"response_body",
Some(response_status.as_u16()),
None,
timeout,
connect,
message.as_str(),
source,
None,
Some(latency_ms),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
return Err(map_openai_image_reqwest_error(
format!("{failure_context}:读取图片编辑响应失败").as_str(),
request_url.as_str(),
error,
));
}
};
if !response_status.is_success() {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"upstream_status",
Some(response_status.as_u16()),
None,
false,
false,
parse_api_error_message(response_text.as_str(), failure_context).as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
return Err(map_openai_image_upstream_error(
response_status.as_u16(),
response_text.as_str(),
@@ -218,12 +516,62 @@ pub(crate) async fn create_openai_image_edit(
));
}
let response_json = parse_json_payload(response_text.as_str(), failure_context)?;
let response_json = match parse_json_payload(response_text.as_str(), failure_context) {
Ok(response_json) => response_json,
Err(error) => {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"response_parse",
Some(response_status.as_u16()),
None,
false,
false,
error.body_text().as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
return Err(error);
}
};
let actual_prompt = find_first_string_by_key(&response_json.payload, "revised_prompt")
.or_else(|| find_first_string_by_key(&response_json.payload, "actual_prompt"));
let image_urls = extract_image_urls(&response_json.payload);
if !image_urls.is_empty() {
let mut generated = download_images_from_urls(http_client, task_id, image_urls, 1).await?;
let download_started_at = std::time::Instant::now();
let mut generated =
match download_images_from_urls(http_client, task_id, image_urls, 1).await {
Ok(generated) => generated,
Err(error) => {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"image_download",
Some(response_status.as_u16()),
Some(app_error_status_class(error.status_code())),
false,
false,
error.body_text().as_str(),
None,
None,
Some(download_started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
return Err(error);
}
};
generated.actual_prompt = actual_prompt;
return Ok(generated);
}
@@ -234,6 +582,25 @@ pub(crate) async fn create_openai_image_edit(
return Ok(generated);
}
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"missing_image",
Some(response_status.as_u16()),
None,
false,
false,
format!("{failure_context}VectorEngine 未返回编辑图片").as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
Err(
AppError::from_status(StatusCode::BAD_GATEWAY).with_details(json!({
"provider": VECTOR_ENGINE_PROVIDER,
@@ -402,6 +769,44 @@ fn map_openai_image_request_error(message: String) -> AppError {
}))
}
/// Converts a `reqwest::Error` raised while calling VectorEngine into an [`AppError`].
///
/// * `context` – human-readable prefix describing what was being attempted. Callers pass
///   it without a trailing separator, so it is joined to the error text with a full-width
///   colon, matching the audit messages built alongside it.
/// * `request_url` – endpoint that was being called; logged and echoed in the details.
/// * `error` – the transport error; consumed here.
///
/// Timeouts map to `504 Gateway Timeout`; every other failure maps to `502 Bad Gateway`.
/// The failure is logged at `warn` level, and the same diagnostic flags (`timeout`,
/// `connect`, `request`, `body`, `source`) are attached to the returned error's details.
fn map_openai_image_reqwest_error(
    context: &str,
    request_url: &str,
    error: reqwest::Error,
) -> AppError {
    // Classify once so the log line and the error details always agree.
    let is_timeout = error.is_timeout();
    let is_connect = error.is_connect();
    let is_request = error.is_request();
    let is_body = error.is_body();
    let source = error.source().map(ToString::to_string).unwrap_or_default();
    // Previously `{context}{error}` ran the two parts together with no separator.
    let message = format!("{context}:{error}");
    let status = if is_timeout {
        StatusCode::GATEWAY_TIMEOUT
    } else {
        StatusCode::BAD_GATEWAY
    };
    tracing::warn!(
        provider = VECTOR_ENGINE_PROVIDER,
        endpoint = %request_url,
        timeout = is_timeout,
        connect = is_connect,
        request = is_request,
        body = is_body,
        source = %source,
        message = %message,
        "VectorEngine 图片请求发送失败"
    );
    AppError::from_status(status).with_details(json!({
        "provider": VECTOR_ENGINE_PROVIDER,
        "message": message,
        "endpoint": request_url,
        "timeout": is_timeout,
        "connect": is_connect,
        "request": is_request,
        "body": is_body,
        "source": source,
    }))
}
fn map_openai_image_upstream_error(
upstream_status: u16,
raw_text: &str,
@@ -423,6 +828,53 @@ fn map_openai_image_upstream_error(
}))
}
/// Forwards a failure draft to `record_external_api_failure` when auditing is configured.
///
/// Does nothing when `settings.external_api_audit_state` is `None`.
async fn record_openai_image_failure_if_configured(
    settings: &OpenAiImageSettings,
    draft: ExternalApiFailureDraft,
) {
    let Some(audit_state) = settings.external_api_audit_state.as_ref() else {
        return;
    };
    record_external_api_failure(audit_state, draft).await;
}
/// Assembles an [`ExternalApiFailureDraft`] describing one failed VectorEngine image call.
///
/// The provider is always `VECTOR_ENGINE_PROVIDER` and the image model is always
/// `VECTOR_ENGINE_GPT_IMAGE_2_MODEL`. The retryable flag is derived from `status_code`,
/// `timeout` and `connect` via `is_retryable_external_api_failure`; `connect` is used only
/// for that decision. All other arguments are copied onto the draft unchanged.
fn build_openai_image_failure_audit_draft(
    request_url: &str,
    failure_context: &str,
    failure_stage: &'static str,
    status_code: Option<u16>,
    status_class: Option<&'static str>,
    timeout: bool,
    connect: bool,
    error_message: &str,
    error_source: Option<String>,
    raw_excerpt: Option<String>,
    latency_ms: Option<u64>,
    prompt_chars: Option<usize>,
    reference_image_count: Option<usize>,
) -> ExternalApiFailureDraft {
    let retryable = is_retryable_external_api_failure(status_code, timeout, connect);

    let base_draft = ExternalApiFailureDraft::new(
        VECTOR_ENGINE_PROVIDER,
        request_url.to_string(),
        failure_context.to_string(),
        failure_stage,
        error_message.to_string(),
    );

    base_draft
        .with_status_code(status_code)
        .with_optional_status_class(status_class)
        .with_timeout(timeout)
        .with_retryable(retryable)
        .with_error_source(error_source)
        .with_raw_excerpt(raw_excerpt)
        .with_latency_ms(latency_ms)
        .with_prompt_chars(prompt_chars)
        .with_reference_image_count(reference_image_count)
        .with_image_model(Some(VECTOR_ENGINE_GPT_IMAGE_2_MODEL))
}
fn parse_api_error_message(raw_text: &str, fallback_message: &str) -> String {
if raw_text.trim().is_empty() {
return fallback_message.to_string();
@@ -629,11 +1081,13 @@ mod tests {
base_url: "https://vector.example".to_string(),
api_key: "test-key".to_string(),
request_timeout_ms: 1_000_000,
external_api_audit_state: None,
};
let v1_settings = OpenAiImageSettings {
base_url: "https://vector.example/v1".to_string(),
api_key: "test-key".to_string(),
request_timeout_ms: 1_000_000,
external_api_audit_state: None,
};
assert_eq!(
@@ -658,4 +1112,41 @@ mod tests {
assert_eq!(images.images[0].mime_type, "image/png");
assert_eq!(images.images[0].extension, "png");
}
/// An upstream 429 must yield a tracking draft carrying the provider, status, stage,
/// retryable flag, prompt size, reference-image count and image model.
#[test]
fn vector_engine_upstream_failure_builds_tracking_ready_audit_event() {
    let endpoint = "https://vector.example/v1/images/generations";
    let upstream_body = "{\"error\":\"rate limited\"}".to_string();

    let audit = build_openai_image_failure_audit_draft(
        endpoint,
        "拼图 UI 背景图生成失败",
        "upstream_status",
        Some(429),
        None,
        false,
        false,
        "上游限流",
        None,
        Some(upstream_body),
        Some(321),
        Some(42),
        Some(1),
    );

    let tracking = crate::external_api_audit::build_external_api_failure_tracking_draft(&audit);

    // Envelope fields.
    assert_eq!(
        tracking.event_key,
        crate::external_api_audit::EXTERNAL_API_FAILURE_EVENT_KEY
    );
    assert_eq!(tracking.scope_id, VECTOR_ENGINE_PROVIDER);

    // Metadata payload.
    let metadata = &tracking.metadata;
    assert_eq!(metadata["provider"], VECTOR_ENGINE_PROVIDER);
    assert_eq!(metadata["statusCode"], 429);
    assert_eq!(metadata["statusClass"], "4xx");
    assert_eq!(metadata["failureStage"], "upstream_status");
    assert_eq!(metadata["retryable"], true);
    assert_eq!(metadata["promptChars"], 42);
    assert_eq!(metadata["referenceImageCount"], 1);
    assert_eq!(metadata["imageModel"], VECTOR_ENGINE_GPT_IMAGE_2_MODEL);
}
}

View File

@@ -172,6 +172,23 @@ pub(crate) fn update_tracking_outbox_pending_files(files: usize) {
TRACKING_OUTBOX_PENDING_FILES.store(files.min(i64::MAX as usize) as i64, Ordering::Relaxed);
}
/// Increments the external-API failure counter by one.
///
/// The data point is tagged with the `provider`, `failure_stage`, `status_class` and
/// `retryable` attributes supplied by the caller.
pub(crate) fn record_external_api_failure(
    provider: &'static str,
    failure_stage: &'static str,
    status_class: &'static str,
    retryable: bool,
) {
    let metrics = external_api_metrics();
    let attributes = [
        KeyValue::new("provider", provider),
        KeyValue::new("failure_stage", failure_stage),
        KeyValue::new("status_class", status_class),
        KeyValue::new("retryable", retryable),
    ];
    metrics.failures.add(1, &attributes);
}
fn track_response_body_in_flight(response: Response<Body>) -> Response<Body> {
response.map(|body| {
HTTP_RESPONSE_BODY_IN_FLIGHT.fetch_add(1, Ordering::Relaxed);
@@ -211,6 +228,10 @@ struct TrackingOutboxMetrics {
flushed_bytes: Counter<u64>,
}
/// Metric instruments for calls to external APIs; built once by `external_api_metrics`.
struct ExternalApiMetrics {
/// Counter incremented by `record_external_api_failure` for each recorded failure.
failures: Counter<u64>,
}
struct HttpRequestPermitsAvailableGauges {
default: Arc<AtomicI64>,
gallery: Arc<AtomicI64>,
@@ -359,6 +380,21 @@ fn tracking_outbox_metrics() -> &'static TrackingOutboxMetrics {
})
}
/// Returns the process-wide [`ExternalApiMetrics`], creating the instruments on first use.
///
/// The counter is registered on the `genarrative-api` meter under the name
/// `genarrative.external_api.failures`.
fn external_api_metrics() -> &'static ExternalApiMetrics {
    static METRICS: std::sync::OnceLock<ExternalApiMetrics> = std::sync::OnceLock::new();

    /// Builds the instruments; passed to `get_or_init` as the initializer.
    fn build_metrics() -> ExternalApiMetrics {
        let failures = global::meter("genarrative-api")
            .u64_counter("genarrative.external_api.failures")
            .with_description("External API call failures grouped by provider and failure stage")
            .build();
        ExternalApiMetrics { failures }
    }

    METRICS.get_or_init(build_metrics)
}
fn register_http_request_permits_available_metric() -> HttpRequestPermitsAvailableGauges {
let gauges = HttpRequestPermitsAvailableGauges::new();
let meter = global::meter("genarrative-api");

View File

@@ -584,6 +584,26 @@ async fn record_route_tracking_event_via_outbox_after_success(
record_tracking_event_input_after_success(state, request_context, event).await;
}
/// Turns a [`TrackingEventDraft`] into a `RuntimeTrackingEventInput`.
///
/// The event is stamped with the current UTC time in microseconds. That timestamp is also
/// passed to `build_tracking_event_id` to derive the event id.
pub(crate) fn build_tracking_event_input(
    draft: TrackingEventDraft,
) -> module_runtime::RuntimeTrackingEventInput {
    let now_micros = OffsetDateTime::now_utc().unix_timestamp_nanos() / 1_000;
    // The id builder borrows the draft, so it must run before any field is moved out.
    let event_id = build_tracking_event_id(&draft, now_micros);

    let event_key = draft.event_key.to_string();
    let module_key = draft.module_key.map(str::to_string);
    let metadata_json = draft.metadata.to_string();

    module_runtime::RuntimeTrackingEventInput {
        event_id,
        event_key,
        scope_kind: draft.scope_kind,
        scope_id: draft.scope_id,
        user_id: draft.user_id,
        owner_user_id: draft.owner_user_id,
        profile_id: draft.profile_id,
        module_key,
        metadata_json,
        occurred_at_micros: now_micros as i64,
    }
}
async fn record_tracking_event_input_after_success(
state: &AppState,
request_context: &RequestContext,
@@ -642,26 +662,6 @@ async fn record_tracking_event_input_after_success(
}
}
/// Converts a tracking draft into the runtime input record.
///
/// The occurrence time is the current UTC instant truncated to microseconds; the event id
/// comes from `build_tracking_event_id` using the draft and that same timestamp.
fn build_tracking_event_input(
    draft: TrackingEventDraft,
) -> module_runtime::RuntimeTrackingEventInput {
    let now = OffsetDateTime::now_utc();
    let occurred_at_micros = now.unix_timestamp_nanos() / 1_000;
    let event_id = build_tracking_event_id(&draft, occurred_at_micros);

    // Optional `&str` module key becomes an owned `String` for the runtime record.
    let module_key = draft.module_key.map(str::to_string);

    module_runtime::RuntimeTrackingEventInput {
        event_id,
        event_key: draft.event_key.to_string(),
        scope_kind: draft.scope_kind,
        scope_id: draft.scope_id,
        user_id: draft.user_id,
        owner_user_id: draft.owner_user_id,
        profile_id: draft.profile_id,
        module_key,
        metadata_json: draft.metadata.to_string(),
        occurred_at_micros: occurred_at_micros as i64,
    }
}
fn build_tracking_event_id(draft: &TrackingEventDraft, occurred_at_micros: i128) -> String {
if draft.event_key == "daily_login"
&& draft.scope_kind == RuntimeTrackingScopeKind::User