feat(api-server): audit external api failures
This commit is contained in:
16
.codegraph/.gitignore
vendored
Normal file
16
.codegraph/.gitignore
vendored
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# CodeGraph data files
|
||||||
|
# These are local to each machine and should not be committed
|
||||||
|
|
||||||
|
# Database
|
||||||
|
*.db
|
||||||
|
*.db-wal
|
||||||
|
*.db-shm
|
||||||
|
|
||||||
|
# Cache
|
||||||
|
cache/
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# Hook markers
|
||||||
|
.dirty
|
||||||
143
.codegraph/config.json
Normal file
143
.codegraph/config.json
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
{
|
||||||
|
"version": 1,
|
||||||
|
"include": [
|
||||||
|
"**/*.ts",
|
||||||
|
"**/*.tsx",
|
||||||
|
"**/*.js",
|
||||||
|
"**/*.jsx",
|
||||||
|
"**/*.py",
|
||||||
|
"**/*.go",
|
||||||
|
"**/*.rs",
|
||||||
|
"**/*.java",
|
||||||
|
"**/*.c",
|
||||||
|
"**/*.h",
|
||||||
|
"**/*.cpp",
|
||||||
|
"**/*.hpp",
|
||||||
|
"**/*.cc",
|
||||||
|
"**/*.cxx",
|
||||||
|
"**/*.cs",
|
||||||
|
"**/*.php",
|
||||||
|
"**/*.rb",
|
||||||
|
"**/*.swift",
|
||||||
|
"**/*.kt",
|
||||||
|
"**/*.kts",
|
||||||
|
"**/*.dart",
|
||||||
|
"**/*.svelte",
|
||||||
|
"**/*.vue",
|
||||||
|
"**/*.liquid",
|
||||||
|
"**/*.pas",
|
||||||
|
"**/*.dpr",
|
||||||
|
"**/*.dpk",
|
||||||
|
"**/*.lpr",
|
||||||
|
"**/*.dfm",
|
||||||
|
"**/*.fmx",
|
||||||
|
"**/*.scala",
|
||||||
|
"**/*.sc"
|
||||||
|
],
|
||||||
|
"exclude": [
|
||||||
|
"**/.git/**",
|
||||||
|
"**/node_modules/**",
|
||||||
|
"**/vendor/**",
|
||||||
|
"**/Pods/**",
|
||||||
|
"**/dist/**",
|
||||||
|
"**/build/**",
|
||||||
|
"**/out/**",
|
||||||
|
"**/bin/**",
|
||||||
|
"**/obj/**",
|
||||||
|
"**/target/**",
|
||||||
|
"**/*.min.js",
|
||||||
|
"**/*.bundle.js",
|
||||||
|
"**/.next/**",
|
||||||
|
"**/.nuxt/**",
|
||||||
|
"**/.svelte-kit/**",
|
||||||
|
"**/.output/**",
|
||||||
|
"**/.turbo/**",
|
||||||
|
"**/.cache/**",
|
||||||
|
"**/.parcel-cache/**",
|
||||||
|
"**/.vite/**",
|
||||||
|
"**/.astro/**",
|
||||||
|
"**/.docusaurus/**",
|
||||||
|
"**/.gatsby/**",
|
||||||
|
"**/.webpack/**",
|
||||||
|
"**/.nx/**",
|
||||||
|
"**/.yarn/cache/**",
|
||||||
|
"**/.pnpm-store/**",
|
||||||
|
"**/storybook-static/**",
|
||||||
|
"**/.expo/**",
|
||||||
|
"**/web-build/**",
|
||||||
|
"**/ios/Pods/**",
|
||||||
|
"**/ios/build/**",
|
||||||
|
"**/android/build/**",
|
||||||
|
"**/android/.gradle/**",
|
||||||
|
"**/__pycache__/**",
|
||||||
|
"**/.venv/**",
|
||||||
|
"**/venv/**",
|
||||||
|
"**/site-packages/**",
|
||||||
|
"**/dist-packages/**",
|
||||||
|
"**/.pytest_cache/**",
|
||||||
|
"**/.mypy_cache/**",
|
||||||
|
"**/.ruff_cache/**",
|
||||||
|
"**/.tox/**",
|
||||||
|
"**/.nox/**",
|
||||||
|
"**/*.egg-info/**",
|
||||||
|
"**/.eggs/**",
|
||||||
|
"**/go/pkg/mod/**",
|
||||||
|
"**/target/debug/**",
|
||||||
|
"**/target/release/**",
|
||||||
|
"**/.gradle/**",
|
||||||
|
"**/.m2/**",
|
||||||
|
"**/generated-sources/**",
|
||||||
|
"**/.kotlin/**",
|
||||||
|
"**/.dart_tool/**",
|
||||||
|
"**/.vs/**",
|
||||||
|
"**/.nuget/**",
|
||||||
|
"**/artifacts/**",
|
||||||
|
"**/publish/**",
|
||||||
|
"**/cmake-build-*/**",
|
||||||
|
"**/CMakeFiles/**",
|
||||||
|
"**/bazel-*/**",
|
||||||
|
"**/vcpkg_installed/**",
|
||||||
|
"**/.conan/**",
|
||||||
|
"**/Debug/**",
|
||||||
|
"**/Release/**",
|
||||||
|
"**/x64/**",
|
||||||
|
"**/.pio/**",
|
||||||
|
"**/release/**",
|
||||||
|
"**/*.app/**",
|
||||||
|
"**/*.asar",
|
||||||
|
"**/DerivedData/**",
|
||||||
|
"**/.build/**",
|
||||||
|
"**/.swiftpm/**",
|
||||||
|
"**/xcuserdata/**",
|
||||||
|
"**/Carthage/Build/**",
|
||||||
|
"**/SourcePackages/**",
|
||||||
|
"**/__history/**",
|
||||||
|
"**/__recovery/**",
|
||||||
|
"**/*.dcu",
|
||||||
|
"**/.composer/**",
|
||||||
|
"**/storage/framework/**",
|
||||||
|
"**/bootstrap/cache/**",
|
||||||
|
"**/.bundle/**",
|
||||||
|
"**/tmp/cache/**",
|
||||||
|
"**/public/assets/**",
|
||||||
|
"**/public/packs/**",
|
||||||
|
"**/.yardoc/**",
|
||||||
|
"**/coverage/**",
|
||||||
|
"**/htmlcov/**",
|
||||||
|
"**/.nyc_output/**",
|
||||||
|
"**/test-results/**",
|
||||||
|
"**/.coverage/**",
|
||||||
|
"**/.idea/**",
|
||||||
|
"**/logs/**",
|
||||||
|
"**/tmp/**",
|
||||||
|
"**/temp/**",
|
||||||
|
"**/_build/**",
|
||||||
|
"**/docs/_build/**",
|
||||||
|
"**/site/**"
|
||||||
|
],
|
||||||
|
"languages": [],
|
||||||
|
"frameworks": [],
|
||||||
|
"maxFileSize": 1048576,
|
||||||
|
"extractDocstrings": true,
|
||||||
|
"trackCallSites": true
|
||||||
|
}
|
||||||
@@ -16,6 +16,23 @@
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## 2026-05-21 外部 API 失败必须 OTLP 上报并落库
|
||||||
|
|
||||||
|
- 背景:图片生成等外部供应商调用失败时,仅靠返回 502/504 或普通日志,无法支持后续按 provider、失败阶段和可重试属性聚合排障。
|
||||||
|
- 决策:外部 API 调用未成功时,`api-server` 必须同时发送 OTLP 失败观测并写入 `tracking_event`。当前通用 VectorEngine `gpt-image-2-all` 图片生成 / 编辑适配器记录 `external_api_call_failure`,`scope_kind = module`、`scope_id = provider`、`module_key = external-api`,metadata 包含 endpoint、operation、failureStage、statusCode、statusClass、timeout、retryable、errorMessage、latencyMs、promptChars、referenceImageCount、imageModel 和 rawExcerpt。
|
||||||
|
- 落库方式:优先复用 tracking outbox 异步批量写入;outbox 不可写或因保护阈值拒绝时回退同步直写 SpacetimeDB。不新增 SpacetimeDB 表,不让 reducer 做外部 I/O。
|
||||||
|
- 影响范围:`server-rs/crates/api-server/src/external_api_audit.rs`、`server-rs/crates/api-server/src/openai_image_generation.rs`、`server-rs/crates/api-server/src/telemetry.rs`、tracking outbox、后端架构文档和开发运维文档。
|
||||||
|
- 验证方式:执行 `cargo test -p api-server external_api_audit --manifest-path server-rs/Cargo.toml -- --nocapture`、`cargo test -p api-server openai_image_generation --manifest-path server-rs/Cargo.toml -- --nocapture`、`cargo check -p api-server --manifest-path server-rs/Cargo.toml`、`npm run check:encoding`。
|
||||||
|
- 关联文档:`docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。
|
||||||
|
|
||||||
|
## 2026-05-21 Nginx 通用 API 入口放行创作参考图请求体
|
||||||
|
|
||||||
|
- 背景:release 上拼图结果页重绘动作携带参考图 Data URL 时,Nginx access log 出现 `413`、`request_time=0.000`、`upstream_status=-`,说明请求被反代层默认 1 MiB 上限拦截,未进入 `api-server`。
|
||||||
|
- 决策:发布、开发服和容器 Nginx 模板的通用 `location ~ ^/api(?:/|$)` 统一设置 `client_max_body_size 64m`。该值只作为反代放行上限,具体业务请求体和图片字节上限继续由 `api-server` 路由 `DefaultBodyLimit` 与业务校验控制,不能替代接口级限制。
|
||||||
|
- 影响范围:`deploy/nginx/genarrative.conf`、`deploy/nginx/genarrative-dev-http.conf`、`deploy/container/nginx.conf`、Nginx README、生产运维文档和 release 排障口径。
|
||||||
|
- 验证方式:目标机 `nginx -T 2>/dev/null | grep client_max_body_size` 应看到 `client_max_body_size 64m;`;大于 1 MiB 的参考图请求不再在 Nginx 层直接 413,access log 应出现有效 `upstream_status`。
|
||||||
|
- 关联文档:`deploy/nginx/README.md`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。
|
||||||
|
|
||||||
|
|
||||||
## 2026-05-18 Rust 手写模块入口统一不用 mod.rs
|
## 2026-05-18 Rust 手写模块入口统一不用 mod.rs
|
||||||
|
|
||||||
|
|||||||
@@ -112,6 +112,17 @@ SpacetimeDB bindings 生成:
|
|||||||
npm run spacetime:generate
|
npm run spacetime:generate
|
||||||
```
|
```
|
||||||
|
|
||||||
|
CodeGraph 本地语义索引:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run codegraph:init
|
||||||
|
npm run codegraph:status
|
||||||
|
npm run codegraph:sync
|
||||||
|
npm run codegraph:index
|
||||||
|
```
|
||||||
|
|
||||||
|
`.codegraph/config.json` 可随仓库共享;`.codegraph/codegraph.db`、缓存和日志为本机生成物,不提交。Codex CLI / Cursor / Claude Code 等 MCP 客户端配置属于个人环境;需要时由成员本机执行 `codegraph install` 或查看 `codegraph install --print-config codex`,不要提交个人全局配置。
|
||||||
|
|
||||||
## 常用检查命令
|
## 常用检查命令
|
||||||
|
|
||||||
- 后端通用用户行为埋点统一通过 `record_tracking_event_and_return` procedure、`SpacetimeRuntimeClient::record_tracking_event(...)` 与 api-server `tracking` 中间件写入 `tracking_event` / `tracking_daily_stat`;后台、RPG、大鱼吃小鱼、Visual Novel、Story、Combat 默认排除;作品级游玩埋点统一使用 `work_play_start`,详细事件清单见 `docs/technical/BACKEND_TRACKING_EVENT_COVERAGE_2026-05-09.md`。
|
- 后端通用用户行为埋点统一通过 `record_tracking_event_and_return` procedure、`SpacetimeRuntimeClient::record_tracking_event(...)` 与 api-server `tracking` 中间件写入 `tracking_event` / `tracking_daily_stat`;后台、RPG、大鱼吃小鱼、Visual Novel、Story、Combat 默认排除;作品级游玩埋点统一使用 `work_play_start`,详细事件清单见 `docs/technical/BACKEND_TRACKING_EVENT_COVERAGE_2026-05-09.md`。
|
||||||
|
|||||||
@@ -54,6 +54,22 @@
|
|||||||
- 验证:`tr '\0' '\n' < /proc/$(systemctl show genarrative-api.service -p MainPID --value)/environ | grep GENARRATIVE_TRACKING_OUTBOX_DIR` 应指向 `/var/lib/genarrative/tracking-outbox`;重启后当前 PID 不再出现 `Permission denied (os error 13)`。
|
- 验证:`tr '\0' '\n' < /proc/$(systemctl show genarrative-api.service -p MainPID --value)/environ | grep GENARRATIVE_TRACKING_OUTBOX_DIR` 应指向 `/var/lib/genarrative/tracking-outbox`;重启后当前 PID 不再出现 `Permission denied (os error 13)`。
|
||||||
- 关联:`scripts/deploy/production-api-deploy.sh`、`scripts/jenkins-server-provision.sh`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。
|
- 关联:`scripts/deploy/production-api-deploy.sh`、`scripts/jenkins-server-provision.sh`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。
|
||||||
|
|
||||||
|
## 外部 API 失败没法追溯先查 external_api_call_failure
|
||||||
|
|
||||||
|
- 现象:VectorEngine 图片生成 / 编辑接口对前端只表现为 `502` / `504` 或“上游服务请求失败”,但难以区分是请求发送失败、上游 429/5xx、响应解析失败、未返回图片,还是下载图片失败。
|
||||||
|
- 原因:外部 API 失败如果只靠普通日志,不一定能和 OTLP 指标、trace 与 SpacetimeDB 历史查询稳定关联;重启后也容易丢失上下文。
|
||||||
|
- 处理:先查 OTLP 指标 `genarrative.external_api.failures{provider,failure_stage,status_class,retryable}`,再查 `tracking_event` 中 `event_key = 'external_api_call_failure'` 的 `metadata_json`。当前通用 VectorEngine `gpt-image-2-all` 适配器会记录 provider、endpoint、operation、failureStage、statusCode、statusClass、timeout、retryable、errorMessage、latencyMs、promptChars、referenceImageCount、imageModel 和 rawExcerpt。
|
||||||
|
- 验证:`SELECT event_id, scope_id AS provider, metadata_json, occurred_at FROM tracking_event WHERE event_key = 'external_api_call_failure' ORDER BY occurred_at DESC LIMIT 50;`;如果查不到记录,再检查 tracking outbox 目录权限以及 sealed 文件是否堆积。
|
||||||
|
- 关联:`server-rs/crates/api-server/src/external_api_audit.rs`、`server-rs/crates/api-server/src/openai_image_generation.rs`、`docs/【后端架构】server-rs与SpacetimeDB数据契约-2026-05-15.md`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。
|
||||||
|
|
||||||
|
## release 创作接口 413 先查 Nginx 请求体上限
|
||||||
|
|
||||||
|
- 现象:release 上 `POST /api/runtime/puzzle/agent/sessions/{session_id}/actions` 携带参考图 Data URL 时返回 `413 Request Entity Too Large`,access log 显示 `request_time=0.000`、`upstream_status=-`。
|
||||||
|
- 原因:Nginx 默认 `client_max_body_size` 只有 1 MiB;请求在反代层被拒绝,根本没有到达 `api-server`,即使 Rust 路由已通过 `DefaultBodyLimit` 放宽到更大的参考图请求体也不会生效。
|
||||||
|
- 处理:在 release、development-http 和容器 Nginx 模板的通用 `/api` location 设置 `client_max_body_size 64m`;该值只负责放行到 `api-server`,真实业务上限继续由路由 `DefaultBodyLimit` 和解码后字节校验承担。发布后运行 `nginx -t && nginx -s reload`。
|
||||||
|
- 验证:`nginx -T 2>/dev/null | grep client_max_body_size` 应能看到 `client_max_body_size 64m;`;再次提交大于 1 MiB 的参考图请求时,access log 应出现正常 `upstream_status`,不再是 Nginx 直接 413。
|
||||||
|
- 关联:`deploy/nginx/genarrative.conf`、`deploy/nginx/genarrative-dev-http.conf`、`deploy/container/nginx.conf`、`deploy/nginx/README.md`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。
|
||||||
|
|
||||||
## 汪汪声浪入口不要再回到独立配置阶段
|
## 汪汪声浪入口不要再回到独立配置阶段
|
||||||
|
|
||||||
- 现象:汪汪声浪入口如果继续切换到独立配置阶段,会和拼图、抓大鹅的创作页内嵌结构不一致,用户会感觉入口跳页。
|
- 现象:汪汪声浪入口如果继续切换到独立配置阶段,会和拼图、抓大鹅的创作页内嵌结构不一致,用户会感觉入口跳页。
|
||||||
|
|||||||
@@ -170,6 +170,8 @@ http {
|
|||||||
|
|
||||||
location ~ ^/api(?:/|$) {
|
location ~ ^/api(?:/|$) {
|
||||||
default_type application/json;
|
default_type application/json;
|
||||||
|
# 中文注释:创作接口会携带参考图 Data URL,Nginx 只放行到 api-server;真实大小限制仍由路由 DefaultBodyLimit 和业务字节校验负责。
|
||||||
|
client_max_body_size 64m;
|
||||||
limit_conn genarrative_api_conn 64;
|
limit_conn genarrative_api_conn 64;
|
||||||
limit_req zone=genarrative_api_rps burst=64 nodelay;
|
limit_req zone=genarrative_api_rps burst=64 nodelay;
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,12 @@
|
|||||||
|
|
||||||
本配置片段由 `scripts/jenkins-server-provision.sh` 在安装 Nginx 站点配置时展开。
|
本配置片段由 `scripts/jenkins-server-provision.sh` 在安装 Nginx 站点配置时展开。
|
||||||
|
|
||||||
|
## 请求体大小
|
||||||
|
|
||||||
|
- 生产、开发服和容器模板都在通用 `location ~ ^/api(?:/|$)` 内设置 `client_max_body_size 64m`。
|
||||||
|
- 该值只用于让携带参考图 Data URL 的创作接口抵达 `api-server`;不要把它当作业务上传上限。Rust 路由仍通过 `DefaultBodyLimit` 和解码后字节校验限制具体接口,例如拼图参考图路由只放宽到 12 MiB 请求体,图片字节继续按业务规则拒绝。
|
||||||
|
- 若线上看到 `413 Request Entity Too Large`,并且 access log 里 `request_time=0.000 upstream_status=-`,通常是 Nginx 没有加载该模板或未 reload;先执行 `nginx -T | grep client_max_body_size` 和 `nginx -t` 再检查 `api-server`。
|
||||||
|
|
||||||
## gzip
|
## gzip
|
||||||
|
|
||||||
- `deploy/nginx/genarrative.conf` 与 `deploy/nginx/genarrative-dev-http.conf` 默认开启 gzip。
|
- `deploy/nginx/genarrative.conf` 与 `deploy/nginx/genarrative-dev-http.conf` 默认开启 gzip。
|
||||||
|
|||||||
@@ -190,6 +190,8 @@ server {
|
|||||||
# 临时兼容主站仍在使用的 /api/* HTTP facade;前端完成 SpacetimeDB SDK 迁移后删除。
|
# 临时兼容主站仍在使用的 /api/* HTTP facade;前端完成 SpacetimeDB SDK 迁移后删除。
|
||||||
location ~ ^/api(?:/|$) {
|
location ~ ^/api(?:/|$) {
|
||||||
default_type application/json;
|
default_type application/json;
|
||||||
|
# 中文注释:创作接口会携带参考图 Data URL,Nginx 只放行到 api-server;真实大小限制仍由路由 DefaultBodyLimit 和业务字节校验负责。
|
||||||
|
client_max_body_size 64m;
|
||||||
limit_conn genarrative_api_conn 64;
|
limit_conn genarrative_api_conn 64;
|
||||||
limit_req zone=genarrative_api_rps burst=64 nodelay;
|
limit_req zone=genarrative_api_rps burst=64 nodelay;
|
||||||
|
|
||||||
|
|||||||
@@ -210,6 +210,8 @@ server {
|
|||||||
# 临时兼容主站仍在使用的 /api/* HTTP facade;前端完成 SpacetimeDB SDK 迁移后删除。
|
# 临时兼容主站仍在使用的 /api/* HTTP facade;前端完成 SpacetimeDB SDK 迁移后删除。
|
||||||
location ~ ^/api(?:/|$) {
|
location ~ ^/api(?:/|$) {
|
||||||
default_type application/json;
|
default_type application/json;
|
||||||
|
# 中文注释:创作接口会携带参考图 Data URL,Nginx 只放行到 api-server;真实大小限制仍由路由 DefaultBodyLimit 和业务字节校验负责。
|
||||||
|
client_max_body_size 64m;
|
||||||
limit_conn genarrative_api_conn 64;
|
limit_conn genarrative_api_conn 64;
|
||||||
limit_req zone=genarrative_api_rps burst=64 nodelay;
|
limit_req zone=genarrative_api_rps burst=64 nodelay;
|
||||||
|
|
||||||
|
|||||||
@@ -158,6 +158,7 @@ npm run check:server-rs-ddd
|
|||||||
- Hyper3D / Rodin:只保留后端安全代理和旧数据兼容;新 Match3D 草稿和批量新增不再生成 GLB。
|
- Hyper3D / Rodin:只保留后端安全代理和旧数据兼容;新 Match3D 草稿和批量新增不再生成 GLB。
|
||||||
- 音频:视觉小说专用音频路由保留;拼图和抓大鹅生成入口暂时关闭,通用 `/api/creation/audio/*` 对相关目标返回 `410 Gone`。
|
- 音频:视觉小说专用音频路由保留;拼图和抓大鹅生成入口暂时关闭,通用 `/api/creation/audio/*` 对相关目标返回 `410 Gone`。
|
||||||
- OSS:私有 generated legacy path 进入浏览器前必须通过 `/api/assets/read-url` 换签;不要裸请求 `/generated-*`。
|
- OSS:私有 generated legacy path 进入浏览器前必须通过 `/api/assets/read-url` 换签;不要裸请求 `/generated-*`。
|
||||||
|
- 外部 API 失败审计:外部供应商调用未成功时,`api-server` 必须发送 OTLP 失败事件并写入 `tracking_event`。当前通用 VectorEngine `gpt-image-2-all` 图片生成 / 编辑适配器在 `request_send`、`response_body`、`upstream_status`、`response_parse`、`missing_image` 和 `image_download` 阶段失败时记录 `external_api_call_failure`,`scope_kind = module`、`scope_id = provider`、`module_key = external-api`;metadata 固定包含 provider、endpoint、operation、failureStage、statusCode、statusClass、timeout、retryable、errorMessage、latencyMs、promptChars、referenceImageCount 和 imageModel。入库优先复用 tracking outbox,outbox 不可写或保护阈值拒绝时回退同步写 SpacetimeDB;不得新增前端兜底或在 SpacetimeDB reducer 内做外部 I/O。
|
||||||
|
|
||||||
## SpacetimeDB 表目录
|
## SpacetimeDB 表目录
|
||||||
|
|
||||||
@@ -672,6 +673,7 @@ npm run check:server-rs-ddd
|
|||||||
- Rust 结构体:`TrackingEvent`
|
- Rust 结构体:`TrackingEvent`
|
||||||
- 源码:`server-rs/crates/spacetime-module/src/runtime/profile.rs`
|
- 源码:`server-rs/crates/spacetime-module/src/runtime/profile.rs`
|
||||||
- 写入:关键业务埋点同步调用单条 procedure;普通 HTTP route tracking 由 `api-server` 本机 outbox 批量调用 `record_tracking_events_and_return`。outbox 到达批量阈值时先封存 active 文件并切新 active,后台 worker 异步 flush sealed 文件,HTTP 请求线程不等待 SpacetimeDB。`FLUSH_INTERVAL_MS` 只负责兜底封存长时间未满批的 active 文件,`MAX_BYTES` 只做磁盘保护阈值。`event_id` 必须稳定且全局唯一,批量重试时用唯一索引做幂等跳过。
|
- 写入:关键业务埋点同步调用单条 procedure;普通 HTTP route tracking 由 `api-server` 本机 outbox 批量调用 `record_tracking_events_and_return`。outbox 到达批量阈值时先封存 active 文件并切新 active,后台 worker 异步 flush sealed 文件,HTTP 请求线程不等待 SpacetimeDB。`FLUSH_INTERVAL_MS` 只负责兜底封存长时间未满批的 active 文件,`MAX_BYTES` 只做磁盘保护阈值。`event_id` 必须稳定且全局唯一,批量重试时用唯一索引做幂等跳过。
|
||||||
|
- 外部 API 失败:`event_key = external_api_call_failure` 使用同一张表落库;它是供应商失败审计事实,不新增 SpacetimeDB 表,查询时按 `module_key = 'external-api'` 或 `scope_kind = module AND scope_id = '<provider>'` 过滤。
|
||||||
|
|
||||||
### `treasure_record`
|
### `treasure_record`
|
||||||
|
|
||||||
|
|||||||
@@ -94,6 +94,27 @@ SpacetimeDB bindings:
|
|||||||
npm run spacetime:generate
|
npm run spacetime:generate
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## CodeGraph 本地代码索引
|
||||||
|
|
||||||
|
项目已安装 `@colbymchenry/codegraph` 作为开发期依赖,用于在本地生成语义代码索引,辅助 AI / IDE 做符号搜索、调用关系和影响范围分析。索引目录为 `.codegraph/`,其中 `config.json` 可提交,数据库、缓存和日志由 `.codegraph/.gitignore` 保持本机私有。
|
||||||
|
|
||||||
|
首次拉取或需要重建索引时:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install
|
||||||
|
npm run codegraph:init
|
||||||
|
```
|
||||||
|
|
||||||
|
日常使用:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run codegraph:status
|
||||||
|
npm run codegraph:sync
|
||||||
|
npm run codegraph:index
|
||||||
|
```
|
||||||
|
|
||||||
|
若要把 CodeGraph 接到 Codex CLI / Cursor / Claude Code 等 MCP 客户端,按本机 agent 配置执行 `codegraph install` 或参考 `codegraph install --print-config codex` 输出;不要把个人全局 agent 配置、token 或本机绝对路径提交到仓库。Codex CLI 当前没有项目级 MCP 配置,需由使用者在个人 `~/.codex/config.toml` 中配置。
|
||||||
|
|
||||||
## 后端改动验收
|
## 后端改动验收
|
||||||
|
|
||||||
后端代码修改后,按变更范围选择:
|
后端代码修改后,按变更范围选择:
|
||||||
@@ -164,7 +185,7 @@ Windows Stdb module 构建流水线运行在 Jenkins `windows` 节点上。该
|
|||||||
- Windows 下载阶段如果出现 `curl: (18)` 或响应体截断,流水线会保留同名 `.download` 临时文件并用 `curl -C -` 断点续传;只有完整返回但 SHA256 digest 仍不匹配时才删除临时文件后重新下载。目标 Linux 节点仍只接收 `stash/unstash` 带过去的本地下载件,不回退外网下载。
|
- Windows 下载阶段如果出现 `curl: (18)` 或响应体截断,流水线会保留同名 `.download` 临时文件并用 `curl -C -` 断点续传;只有完整返回但 SHA256 digest 仍不匹配时才删除临时文件后重新下载。目标 Linux 节点仍只接收 `stash/unstash` 带过去的本地下载件,不回退外网下载。
|
||||||
- Windows 下载阶段如果走代理,在 `Genarrative-Server-Provision` 参数 `PROVISION_DOWNLOAD_PROXY` 填写 Windows Jenkins 节点可访问的 HTTP 代理,例如 `http://127.0.0.1:7890`;不要填写目标 release 机器视角的 `127.0.0.1`,除非代理确实运行在该 Windows 节点本机。Linux 目标机阶段会强制要求使用本地下载件,缺少文件直接失败,不再回退到外网下载。
|
- Windows 下载阶段如果走代理,在 `Genarrative-Server-Provision` 参数 `PROVISION_DOWNLOAD_PROXY` 填写 Windows Jenkins 节点可访问的 HTTP 代理,例如 `http://127.0.0.1:7890`;不要填写目标 release 机器视角的 `127.0.0.1`,除非代理确实运行在该 Windows 节点本机。Linux 目标机阶段会强制要求使用本地下载件,缺少文件直接失败,不再回退到外网下载。
|
||||||
- `otelcol-contrib.service` 作为可选系统服务加入 provision,默认监听 `127.0.0.1:4317/4318` 并使用 `deploy/otelcol/genarrative-debug.yaml`。api-server 是否发送 OTLP 仍由 `GENARRATIVE_OTEL_ENABLED` 控制,服务 unit 见 `deploy/systemd/otelcol-contrib.service`。
|
- `otelcol-contrib.service` 作为可选系统服务加入 provision,默认监听 `127.0.0.1:4317/4318` 并使用 `deploy/otelcol/genarrative-debug.yaml`。api-server 是否发送 OTLP 仍由 `GENARRATIVE_OTEL_ENABLED` 控制,服务 unit 见 `deploy/systemd/otelcol-contrib.service`。
|
||||||
- Nginx `/api/` 与 `/admin/api/` 通过 `genarrative_api` upstream 代理到 `127.0.0.1:8082`,upstream keepalive 为 64;`limit_conn` 负责连接 / 并发保护,`limit_req` 负责入口 RPS 快拒绝。当前模板把公开 gallery list 单独放到 `genarrative_gallery_rps`,默认 `rate=5000r/s`、`burst=4096`、`limit_conn=320`;公开详情和普通 API 放到 `genarrative_api_rps`,后台 API 放到 `genarrative_admin_rps`。`limit_conn_status 429` 和 `limit_req_status 429` 必须在 HTTP 与 HTTPS server 中同时生效;若线上压测看到 `limiting connections by zone "genarrative_api_conn"` 却返回 503,优先检查 `nginx -T` 里 HTTPS server 是否缺少这些状态码,以及 `/api/runtime/puzzle/gallery` 是否误落到通用 `location ~ ^/api` 的 `limit_conn=64`。压测时看 `/var/log/nginx/genarrative.access.log` 中的 `request_time`、`upstream_connect_time`、`upstream_header_time`、`upstream_response_time`、`upstream_status`、`request_id`。
|
- Nginx `/api/` 与 `/admin/api/` 通过 `genarrative_api` upstream 代理到 `127.0.0.1:8082`,upstream keepalive 为 64;`limit_conn` 负责连接 / 并发保护,`limit_req` 负责入口 RPS 快拒绝。当前模板把公开 gallery list 单独放到 `genarrative_gallery_rps`,默认 `rate=5000r/s`、`burst=4096`、`limit_conn=320`;公开详情和普通 API 放到 `genarrative_api_rps`,后台 API 放到 `genarrative_admin_rps`。通用 `/api` location 设置 `client_max_body_size 64m`,只负责允许拼图、抓大鹅、Hyper3D 等创作接口携带参考图 Data URL 抵达 `api-server`;真实业务上限仍由 Rust 路由 `DefaultBodyLimit` 与解码后字节校验控制。若线上出现 `413 Request Entity Too Large` 且 access log 中 `request_time=0.000`、`upstream_status=-`,说明请求在 Nginx 层被拦截,先用 `nginx -T | grep client_max_body_size` 检查 release 模板是否已渲染并 reload。`limit_conn_status 429` 和 `limit_req_status 429` 必须在 HTTP 与 HTTPS server 中同时生效;若线上压测看到 `limiting connections by zone "genarrative_api_conn"` 却返回 503,优先检查 `nginx -T` 里 HTTPS server 是否缺少这些状态码,以及 `/api/runtime/puzzle/gallery` 是否误落到通用 `location ~ ^/api` 的 `limit_conn=64`。压测时看 `/var/log/nginx/genarrative.access.log` 中的 `request_time`、`upstream_connect_time`、`upstream_header_time`、`upstream_response_time`、`upstream_status`、`request_id`。
|
||||||
- 作品列表 K6 脚本一次 iteration 默认请求两个公开接口,因此约 50 HTTP req/s 的目标命令使用 `SCENARIO=spike START_RPS=5 PEAK_RPS=25 HOLD=60s END_RPS=5 DETAIL_RATIO=0 npm run loadtest:k6:works`。
|
- 作品列表 K6 脚本一次 iteration 默认请求两个公开接口,因此约 50 HTTP req/s 的目标命令使用 `SCENARIO=spike START_RPS=5 PEAK_RPS=25 HOLD=60s END_RPS=5 DETAIL_RATIO=0 npm run loadtest:k6:works`。
|
||||||
- 作品列表短期继续由 `api-server` / BFF 订阅 SpacetimeDB 公开 read model 后读本地 cache,不让浏览器前端直接订阅完整列表;未来如新增 `public_work_gallery_entry` 等专用公开作品列表 read model,前端只可订阅稳定、低基数、公开的专用投影,禁止订阅 `puzzle_work_profile`、`custom_world_profile` 等玩法源表后自行 join、聚合或判断权限。前端直订阅落地前必须先补齐权限、字段契约、排序 / 分页、埋点和 BFF 回退策略。
|
- 作品列表短期继续由 `api-server` / BFF 订阅 SpacetimeDB 公开 read model 后读本地 cache,不让浏览器前端直接订阅完整列表;未来如新增 `public_work_gallery_entry` 等专用公开作品列表 read model,前端只可订阅稳定、低基数、公开的专用投影,禁止订阅 `puzzle_work_profile`、`custom_world_profile` 等玩法源表后自行 join、聚合或判断权限。前端直订阅落地前必须先补齐权限、字段契约、排序 / 分页、埋点和 BFF 回退策略。
|
||||||
- 50 HTTP req/s 验收目标为 `http_req_failed < 1%`、`p95 < 2s`、`dropped_iterations = 0`,同时压测窗口内 Nginx 无新增 502。2026-05-19 容器 2C / 2G 连续 10 轮不重启 SpacetimeDB 压测:`PEAK_RPS=2500` 等价约 5000 HTTP req/s,平均实际吞吐约 `4219 HTTP req/s`,10 轮总计 `1,897,357` 个 200、`212,542` 个 429、`0` 个 5xx,200 请求平均 `p95=123ms`、`p99=234ms`;该档会把 SpacetimeDB 容器内存从约 `366MiB` 推到约 `885MiB / 896MiB`,因此当前不要继续抬公开 gallery 入口并发,应优先处理 SpacetimeDB 侧连接 / 订阅 / tracking 写入后的内存高水位。
|
- 50 HTTP req/s 验收目标为 `http_req_failed < 1%`、`p95 < 2s`、`dropped_iterations = 0`,同时压测窗口内 Nginx 无新增 502。2026-05-19 容器 2C / 2G 连续 10 轮不重启 SpacetimeDB 压测:`PEAK_RPS=2500` 等价约 5000 HTTP req/s,平均实际吞吐约 `4219 HTTP req/s`,10 轮总计 `1,897,357` 个 200、`212,542` 个 429、`0` 个 5xx,200 请求平均 `p95=123ms`、`p99=234ms`;该档会把 SpacetimeDB 容器内存从约 `366MiB` 推到约 `885MiB / 896MiB`,因此当前不要继续抬公开 gallery 入口并发,应优先处理 SpacetimeDB 侧连接 / 订阅 / tracking 写入后的内存高水位。
|
||||||
@@ -193,6 +214,7 @@ OpenTelemetry 现阶段默认开启 OTLP traces / metrics / logs,但本地日
|
|||||||
- debug exporter / Rider 转发都会同时接收 traces、metrics 和 logs。
|
- debug exporter / Rider 转发都会同时接收 traces、metrics 和 logs。
|
||||||
- api-server 会随 metrics 发送进程级指标:`process.memory.usage`、`process.memory.virtual`、`process.cpu.time`、`genarrative.process.cpu.usage_percent`、`process.thread.count`、`genarrative.process.memory.private`;Windows 额外发送 `process.windows.handle.count`,Linux 额外发送 `process.unix.file_descriptor.count`。这些指标只描述当前进程,不携带请求、用户或作品 label。
|
- api-server 会随 metrics 发送进程级指标:`process.memory.usage`、`process.memory.virtual`、`process.cpu.time`、`genarrative.process.cpu.usage_percent`、`process.thread.count`、`genarrative.process.memory.private`;Windows 额外发送 `process.windows.handle.count`,Linux 额外发送 `process.unix.file_descriptor.count`。这些指标只描述当前进程,不携带请求、用户或作品 label。
|
||||||
- HTTP 运行态补充发送 `genarrative.http.server.response_bodies.in_flight` 与 `genarrative.http.server.request_permits.available`,后者带低基数 `pool=default|gallery|detail|admin` label,用于区分业务 handler / 背压 permit 是否仍被占用;拼图广场热点缓存补充发送 `genarrative.puzzle_gallery.cache.*` 指标,记录 fresh hit、stale hit、未命中、后台刷新开始 / 失败、重建耗时和预序列化 data JSON 字节数。
|
- HTTP 运行态补充发送 `genarrative.http.server.response_bodies.in_flight` 与 `genarrative.http.server.request_permits.available`,后者带低基数 `pool=default|gallery|detail|admin` label,用于区分业务 handler / 背压 permit 是否仍被占用;拼图广场热点缓存补充发送 `genarrative.puzzle_gallery.cache.*` 指标,记录 fresh hit、stale hit、未命中、后台刷新开始 / 失败、重建耗时和预序列化 data JSON 字节数。
|
||||||
|
- 外部 API 失败统一发送 OTLP 并落库。当前 VectorEngine `gpt-image-2-all` 图片生成 / 编辑失败会输出 `外部 API 调用失败` trace/log,并记录指标 `genarrative.external_api.failures{provider,failure_stage,status_class,retryable}`;同时写入 `tracking_event`,`event_key = external_api_call_failure`、`module_key = external-api`、`scope_kind = module`、`scope_id = provider`。排障时先按 provider / failureStage 聚合,再结合 request 日志和上游响应 excerpt 判断是限流、超时、解析失败还是未返回图片。
|
||||||
- SpacetimeDB 观测分为两类:procedure / reducer 调用继续用 `genarrative.spacetime.procedure.*`,订阅本地 cache 读使用 `genarrative.spacetime.read.*`。`read=list_puzzle_gallery` 表示拼图广场当前从 `puzzle_gallery_card_view` 本地 cache 读取,不再每个 HTTP 请求调用 `list_puzzle_gallery` procedure。
|
- SpacetimeDB 观测分为两类:procedure / reducer 调用继续用 `genarrative.spacetime.procedure.*`,订阅本地 cache 读使用 `genarrative.spacetime.read.*`。`read=list_puzzle_gallery` 表示拼图广场当前从 `puzzle_gallery_card_view` 本地 cache 读取,不再每个 HTTP 请求调用 `list_puzzle_gallery` procedure。
|
||||||
- 本地 Windows 直连压测的内存高水位要结合 K6 VU / 连接数解释。250 RPS 下过高 `PREALLOCATED_VUS` 可能让 300 个本地 Established 连接把 `api-server` private memory 瞬时推到 GB 级,且 `/healthz` 小响应也能复现;若压测结束后回落、`response_bodies.in_flight` 和背压 permit 未显示业务积压,应优先按连接 / 发送链路高水位处理,而不是判断为 SpacetimeDB 或 JSON 缓存泄漏。
|
- 本地 Windows 直连压测的内存高水位要结合 K6 VU / 连接数解释。250 RPS 下过高 `PREALLOCATED_VUS` 可能让 300 个本地 Established 连接把 `api-server` private memory 瞬时推到 GB 级,且 `/healthz` 小响应也能复现;若压测结束后回落、`response_bodies.in_flight` 和背压 permit 未显示业务积压,应优先按连接 / 发送链路高水位处理,而不是判断为 SpacetimeDB 或 JSON 缓存泄漏。
|
||||||
- Rider 的 Logs 面板只展示 log event 自身字段,不会自动展开父 span 的全部 attributes;请求完成日志会直接带 `request_id`、`http.request.method`、`http.route`、`url.scheme`、`url.path`、`http.response.status_code`、`status_class`、`latency_ms` 和 `slow_request`,完整链路继续到 Traces 面板按 trace/span 查看。
|
- Rider 的 Logs 面板只展示 log event 自身字段,不会自动展开父 span 的全部 attributes;请求完成日志会直接带 `request_id`、`http.request.method`、`http.route`、`url.scheme`、`url.path`、`http.response.status_code`、`status_class`、`latency_ms` 和 `slow_request`,完整链路继续到 Traces 面板按 trace/span 查看。
|
||||||
@@ -248,6 +270,16 @@ cargo test -p platform-auth --manifest-path server-rs/Cargo.toml aliyun_send_sms
|
|||||||
|
|
||||||
个人任务首版 scope 仅支持 `user`。后台、RPG、大鱼吃小鱼、Visual Novel、Story、Combat 等特定链路按 tracking 中间件排除规则处理;作品游玩统一使用 `work_play_start`。
|
个人任务首版 scope 仅支持 `user`。后台、RPG、大鱼吃小鱼、Visual Novel、Story、Combat 等特定链路按 tracking 中间件排除规则处理;作品游玩统一使用 `work_play_start`。
|
||||||
|
|
||||||
|
外部 API 失败审计复用 `tracking_event`,不新增表。失败事件优先写入本机 tracking outbox,再由后台 worker 批量落库;如果 outbox 因权限、磁盘或保护阈值不可写,会回退同步直写 SpacetimeDB。`metadata_json` 包含 endpoint、operation、failureStage、statusCode、statusClass、timeout、retryable、errorMessage、latencyMs、promptChars、referenceImageCount、imageModel 和 rawExcerpt。常用查询:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT event_id, scope_id AS provider, metadata_json, occurred_at
|
||||||
|
FROM tracking_event
|
||||||
|
WHERE event_key = 'external_api_call_failure'
|
||||||
|
ORDER BY occurred_at DESC
|
||||||
|
LIMIT 50;
|
||||||
|
```
|
||||||
|
|
||||||
tracking outbox 默认配置:
|
tracking outbox 默认配置:
|
||||||
|
|
||||||
```env
|
```env
|
||||||
|
|||||||
1061
package-lock.json
generated
1061
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -57,7 +57,11 @@
|
|||||||
"check:data": "node scripts/run-tsx.cjs scripts/validate-content.ts",
|
"check:data": "node scripts/run-tsx.cjs scripts/validate-content.ts",
|
||||||
"check:overrides": "node scripts/run-tsx.cjs scripts/validate-overrides.ts",
|
"check:overrides": "node scripts/run-tsx.cjs scripts/validate-overrides.ts",
|
||||||
"check:smoke": "node scripts/run-tsx.cjs scripts/smoke-content.ts",
|
"check:smoke": "node scripts/run-tsx.cjs scripts/smoke-content.ts",
|
||||||
"check:content": "npm run check:data && npm run check:overrides && npm run check:smoke"
|
"check:content": "npm run check:data && npm run check:overrides && npm run check:smoke",
|
||||||
|
"codegraph:init": "codegraph init -i .",
|
||||||
|
"codegraph:index": "codegraph index .",
|
||||||
|
"codegraph:sync": "codegraph sync .",
|
||||||
|
"codegraph:status": "codegraph status ."
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@tailwindcss/vite": "^4.1.14",
|
"@tailwindcss/vite": "^4.1.14",
|
||||||
@@ -73,6 +77,7 @@
|
|||||||
"vite": "^6.2.0"
|
"vite": "^6.2.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
"@colbymchenry/codegraph": "^0.8.0",
|
||||||
"@testing-library/react": "^16.3.2",
|
"@testing-library/react": "^16.3.2",
|
||||||
"@testing-library/user-event": "^14.6.1",
|
"@testing-library/user-event": "^14.6.1",
|
||||||
"@types/node": "^22.14.0",
|
"@types/node": "^22.14.0",
|
||||||
|
|||||||
@@ -1049,6 +1049,7 @@ mod tests {
|
|||||||
base_url: "https://vector.example".to_string(),
|
base_url: "https://vector.example".to_string(),
|
||||||
api_key: "secret".to_string(),
|
api_key: "secret".to_string(),
|
||||||
request_timeout_ms: 180_000,
|
request_timeout_ms: 180_000,
|
||||||
|
external_api_audit_state: None,
|
||||||
});
|
});
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|||||||
372
server-rs/crates/api-server/src/external_api_audit.rs
Normal file
372
server-rs/crates/api-server/src/external_api_audit.rs
Normal file
@@ -0,0 +1,372 @@
|
|||||||
|
use axum::http::StatusCode;
|
||||||
|
use module_runtime::RuntimeTrackingScopeKind;
|
||||||
|
use serde_json::{Value, json};
|
||||||
|
use time::OffsetDateTime;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::{state::AppState, tracking::TrackingEventDraft};
|
||||||
|
|
||||||
|
pub(crate) const EXTERNAL_API_FAILURE_EVENT_KEY: &str = "external_api_call_failure";
|
||||||
|
pub(crate) const EXTERNAL_API_AUDIT_MODULE_KEY: &str = "external-api";
|
||||||
|
|
||||||
|
/// Description of a single failed call to an external (third-party provider) API.
///
/// A draft is created with [`ExternalApiFailureDraft::new`], which takes the
/// fields every failure must carry, and is then enriched through the consuming
/// `with_*` setters. All optional fields start out as `None` and both flags
/// start out as `false`.
///
/// The type is `#[must_use]`: every setter takes `self` by value and returns the
/// updated draft, so dropping a returned value silently loses the change.
#[derive(Clone, Debug)]
#[must_use = "a failure draft has no effect until it is handed to the audit recorder"]
pub(crate) struct ExternalApiFailureDraft {
    /// Identifier of the external provider that was called.
    pub(crate) provider: &'static str,
    /// Endpoint of the external API that was called.
    pub(crate) endpoint: String,
    /// Name of the operation that was being performed.
    pub(crate) operation: String,
    /// Stage of the call at which the failure happened (for example
    /// `request_send` or `response_parse`).
    pub(crate) failure_stage: &'static str,
    /// HTTP status code associated with the failure, if any.
    pub(crate) status_code: Option<u16>,
    /// Status-class label associated with the failure, if any.
    pub(crate) status_class: Option<&'static str>,
    /// Whether the failure was a timeout.
    pub(crate) timeout: bool,
    /// Whether the failure is considered retryable.
    pub(crate) retryable: bool,
    /// Error message describing the failure.
    pub(crate) error_message: String,
    /// Optional description of the underlying error source.
    pub(crate) error_source: Option<String>,
    /// Optional excerpt of the raw upstream response.
    pub(crate) raw_excerpt: Option<String>,
    /// Optional latency of the failed call, in milliseconds.
    pub(crate) latency_ms: Option<u64>,
    /// Optional character count of the prompt that was sent.
    pub(crate) prompt_chars: Option<usize>,
    /// Optional number of reference images attached to the request.
    pub(crate) reference_image_count: Option<usize>,
    /// Optional name of the image model that was requested.
    pub(crate) image_model: Option<&'static str>,
}

impl ExternalApiFailureDraft {
    /// Creates a draft from the mandatory fields.
    ///
    /// `endpoint`, `operation` and `error_message` accept anything convertible
    /// into a `String`. Every optional field is initialised to `None`, and
    /// `timeout` / `retryable` are initialised to `false`.
    pub(crate) fn new(
        provider: &'static str,
        endpoint: impl Into<String>,
        operation: impl Into<String>,
        failure_stage: &'static str,
        error_message: impl Into<String>,
    ) -> Self {
        Self {
            provider,
            endpoint: endpoint.into(),
            operation: operation.into(),
            failure_stage,
            status_code: None,
            status_class: None,
            timeout: false,
            retryable: false,
            error_message: error_message.into(),
            error_source: None,
            raw_excerpt: None,
            latency_ms: None,
            prompt_chars: None,
            reference_image_count: None,
            image_model: None,
        }
    }

    /// Replaces the HTTP status code (`None` clears it).
    pub(crate) fn with_status_code(mut self, status_code: Option<u16>) -> Self {
        self.status_code = status_code;
        self
    }

    /// Replaces the status-class label (`None` clears it).
    pub(crate) fn with_optional_status_class(mut self, status_class: Option<&'static str>) -> Self {
        self.status_class = status_class;
        self
    }

    /// Sets whether the failure was a timeout.
    pub(crate) fn with_timeout(mut self, timeout: bool) -> Self {
        self.timeout = timeout;
        self
    }

    /// Sets whether the failure is considered retryable.
    pub(crate) fn with_retryable(mut self, retryable: bool) -> Self {
        self.retryable = retryable;
        self
    }

    /// Replaces the error-source description (`None` clears it).
    pub(crate) fn with_error_source(mut self, error_source: Option<String>) -> Self {
        self.error_source = error_source;
        self
    }

    /// Replaces the raw response excerpt (`None` clears it).
    pub(crate) fn with_raw_excerpt(mut self, raw_excerpt: Option<String>) -> Self {
        self.raw_excerpt = raw_excerpt;
        self
    }

    /// Replaces the call latency in milliseconds (`None` clears it).
    pub(crate) fn with_latency_ms(mut self, latency_ms: Option<u64>) -> Self {
        self.latency_ms = latency_ms;
        self
    }

    /// Replaces the prompt character count (`None` clears it).
    pub(crate) fn with_prompt_chars(mut self, prompt_chars: Option<usize>) -> Self {
        self.prompt_chars = prompt_chars;
        self
    }

    /// Replaces the reference image count (`None` clears it).
    pub(crate) fn with_reference_image_count(
        mut self,
        reference_image_count: Option<usize>,
    ) -> Self {
        self.reference_image_count = reference_image_count;
        self
    }

    /// Replaces the image model name (`None` clears it).
    pub(crate) fn with_image_model(mut self, image_model: Option<&'static str>) -> Self {
        self.image_model = image_model;
        self
    }
}
|
||||||
|
|
||||||
|
/// 中文注释:下载图片、OSS 读写等非标准 HTTP 状态统一显式归类,避免 OTLP 低基数 label 误落到 `transport`。
|
||||||
|
pub(crate) fn app_error_status_class(status_code: StatusCode) -> &'static str {
|
||||||
|
status_class(Some(status_code.as_u16()))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// 中文注释:外部供应商失败同时进入 OTLP 和 tracking_event;失败审计不能反向阻断主业务错误返回。
|
||||||
|
pub(crate) async fn record_external_api_failure(state: &AppState, draft: ExternalApiFailureDraft) {
|
||||||
|
record_external_api_failure_otlp(&draft);
|
||||||
|
|
||||||
|
let tracking_event = build_external_api_failure_tracking_draft(&draft);
|
||||||
|
if let Some(outbox) = state.tracking_outbox() {
|
||||||
|
match outbox
|
||||||
|
.enqueue(crate::tracking::build_tracking_event_input(
|
||||||
|
tracking_event.clone(),
|
||||||
|
))
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(crate::tracking_outbox::TrackingOutboxEnqueueOutcome::Enqueued) => {}
|
||||||
|
Ok(crate::tracking_outbox::TrackingOutboxEnqueueOutcome::Dropped { reason }) => {
|
||||||
|
tracing::warn!(
|
||||||
|
provider = draft.provider,
|
||||||
|
endpoint = %draft.endpoint,
|
||||||
|
operation = %draft.operation,
|
||||||
|
failure_stage = draft.failure_stage,
|
||||||
|
reason,
|
||||||
|
"外部 API 失败审计写入 outbox 被保护阈值拒绝,回退同步直写 SpacetimeDB"
|
||||||
|
);
|
||||||
|
crate::tracking::record_tracking_event_after_success(
|
||||||
|
state,
|
||||||
|
&audit_request_context(),
|
||||||
|
tracking_event,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
Err(error) => {
|
||||||
|
tracing::warn!(
|
||||||
|
provider = draft.provider,
|
||||||
|
endpoint = %draft.endpoint,
|
||||||
|
operation = %draft.operation,
|
||||||
|
failure_stage = draft.failure_stage,
|
||||||
|
error = %error,
|
||||||
|
"外部 API 失败审计写入 outbox 失败,回退同步直写 SpacetimeDB"
|
||||||
|
);
|
||||||
|
crate::tracking::record_tracking_event_after_success(
|
||||||
|
state,
|
||||||
|
&audit_request_context(),
|
||||||
|
tracking_event,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
crate::tracking::record_tracking_event_after_success(
|
||||||
|
state,
|
||||||
|
&audit_request_context(),
|
||||||
|
tracking_event,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn build_external_api_failure_tracking_draft(
|
||||||
|
failure: &ExternalApiFailureDraft,
|
||||||
|
) -> TrackingEventDraft {
|
||||||
|
let mut draft = TrackingEventDraft::new(
|
||||||
|
EXTERNAL_API_FAILURE_EVENT_KEY,
|
||||||
|
EXTERNAL_API_AUDIT_MODULE_KEY,
|
||||||
|
);
|
||||||
|
draft.scope_kind = RuntimeTrackingScopeKind::Module;
|
||||||
|
draft.scope_id = failure.provider.to_string();
|
||||||
|
draft.metadata = build_external_api_failure_metadata(failure);
|
||||||
|
draft
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_external_api_failure_metadata(failure: &ExternalApiFailureDraft) -> Value {
|
||||||
|
let mut metadata = json!({
|
||||||
|
"provider": failure.provider,
|
||||||
|
"endpoint": failure.endpoint,
|
||||||
|
"operation": failure.operation,
|
||||||
|
"failureStage": failure.failure_stage,
|
||||||
|
"statusCode": failure.status_code,
|
||||||
|
"statusClass": failure.status_class.unwrap_or_else(|| status_class(failure.status_code)),
|
||||||
|
"timeout": failure.timeout,
|
||||||
|
"retryable": failure.retryable,
|
||||||
|
"errorMessage": truncate_field(failure.error_message.as_str(), 1_000),
|
||||||
|
"occurredAt": current_utc_iso_text(),
|
||||||
|
});
|
||||||
|
|
||||||
|
if let Some(latency_ms) = failure.latency_ms {
|
||||||
|
metadata["latencyMs"] = json!(latency_ms);
|
||||||
|
}
|
||||||
|
if let Some(prompt_chars) = failure.prompt_chars {
|
||||||
|
metadata["promptChars"] = json!(prompt_chars);
|
||||||
|
}
|
||||||
|
if let Some(reference_image_count) = failure.reference_image_count {
|
||||||
|
metadata["referenceImageCount"] = json!(reference_image_count);
|
||||||
|
}
|
||||||
|
if let Some(image_model) = failure.image_model {
|
||||||
|
metadata["imageModel"] = json!(image_model);
|
||||||
|
}
|
||||||
|
if let Some(source) = failure
|
||||||
|
.error_source
|
||||||
|
.as_deref()
|
||||||
|
.map(str::trim)
|
||||||
|
.filter(|value| !value.is_empty())
|
||||||
|
{
|
||||||
|
metadata["errorSource"] = json!(truncate_field(source, 1_000));
|
||||||
|
}
|
||||||
|
if let Some(excerpt) = failure
|
||||||
|
.raw_excerpt
|
||||||
|
.as_deref()
|
||||||
|
.map(str::trim)
|
||||||
|
.filter(|value| !value.is_empty())
|
||||||
|
{
|
||||||
|
metadata["rawExcerpt"] = json!(truncate_field(excerpt, 800));
|
||||||
|
}
|
||||||
|
|
||||||
|
metadata
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn is_retryable_external_api_failure(
|
||||||
|
status_code: Option<u16>,
|
||||||
|
timeout: bool,
|
||||||
|
connect: bool,
|
||||||
|
) -> bool {
|
||||||
|
timeout
|
||||||
|
|| connect
|
||||||
|
|| status_code.is_some_and(|status| {
|
||||||
|
status == StatusCode::TOO_MANY_REQUESTS.as_u16()
|
||||||
|
|| status == StatusCode::REQUEST_TIMEOUT.as_u16()
|
||||||
|
|| status >= 500
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn record_external_api_failure_otlp(failure: &ExternalApiFailureDraft) {
|
||||||
|
crate::telemetry::record_external_api_failure(
|
||||||
|
failure.provider,
|
||||||
|
failure.failure_stage,
|
||||||
|
failure
|
||||||
|
.status_class
|
||||||
|
.unwrap_or_else(|| status_class(failure.status_code)),
|
||||||
|
failure.retryable,
|
||||||
|
);
|
||||||
|
|
||||||
|
tracing::error!(
|
||||||
|
provider = failure.provider,
|
||||||
|
endpoint = %failure.endpoint,
|
||||||
|
operation = %failure.operation,
|
||||||
|
failure_stage = failure.failure_stage,
|
||||||
|
status_code = failure.status_code,
|
||||||
|
status_class = failure.status_class.unwrap_or_else(|| status_class(failure.status_code)),
|
||||||
|
timeout = failure.timeout,
|
||||||
|
retryable = failure.retryable,
|
||||||
|
latency_ms = failure.latency_ms,
|
||||||
|
prompt_chars = failure.prompt_chars,
|
||||||
|
reference_image_count = failure.reference_image_count,
|
||||||
|
image_model = failure.image_model,
|
||||||
|
error = %failure.error_message,
|
||||||
|
"外部 API 调用失败"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Maps an optional HTTP status code to a low-cardinality class label.
///
/// Returns `"1xx"` through `"5xx"` for codes 100-599, `"unknown"` for any other code,
/// and `"transport"` when there is no status code at all.
fn status_class(status_code: Option<u16>) -> &'static str {
    let Some(code) = status_code else {
        return "transport";
    };

    // The hundreds digit identifies the class; anything outside 1..=5 is not a
    // standard HTTP class.
    match code / 100 {
        1 => "1xx",
        2 => "2xx",
        3 => "3xx",
        4 => "4xx",
        5 => "5xx",
        _ => "unknown",
    }
}
|
||||||
|
|
||||||
|
fn audit_request_context() -> crate::request_context::RequestContext {
|
||||||
|
crate::request_context::RequestContext::new(
|
||||||
|
format!("external-api-audit-{}", Uuid::new_v4()),
|
||||||
|
"external-api audit".to_string(),
|
||||||
|
std::time::Duration::ZERO,
|
||||||
|
false,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns at most the first `max_chars` characters of `value` as an owned string.
///
/// Truncation counts Unicode scalar values, not bytes, so multi-byte characters are
/// never split.
fn truncate_field(value: &str, max_chars: usize) -> String {
    // The byte offset of the character just past the limit is where the kept prefix
    // ends; if there is no such character the whole input fits.
    match value.char_indices().nth(max_chars) {
        Some((cut, _)) => value[..cut].to_owned(),
        None => value.to_owned(),
    }
}
|
||||||
|
|
||||||
|
fn current_utc_iso_text() -> String {
|
||||||
|
shared_kernel::format_rfc3339(OffsetDateTime::now_utc())
|
||||||
|
.unwrap_or_else(|_| "1970-01-01T00:00:00Z".to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use serde_json::Value;

    use super::*;

    /// A 429 upstream failure becomes a module-scoped draft whose metadata carries every
    /// optional field that was set on the failure.
    #[test]
    fn external_api_failure_tracking_draft_uses_module_scope_and_safe_metadata() {
        let failure = ExternalApiFailureDraft::new(
            "vector-engine",
            "https://vector.example/v1/images/generations",
            "拼图 UI 背景图生成失败",
            "upstream_status",
            "上游 429",
        )
        .with_status_code(Some(429))
        .with_retryable(true)
        .with_latency_ms(Some(1234))
        .with_prompt_chars(Some(88))
        .with_reference_image_count(Some(2))
        .with_image_model(Some("gpt-image-2-all"));

        let draft = build_external_api_failure_tracking_draft(&failure);

        assert_eq!(draft.event_key, EXTERNAL_API_FAILURE_EVENT_KEY);
        assert_eq!(draft.scope_kind, RuntimeTrackingScopeKind::Module);
        assert_eq!(draft.scope_id, "vector-engine");
        assert_eq!(draft.module_key, Some(EXTERNAL_API_AUDIT_MODULE_KEY));

        let metadata = draft.metadata;
        assert_eq!(metadata["provider"], "vector-engine");
        assert_eq!(metadata["statusCode"], 429);
        assert_eq!(metadata["statusClass"], "4xx");
        assert_eq!(metadata["retryable"], true);
        assert_eq!(metadata["latencyMs"], 1234);
        assert_eq!(metadata["promptChars"], 88);
        assert_eq!(metadata["referenceImageCount"], 2);
        assert_eq!(metadata["imageModel"], "gpt-image-2-all");
        assert!(matches!(metadata["occurredAt"], Value::String(_)));
    }

    /// Transport failures and overload statuses are retryable; a plain 400 is not.
    #[test]
    fn retryable_classification_keeps_transport_and_overload_failures_actionable() {
        // (status_code, timeout, connect, expected)
        let cases: [(Option<u16>, bool, bool, bool); 5] = [
            (None, true, false, true),
            (None, false, true, true),
            (Some(429), false, false, true),
            (Some(502), false, false, true),
            (Some(400), false, false, false),
        ];

        for (status_code, timeout, connect, expected) in cases {
            assert_eq!(
                is_retryable_external_api_failure(status_code, timeout, connect),
                expected,
                "status_code={status_code:?} timeout={timeout} connect={connect}",
            );
        }
    }

    /// An explicit status class overrides the class derived from a successful upstream
    /// status, while the raw status code is still reported unchanged.
    #[test]
    fn app_error_status_class_can_override_successful_upstream_status() {
        let override_class = app_error_status_class(StatusCode::BAD_GATEWAY);
        let failure = ExternalApiFailureDraft::new(
            "vector-engine",
            "https://cdn.example/generated.png",
            "下载生成图片",
            "image_download",
            "下载生成图片失败",
        )
        .with_status_code(Some(200))
        .with_optional_status_class(Some(override_class));

        let draft = build_external_api_failure_tracking_draft(&failure);

        assert_eq!(draft.metadata["statusCode"], 200);
        assert_eq!(draft.metadata["statusClass"], "5xx");
    }
}
|
||||||
@@ -39,6 +39,7 @@ mod custom_world_rpg_draft_prompts;
|
|||||||
mod edutainment_baby_drawing;
|
mod edutainment_baby_drawing;
|
||||||
mod edutainment_baby_object;
|
mod edutainment_baby_object;
|
||||||
mod error_middleware;
|
mod error_middleware;
|
||||||
|
mod external_api_audit;
|
||||||
pub(crate) mod generated_asset_sheets;
|
pub(crate) mod generated_asset_sheets;
|
||||||
mod generated_image_assets;
|
mod generated_image_assets;
|
||||||
mod health;
|
mod health;
|
||||||
|
|||||||
@@ -1,21 +1,44 @@
|
|||||||
use std::time::Duration;
|
use std::{error::Error, time::Duration};
|
||||||
|
|
||||||
use axum::http::StatusCode;
|
use axum::http::StatusCode;
|
||||||
use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64_STANDARD};
|
use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64_STANDARD};
|
||||||
use reqwest::header;
|
use reqwest::header;
|
||||||
use serde_json::{Map, Value, json};
|
use serde_json::{Map, Value, json};
|
||||||
|
|
||||||
use crate::{http_error::AppError, state::AppState};
|
use crate::{
|
||||||
|
external_api_audit::{
|
||||||
|
ExternalApiFailureDraft, app_error_status_class, is_retryable_external_api_failure,
|
||||||
|
record_external_api_failure,
|
||||||
|
},
|
||||||
|
http_error::AppError,
|
||||||
|
state::AppState,
|
||||||
|
};
|
||||||
|
|
||||||
pub(crate) const GPT_IMAGE_2_MODEL: &str = "gpt-image-2";
|
pub(crate) const GPT_IMAGE_2_MODEL: &str = "gpt-image-2";
|
||||||
pub(crate) const VECTOR_ENGINE_GPT_IMAGE_2_MODEL: &str = "gpt-image-2-all";
|
pub(crate) const VECTOR_ENGINE_GPT_IMAGE_2_MODEL: &str = "gpt-image-2-all";
|
||||||
const VECTOR_ENGINE_PROVIDER: &str = "vector-engine";
|
const VECTOR_ENGINE_PROVIDER: &str = "vector-engine";
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone)]
|
||||||
pub(crate) struct OpenAiImageSettings {
|
pub(crate) struct OpenAiImageSettings {
|
||||||
pub base_url: String,
|
pub base_url: String,
|
||||||
pub api_key: String,
|
pub api_key: String,
|
||||||
pub request_timeout_ms: u64,
|
pub request_timeout_ms: u64,
|
||||||
|
pub external_api_audit_state: Option<AppState>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for OpenAiImageSettings {
|
||||||
|
fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
formatter
|
||||||
|
.debug_struct("OpenAiImageSettings")
|
||||||
|
.field("base_url", &self.base_url)
|
||||||
|
.field("api_key", &"<redacted>")
|
||||||
|
.field("request_timeout_ms", &self.request_timeout_ms)
|
||||||
|
.field(
|
||||||
|
"external_api_audit_enabled",
|
||||||
|
&self.external_api_audit_state.is_some(),
|
||||||
|
)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
@@ -74,6 +97,7 @@ pub(crate) fn require_openai_image_settings(
|
|||||||
base_url: base_url.to_string(),
|
base_url: base_url.to_string(),
|
||||||
api_key: api_key.to_string(),
|
api_key: api_key.to_string(),
|
||||||
request_timeout_ms: state.config.vector_engine_image_request_timeout_ms.max(1),
|
request_timeout_ms: state.config.vector_engine_image_request_timeout_ms.max(1),
|
||||||
|
external_api_audit_state: Some(state.clone()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -103,15 +127,18 @@ pub(crate) async fn create_openai_image_generation(
|
|||||||
reference_images: &[String],
|
reference_images: &[String],
|
||||||
failure_context: &str,
|
failure_context: &str,
|
||||||
) -> Result<OpenAiGeneratedImages, AppError> {
|
) -> Result<OpenAiGeneratedImages, AppError> {
|
||||||
|
let request_url = vector_engine_images_generation_url(settings);
|
||||||
|
let normalized_size = normalize_image_size(size);
|
||||||
let request_body = build_openai_image_request_body(
|
let request_body = build_openai_image_request_body(
|
||||||
prompt,
|
prompt,
|
||||||
negative_prompt,
|
negative_prompt,
|
||||||
size,
|
normalized_size.as_str(),
|
||||||
candidate_count,
|
candidate_count,
|
||||||
reference_images,
|
reference_images,
|
||||||
);
|
);
|
||||||
let response = http_client
|
let started_at = std::time::Instant::now();
|
||||||
.post(vector_engine_images_generation_url(settings))
|
let response = match http_client
|
||||||
|
.post(request_url.as_str())
|
||||||
.header(
|
.header(
|
||||||
header::AUTHORIZATION,
|
header::AUTHORIZATION,
|
||||||
format!("Bearer {}", settings.api_key),
|
format!("Bearer {}", settings.api_key),
|
||||||
@@ -121,16 +148,106 @@ pub(crate) async fn create_openai_image_generation(
|
|||||||
.json(&request_body)
|
.json(&request_body)
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
.map_err(|error| {
|
{
|
||||||
map_openai_image_request_error(format!(
|
Ok(response) => response,
|
||||||
"{failure_context}:创建图片生成任务失败:{error}"
|
Err(error) => {
|
||||||
))
|
let latency_ms = started_at.elapsed().as_millis() as u64;
|
||||||
})?;
|
let timeout = error.is_timeout();
|
||||||
|
let connect = error.is_connect();
|
||||||
|
let source = error.source().map(ToString::to_string);
|
||||||
|
let message = format!("{failure_context}:创建图片生成任务失败:{error}");
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"request_send",
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
timeout,
|
||||||
|
connect,
|
||||||
|
message.as_str(),
|
||||||
|
source,
|
||||||
|
None,
|
||||||
|
Some(latency_ms),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(reference_images.len()),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
return Err(map_openai_image_reqwest_error(
|
||||||
|
format!("{failure_context}:创建图片生成任务失败").as_str(),
|
||||||
|
request_url.as_str(),
|
||||||
|
error,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
};
|
||||||
let response_status = response.status();
|
let response_status = response.status();
|
||||||
let response_text = response.text().await.map_err(|error| {
|
tracing::info!(
|
||||||
map_openai_image_request_error(format!("{failure_context}:读取图片生成响应失败:{error}"))
|
provider = VECTOR_ENGINE_PROVIDER,
|
||||||
})?;
|
endpoint = %request_url,
|
||||||
|
status = response_status.as_u16(),
|
||||||
|
prompt_chars = prompt.chars().count(),
|
||||||
|
size = %normalized_size,
|
||||||
|
reference_image_count = reference_images.len(),
|
||||||
|
elapsed_ms = started_at.elapsed().as_millis() as u64,
|
||||||
|
failure_context,
|
||||||
|
"VectorEngine 图片生成 HTTP 返回"
|
||||||
|
);
|
||||||
|
let response_text = match response.text().await {
|
||||||
|
Ok(response_text) => response_text,
|
||||||
|
Err(error) => {
|
||||||
|
let latency_ms = started_at.elapsed().as_millis() as u64;
|
||||||
|
let timeout = error.is_timeout();
|
||||||
|
let connect = error.is_connect();
|
||||||
|
let source = error.source().map(ToString::to_string);
|
||||||
|
let message = format!("{failure_context}:读取图片生成响应失败:{error}");
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"response_body",
|
||||||
|
Some(response_status.as_u16()),
|
||||||
|
None,
|
||||||
|
timeout,
|
||||||
|
connect,
|
||||||
|
message.as_str(),
|
||||||
|
source,
|
||||||
|
None,
|
||||||
|
Some(latency_ms),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(reference_images.len()),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
return Err(map_openai_image_reqwest_error(
|
||||||
|
format!("{failure_context}:读取图片生成响应失败").as_str(),
|
||||||
|
request_url.as_str(),
|
||||||
|
error,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
};
|
||||||
if !response_status.is_success() {
|
if !response_status.is_success() {
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"upstream_status",
|
||||||
|
Some(response_status.as_u16()),
|
||||||
|
None,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
parse_api_error_message(response_text.as_str(), failure_context).as_str(),
|
||||||
|
None,
|
||||||
|
Some(truncate_raw(response_text.as_str())),
|
||||||
|
Some(started_at.elapsed().as_millis() as u64),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(reference_images.len()),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
return Err(map_openai_image_upstream_error(
|
return Err(map_openai_image_upstream_error(
|
||||||
response_status.as_u16(),
|
response_status.as_u16(),
|
||||||
response_text.as_str(),
|
response_text.as_str(),
|
||||||
@@ -138,26 +255,114 @@ pub(crate) async fn create_openai_image_generation(
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
let response_json = parse_json_payload(response_text.as_str(), failure_context)?;
|
let response_json = match parse_json_payload(response_text.as_str(), failure_context) {
|
||||||
|
Ok(response_json) => response_json,
|
||||||
|
Err(error) => {
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"response_parse",
|
||||||
|
Some(response_status.as_u16()),
|
||||||
|
None,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
error.body_text().as_str(),
|
||||||
|
None,
|
||||||
|
Some(truncate_raw(response_text.as_str())),
|
||||||
|
Some(started_at.elapsed().as_millis() as u64),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(reference_images.len()),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
return Err(error);
|
||||||
|
}
|
||||||
|
};
|
||||||
let generation_id = extract_generation_id(&response_json.payload)
|
let generation_id = extract_generation_id(&response_json.payload)
|
||||||
.unwrap_or_else(|| format!("vector-engine-{}", current_utc_micros()));
|
.unwrap_or_else(|| format!("vector-engine-{}", current_utc_micros()));
|
||||||
let actual_prompt = find_first_string_by_key(&response_json.payload, "revised_prompt")
|
let actual_prompt = find_first_string_by_key(&response_json.payload, "revised_prompt")
|
||||||
.or_else(|| find_first_string_by_key(&response_json.payload, "actual_prompt"));
|
.or_else(|| find_first_string_by_key(&response_json.payload, "actual_prompt"));
|
||||||
let image_urls = extract_image_urls(&response_json.payload);
|
let image_urls = extract_image_urls(&response_json.payload);
|
||||||
if !image_urls.is_empty() {
|
if !image_urls.is_empty() {
|
||||||
let mut generated =
|
let download_started_at = std::time::Instant::now();
|
||||||
download_images_from_urls(http_client, generation_id, image_urls, candidate_count)
|
let mut generated = match download_images_from_urls(
|
||||||
.await?;
|
http_client,
|
||||||
|
generation_id,
|
||||||
|
image_urls,
|
||||||
|
candidate_count,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(generated) => generated,
|
||||||
|
Err(error) => {
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"image_download",
|
||||||
|
Some(response_status.as_u16()),
|
||||||
|
Some(app_error_status_class(error.status_code())),
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
error.body_text().as_str(),
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
Some(download_started_at.elapsed().as_millis() as u64),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(reference_images.len()),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
return Err(error);
|
||||||
|
}
|
||||||
|
};
|
||||||
generated.actual_prompt = actual_prompt;
|
generated.actual_prompt = actual_prompt;
|
||||||
|
tracing::info!(
|
||||||
|
provider = VECTOR_ENGINE_PROVIDER,
|
||||||
|
endpoint = %request_url,
|
||||||
|
image_count = generated.images.len(),
|
||||||
|
elapsed_ms = download_started_at.elapsed().as_millis() as u64,
|
||||||
|
failure_context,
|
||||||
|
"VectorEngine 图片下载完成"
|
||||||
|
);
|
||||||
return Ok(generated);
|
return Ok(generated);
|
||||||
}
|
}
|
||||||
let b64_images = extract_b64_images(&response_json.payload);
|
let b64_images = extract_b64_images(&response_json.payload);
|
||||||
if !b64_images.is_empty() {
|
if !b64_images.is_empty() {
|
||||||
let mut generated = images_from_base64(generation_id, b64_images, candidate_count);
|
let mut generated = images_from_base64(generation_id, b64_images, candidate_count);
|
||||||
generated.actual_prompt = actual_prompt;
|
generated.actual_prompt = actual_prompt;
|
||||||
|
tracing::info!(
|
||||||
|
provider = VECTOR_ENGINE_PROVIDER,
|
||||||
|
endpoint = %request_url,
|
||||||
|
image_count = generated.images.len(),
|
||||||
|
failure_context,
|
||||||
|
"VectorEngine 图片 base64 解码完成"
|
||||||
|
);
|
||||||
return Ok(generated);
|
return Ok(generated);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"missing_image",
|
||||||
|
Some(response_status.as_u16()),
|
||||||
|
None,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
format!("{failure_context}:VectorEngine 未返回图片地址").as_str(),
|
||||||
|
None,
|
||||||
|
Some(truncate_raw(response_text.as_str())),
|
||||||
|
Some(started_at.elapsed().as_millis() as u64),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(reference_images.len()),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
Err(
|
Err(
|
||||||
AppError::from_status(StatusCode::BAD_GATEWAY).with_details(json!({
|
AppError::from_status(StatusCode::BAD_GATEWAY).with_details(json!({
|
||||||
"provider": VECTOR_ENGINE_PROVIDER,
|
"provider": VECTOR_ENGINE_PROVIDER,
|
||||||
@@ -176,6 +381,8 @@ pub(crate) async fn create_openai_image_edit(
|
|||||||
failure_context: &str,
|
failure_context: &str,
|
||||||
) -> Result<OpenAiGeneratedImages, AppError> {
|
) -> Result<OpenAiGeneratedImages, AppError> {
|
||||||
let task_id = format!("vector-engine-edit-{}", current_utc_micros());
|
let task_id = format!("vector-engine-edit-{}", current_utc_micros());
|
||||||
|
let request_url = vector_engine_images_edit_url(settings);
|
||||||
|
let normalized_size = normalize_image_size(size);
|
||||||
let image_part = reqwest::multipart::Part::bytes(reference_image.bytes.clone())
|
let image_part = reqwest::multipart::Part::bytes(reference_image.bytes.clone())
|
||||||
.file_name(reference_image.file_name.clone())
|
.file_name(reference_image.file_name.clone())
|
||||||
.mime_str(reference_image.mime_type.as_str())
|
.mime_str(reference_image.mime_type.as_str())
|
||||||
@@ -190,9 +397,10 @@ pub(crate) async fn create_openai_image_edit(
|
|||||||
build_prompt_with_negative(prompt, negative_prompt),
|
build_prompt_with_negative(prompt, negative_prompt),
|
||||||
)
|
)
|
||||||
.text("n", "1")
|
.text("n", "1")
|
||||||
.text("size", normalize_image_size(size));
|
.text("size", normalized_size.clone());
|
||||||
let response = http_client
|
let started_at = std::time::Instant::now();
|
||||||
.post(vector_engine_images_edit_url(settings).as_str())
|
let response = match http_client
|
||||||
|
.post(request_url.as_str())
|
||||||
.header(
|
.header(
|
||||||
header::AUTHORIZATION,
|
header::AUTHORIZATION,
|
||||||
format!("Bearer {}", settings.api_key),
|
format!("Bearer {}", settings.api_key),
|
||||||
@@ -201,16 +409,106 @@ pub(crate) async fn create_openai_image_edit(
|
|||||||
.multipart(form)
|
.multipart(form)
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
.map_err(|error| {
|
{
|
||||||
map_openai_image_request_error(format!(
|
Ok(response) => response,
|
||||||
"{failure_context}:创建图片编辑任务失败:{error}"
|
Err(error) => {
|
||||||
))
|
let latency_ms = started_at.elapsed().as_millis() as u64;
|
||||||
})?;
|
let timeout = error.is_timeout();
|
||||||
|
let connect = error.is_connect();
|
||||||
|
let source = error.source().map(ToString::to_string);
|
||||||
|
let message = format!("{failure_context}:创建图片编辑任务失败:{error}");
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"request_send",
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
timeout,
|
||||||
|
connect,
|
||||||
|
message.as_str(),
|
||||||
|
source,
|
||||||
|
None,
|
||||||
|
Some(latency_ms),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(1),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
return Err(map_openai_image_reqwest_error(
|
||||||
|
format!("{failure_context}:创建图片编辑任务失败").as_str(),
|
||||||
|
request_url.as_str(),
|
||||||
|
error,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
};
|
||||||
let response_status = response.status();
|
let response_status = response.status();
|
||||||
let response_text = response.text().await.map_err(|error| {
|
tracing::info!(
|
||||||
map_openai_image_request_error(format!("{failure_context}:读取图片编辑响应失败:{error}"))
|
provider = VECTOR_ENGINE_PROVIDER,
|
||||||
})?;
|
endpoint = %request_url,
|
||||||
|
status = response_status.as_u16(),
|
||||||
|
prompt_chars = prompt.chars().count(),
|
||||||
|
size = %normalized_size,
|
||||||
|
reference_image_count = 1usize,
|
||||||
|
elapsed_ms = started_at.elapsed().as_millis() as u64,
|
||||||
|
failure_context,
|
||||||
|
"VectorEngine 图片编辑 HTTP 返回"
|
||||||
|
);
|
||||||
|
let response_text = match response.text().await {
|
||||||
|
Ok(response_text) => response_text,
|
||||||
|
Err(error) => {
|
||||||
|
let latency_ms = started_at.elapsed().as_millis() as u64;
|
||||||
|
let timeout = error.is_timeout();
|
||||||
|
let connect = error.is_connect();
|
||||||
|
let source = error.source().map(ToString::to_string);
|
||||||
|
let message = format!("{failure_context}:读取图片编辑响应失败:{error}");
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"response_body",
|
||||||
|
Some(response_status.as_u16()),
|
||||||
|
None,
|
||||||
|
timeout,
|
||||||
|
connect,
|
||||||
|
message.as_str(),
|
||||||
|
source,
|
||||||
|
None,
|
||||||
|
Some(latency_ms),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(1),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
return Err(map_openai_image_reqwest_error(
|
||||||
|
format!("{failure_context}:读取图片编辑响应失败").as_str(),
|
||||||
|
request_url.as_str(),
|
||||||
|
error,
|
||||||
|
));
|
||||||
|
}
|
||||||
|
};
|
||||||
if !response_status.is_success() {
|
if !response_status.is_success() {
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"upstream_status",
|
||||||
|
Some(response_status.as_u16()),
|
||||||
|
None,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
parse_api_error_message(response_text.as_str(), failure_context).as_str(),
|
||||||
|
None,
|
||||||
|
Some(truncate_raw(response_text.as_str())),
|
||||||
|
Some(started_at.elapsed().as_millis() as u64),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(1),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
return Err(map_openai_image_upstream_error(
|
return Err(map_openai_image_upstream_error(
|
||||||
response_status.as_u16(),
|
response_status.as_u16(),
|
||||||
response_text.as_str(),
|
response_text.as_str(),
|
||||||
@@ -218,12 +516,62 @@ pub(crate) async fn create_openai_image_edit(
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
let response_json = parse_json_payload(response_text.as_str(), failure_context)?;
|
let response_json = match parse_json_payload(response_text.as_str(), failure_context) {
|
||||||
|
Ok(response_json) => response_json,
|
||||||
|
Err(error) => {
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"response_parse",
|
||||||
|
Some(response_status.as_u16()),
|
||||||
|
None,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
error.body_text().as_str(),
|
||||||
|
None,
|
||||||
|
Some(truncate_raw(response_text.as_str())),
|
||||||
|
Some(started_at.elapsed().as_millis() as u64),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(1),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
return Err(error);
|
||||||
|
}
|
||||||
|
};
|
||||||
let actual_prompt = find_first_string_by_key(&response_json.payload, "revised_prompt")
|
let actual_prompt = find_first_string_by_key(&response_json.payload, "revised_prompt")
|
||||||
.or_else(|| find_first_string_by_key(&response_json.payload, "actual_prompt"));
|
.or_else(|| find_first_string_by_key(&response_json.payload, "actual_prompt"));
|
||||||
let image_urls = extract_image_urls(&response_json.payload);
|
let image_urls = extract_image_urls(&response_json.payload);
|
||||||
if !image_urls.is_empty() {
|
if !image_urls.is_empty() {
|
||||||
let mut generated = download_images_from_urls(http_client, task_id, image_urls, 1).await?;
|
let download_started_at = std::time::Instant::now();
|
||||||
|
let mut generated =
|
||||||
|
match download_images_from_urls(http_client, task_id, image_urls, 1).await {
|
||||||
|
Ok(generated) => generated,
|
||||||
|
Err(error) => {
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"image_download",
|
||||||
|
Some(response_status.as_u16()),
|
||||||
|
Some(app_error_status_class(error.status_code())),
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
error.body_text().as_str(),
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
Some(download_started_at.elapsed().as_millis() as u64),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(1),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
return Err(error);
|
||||||
|
}
|
||||||
|
};
|
||||||
generated.actual_prompt = actual_prompt;
|
generated.actual_prompt = actual_prompt;
|
||||||
return Ok(generated);
|
return Ok(generated);
|
||||||
}
|
}
|
||||||
@@ -234,6 +582,25 @@ pub(crate) async fn create_openai_image_edit(
|
|||||||
return Ok(generated);
|
return Ok(generated);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
record_openai_image_failure_if_configured(
|
||||||
|
settings,
|
||||||
|
build_openai_image_failure_audit_draft(
|
||||||
|
request_url.as_str(),
|
||||||
|
failure_context,
|
||||||
|
"missing_image",
|
||||||
|
Some(response_status.as_u16()),
|
||||||
|
None,
|
||||||
|
false,
|
||||||
|
false,
|
||||||
|
format!("{failure_context}:VectorEngine 未返回编辑图片").as_str(),
|
||||||
|
None,
|
||||||
|
Some(truncate_raw(response_text.as_str())),
|
||||||
|
Some(started_at.elapsed().as_millis() as u64),
|
||||||
|
Some(prompt.chars().count()),
|
||||||
|
Some(1),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
Err(
|
Err(
|
||||||
AppError::from_status(StatusCode::BAD_GATEWAY).with_details(json!({
|
AppError::from_status(StatusCode::BAD_GATEWAY).with_details(json!({
|
||||||
"provider": VECTOR_ENGINE_PROVIDER,
|
"provider": VECTOR_ENGINE_PROVIDER,
|
||||||
@@ -402,6 +769,44 @@ fn map_openai_image_request_error(message: String) -> AppError {
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn map_openai_image_reqwest_error(
|
||||||
|
context: &str,
|
||||||
|
request_url: &str,
|
||||||
|
error: reqwest::Error,
|
||||||
|
) -> AppError {
|
||||||
|
let is_timeout = error.is_timeout();
|
||||||
|
let is_connect = error.is_connect();
|
||||||
|
let source = error.source().map(ToString::to_string).unwrap_or_default();
|
||||||
|
let message = format!("{context}:{error}");
|
||||||
|
let status = if is_timeout {
|
||||||
|
StatusCode::GATEWAY_TIMEOUT
|
||||||
|
} else {
|
||||||
|
StatusCode::BAD_GATEWAY
|
||||||
|
};
|
||||||
|
tracing::warn!(
|
||||||
|
provider = VECTOR_ENGINE_PROVIDER,
|
||||||
|
endpoint = %request_url,
|
||||||
|
timeout = is_timeout,
|
||||||
|
connect = is_connect,
|
||||||
|
request = error.is_request(),
|
||||||
|
body = error.is_body(),
|
||||||
|
source = %source,
|
||||||
|
message = %message,
|
||||||
|
"VectorEngine 图片请求发送失败"
|
||||||
|
);
|
||||||
|
|
||||||
|
AppError::from_status(status).with_details(json!({
|
||||||
|
"provider": VECTOR_ENGINE_PROVIDER,
|
||||||
|
"message": message,
|
||||||
|
"endpoint": request_url,
|
||||||
|
"timeout": is_timeout,
|
||||||
|
"connect": is_connect,
|
||||||
|
"request": error.is_request(),
|
||||||
|
"body": error.is_body(),
|
||||||
|
"source": source,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
fn map_openai_image_upstream_error(
|
fn map_openai_image_upstream_error(
|
||||||
upstream_status: u16,
|
upstream_status: u16,
|
||||||
raw_text: &str,
|
raw_text: &str,
|
||||||
@@ -423,6 +828,53 @@ fn map_openai_image_upstream_error(
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn record_openai_image_failure_if_configured(
|
||||||
|
settings: &OpenAiImageSettings,
|
||||||
|
draft: ExternalApiFailureDraft,
|
||||||
|
) {
|
||||||
|
if let Some(state) = settings.external_api_audit_state.as_ref() {
|
||||||
|
record_external_api_failure(state, draft).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn build_openai_image_failure_audit_draft(
|
||||||
|
request_url: &str,
|
||||||
|
failure_context: &str,
|
||||||
|
failure_stage: &'static str,
|
||||||
|
status_code: Option<u16>,
|
||||||
|
status_class: Option<&'static str>,
|
||||||
|
timeout: bool,
|
||||||
|
connect: bool,
|
||||||
|
error_message: &str,
|
||||||
|
error_source: Option<String>,
|
||||||
|
raw_excerpt: Option<String>,
|
||||||
|
latency_ms: Option<u64>,
|
||||||
|
prompt_chars: Option<usize>,
|
||||||
|
reference_image_count: Option<usize>,
|
||||||
|
) -> ExternalApiFailureDraft {
|
||||||
|
ExternalApiFailureDraft::new(
|
||||||
|
VECTOR_ENGINE_PROVIDER,
|
||||||
|
request_url.to_string(),
|
||||||
|
failure_context.to_string(),
|
||||||
|
failure_stage,
|
||||||
|
error_message.to_string(),
|
||||||
|
)
|
||||||
|
.with_status_code(status_code)
|
||||||
|
.with_optional_status_class(status_class)
|
||||||
|
.with_timeout(timeout)
|
||||||
|
.with_retryable(is_retryable_external_api_failure(
|
||||||
|
status_code,
|
||||||
|
timeout,
|
||||||
|
connect,
|
||||||
|
))
|
||||||
|
.with_error_source(error_source)
|
||||||
|
.with_raw_excerpt(raw_excerpt)
|
||||||
|
.with_latency_ms(latency_ms)
|
||||||
|
.with_prompt_chars(prompt_chars)
|
||||||
|
.with_reference_image_count(reference_image_count)
|
||||||
|
.with_image_model(Some(VECTOR_ENGINE_GPT_IMAGE_2_MODEL))
|
||||||
|
}
|
||||||
|
|
||||||
fn parse_api_error_message(raw_text: &str, fallback_message: &str) -> String {
|
fn parse_api_error_message(raw_text: &str, fallback_message: &str) -> String {
|
||||||
if raw_text.trim().is_empty() {
|
if raw_text.trim().is_empty() {
|
||||||
return fallback_message.to_string();
|
return fallback_message.to_string();
|
||||||
@@ -629,11 +1081,13 @@ mod tests {
|
|||||||
base_url: "https://vector.example".to_string(),
|
base_url: "https://vector.example".to_string(),
|
||||||
api_key: "test-key".to_string(),
|
api_key: "test-key".to_string(),
|
||||||
request_timeout_ms: 1_000_000,
|
request_timeout_ms: 1_000_000,
|
||||||
|
external_api_audit_state: None,
|
||||||
};
|
};
|
||||||
let v1_settings = OpenAiImageSettings {
|
let v1_settings = OpenAiImageSettings {
|
||||||
base_url: "https://vector.example/v1".to_string(),
|
base_url: "https://vector.example/v1".to_string(),
|
||||||
api_key: "test-key".to_string(),
|
api_key: "test-key".to_string(),
|
||||||
request_timeout_ms: 1_000_000,
|
request_timeout_ms: 1_000_000,
|
||||||
|
external_api_audit_state: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -658,4 +1112,41 @@ mod tests {
|
|||||||
assert_eq!(images.images[0].mime_type, "image/png");
|
assert_eq!(images.images[0].mime_type, "image/png");
|
||||||
assert_eq!(images.images[0].extension, "png");
|
assert_eq!(images.images[0].extension, "png");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A 429 upstream failure draft must convert into a tracking draft that
/// carries the external-API failure event key and the expected metadata.
#[test]
fn vector_engine_upstream_failure_builds_tracking_ready_audit_event() {
    let failure_draft = build_openai_image_failure_audit_draft(
        "https://vector.example/v1/images/generations",
        "拼图 UI 背景图生成失败",
        "upstream_status",
        Some(429),
        // No explicit status class is supplied here.
        // NOTE(review): the "4xx" asserted below is presumably derived from
        // the status code by the audit module; confirm.
        None,
        false,
        false,
        "上游限流",
        None,
        Some("{\"error\":\"rate limited\"}".to_string()),
        Some(321),
        Some(42),
        Some(1),
    );

    let event =
        crate::external_api_audit::build_external_api_failure_tracking_draft(&failure_draft);
    let metadata = &event.metadata;

    // Event identity.
    assert_eq!(
        event.event_key,
        crate::external_api_audit::EXTERNAL_API_FAILURE_EVENT_KEY
    );
    assert_eq!(event.scope_id, VECTOR_ENGINE_PROVIDER);

    // Failure classification.
    assert_eq!(metadata["provider"], VECTOR_ENGINE_PROVIDER);
    assert_eq!(metadata["statusCode"], 429);
    assert_eq!(metadata["statusClass"], "4xx");
    assert_eq!(metadata["failureStage"], "upstream_status");
    assert_eq!(metadata["retryable"], true);

    // Request metrics.
    assert_eq!(metadata["promptChars"], 42);
    assert_eq!(metadata["referenceImageCount"], 1);
    assert_eq!(metadata["imageModel"], VECTOR_ENGINE_GPT_IMAGE_2_MODEL);
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -172,6 +172,23 @@ pub(crate) fn update_tracking_outbox_pending_files(files: usize) {
|
|||||||
TRACKING_OUTBOX_PENDING_FILES.store(files.min(i64::MAX as usize) as i64, Ordering::Relaxed);
|
TRACKING_OUTBOX_PENDING_FILES.store(files.min(i64::MAX as usize) as i64, Ordering::Relaxed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn record_external_api_failure(
|
||||||
|
provider: &'static str,
|
||||||
|
failure_stage: &'static str,
|
||||||
|
status_class: &'static str,
|
||||||
|
retryable: bool,
|
||||||
|
) {
|
||||||
|
external_api_metrics().failures.add(
|
||||||
|
1,
|
||||||
|
&[
|
||||||
|
KeyValue::new("provider", provider),
|
||||||
|
KeyValue::new("failure_stage", failure_stage),
|
||||||
|
KeyValue::new("status_class", status_class),
|
||||||
|
KeyValue::new("retryable", retryable),
|
||||||
|
],
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
fn track_response_body_in_flight(response: Response<Body>) -> Response<Body> {
|
fn track_response_body_in_flight(response: Response<Body>) -> Response<Body> {
|
||||||
response.map(|body| {
|
response.map(|body| {
|
||||||
HTTP_RESPONSE_BODY_IN_FLIGHT.fetch_add(1, Ordering::Relaxed);
|
HTTP_RESPONSE_BODY_IN_FLIGHT.fetch_add(1, Ordering::Relaxed);
|
||||||
@@ -211,6 +228,10 @@ struct TrackingOutboxMetrics {
|
|||||||
flushed_bytes: Counter<u64>,
|
flushed_bytes: Counter<u64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ExternalApiMetrics {
|
||||||
|
failures: Counter<u64>,
|
||||||
|
}
|
||||||
|
|
||||||
struct HttpRequestPermitsAvailableGauges {
|
struct HttpRequestPermitsAvailableGauges {
|
||||||
default: Arc<AtomicI64>,
|
default: Arc<AtomicI64>,
|
||||||
gallery: Arc<AtomicI64>,
|
gallery: Arc<AtomicI64>,
|
||||||
@@ -359,6 +380,21 @@ fn tracking_outbox_metrics() -> &'static TrackingOutboxMetrics {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn external_api_metrics() -> &'static ExternalApiMetrics {
|
||||||
|
static METRICS: std::sync::OnceLock<ExternalApiMetrics> = std::sync::OnceLock::new();
|
||||||
|
METRICS.get_or_init(|| {
|
||||||
|
let meter = global::meter("genarrative-api");
|
||||||
|
ExternalApiMetrics {
|
||||||
|
failures: meter
|
||||||
|
.u64_counter("genarrative.external_api.failures")
|
||||||
|
.with_description(
|
||||||
|
"External API call failures grouped by provider and failure stage",
|
||||||
|
)
|
||||||
|
.build(),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
fn register_http_request_permits_available_metric() -> HttpRequestPermitsAvailableGauges {
|
fn register_http_request_permits_available_metric() -> HttpRequestPermitsAvailableGauges {
|
||||||
let gauges = HttpRequestPermitsAvailableGauges::new();
|
let gauges = HttpRequestPermitsAvailableGauges::new();
|
||||||
let meter = global::meter("genarrative-api");
|
let meter = global::meter("genarrative-api");
|
||||||
|
|||||||
@@ -584,6 +584,26 @@ async fn record_route_tracking_event_via_outbox_after_success(
|
|||||||
record_tracking_event_input_after_success(state, request_context, event).await;
|
record_tracking_event_input_after_success(state, request_context, event).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn build_tracking_event_input(
|
||||||
|
draft: TrackingEventDraft,
|
||||||
|
) -> module_runtime::RuntimeTrackingEventInput {
|
||||||
|
let occurred_at_micros = OffsetDateTime::now_utc().unix_timestamp_nanos() / 1_000;
|
||||||
|
let event_id = build_tracking_event_id(&draft, occurred_at_micros);
|
||||||
|
|
||||||
|
module_runtime::RuntimeTrackingEventInput {
|
||||||
|
event_id,
|
||||||
|
event_key: draft.event_key.to_string(),
|
||||||
|
scope_kind: draft.scope_kind,
|
||||||
|
scope_id: draft.scope_id,
|
||||||
|
user_id: draft.user_id,
|
||||||
|
owner_user_id: draft.owner_user_id,
|
||||||
|
profile_id: draft.profile_id,
|
||||||
|
module_key: draft.module_key.map(str::to_string),
|
||||||
|
metadata_json: draft.metadata.to_string(),
|
||||||
|
occurred_at_micros: occurred_at_micros as i64,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn record_tracking_event_input_after_success(
|
async fn record_tracking_event_input_after_success(
|
||||||
state: &AppState,
|
state: &AppState,
|
||||||
request_context: &RequestContext,
|
request_context: &RequestContext,
|
||||||
@@ -642,26 +662,6 @@ async fn record_tracking_event_input_after_success(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_tracking_event_input(
|
|
||||||
draft: TrackingEventDraft,
|
|
||||||
) -> module_runtime::RuntimeTrackingEventInput {
|
|
||||||
let occurred_at_micros = OffsetDateTime::now_utc().unix_timestamp_nanos() / 1_000;
|
|
||||||
let event_id = build_tracking_event_id(&draft, occurred_at_micros);
|
|
||||||
|
|
||||||
module_runtime::RuntimeTrackingEventInput {
|
|
||||||
event_id,
|
|
||||||
event_key: draft.event_key.to_string(),
|
|
||||||
scope_kind: draft.scope_kind,
|
|
||||||
scope_id: draft.scope_id,
|
|
||||||
user_id: draft.user_id,
|
|
||||||
owner_user_id: draft.owner_user_id,
|
|
||||||
profile_id: draft.profile_id,
|
|
||||||
module_key: draft.module_key.map(str::to_string),
|
|
||||||
metadata_json: draft.metadata.to_string(),
|
|
||||||
occurred_at_micros: occurred_at_micros as i64,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn build_tracking_event_id(draft: &TrackingEventDraft, occurred_at_micros: i128) -> String {
|
fn build_tracking_event_id(draft: &TrackingEventDraft, occurred_at_micros: i128) -> String {
|
||||||
if draft.event_key == "daily_login"
|
if draft.event_key == "daily_login"
|
||||||
&& draft.scope_kind == RuntimeTrackingScopeKind::User
|
&& draft.scope_kind == RuntimeTrackingScopeKind::User
|
||||||
|
|||||||
Reference in New Issue
Block a user