diff --git a/.hermes/shared-memory/pitfalls.md b/.hermes/shared-memory/pitfalls.md index 4aff3fc0..806ffb49 100644 --- a/.hermes/shared-memory/pitfalls.md +++ b/.hermes/shared-memory/pitfalls.md @@ -35,10 +35,10 @@ - 现象:`external_api_call_failure` 里看到 `failureStage=request_send`、`timeout=true`、`statusCode=null`,`errorSource` 可能是 `client error (SendRequest)` 或更完整的 reqwest 底层错误链,前端只知道图片生成失败。 - 原因:`timeout=true` 来自 `reqwest::Error::is_timeout()`,不是业务代码固定写死;`SendRequest` 是 Hyper 发送请求阶段的错误来源标签,只说明请求未拿到可归类的 HTTP 响应,不会包含上游 JSON 错误体。 -- 处理:先按 `provider/failureStage/statusClass` 聚合,再用 `user_id` / `profile_id` 和 `metadata_json.userId/profileId/requestId` 定位触发者、草稿 / 作品和同一次 HTTP 请求;`request_send + timeout=true` 优先查 provider 日志的 `source_chain`、请求体大小、参考图数量、出口网络、代理/Nginx、VectorEngine 当时可用性和同一 request_id 日志。若记录有 `502` 或 `429 moderation_blocked`,按上游网关或审核失败另行处理,不要归到传输超时。 +- 处理:先按 `provider/failureStage/statusClass` 聚合,再用 `user_id` / `profile_id` 和 `metadata_json.userId/profileId/requestId` 定位触发者、草稿 / 作品和同一次 HTTP 请求;`request_send + timeout=true` 优先查 provider 日志的 `source_chain`、请求体大小、参考图数量、出口网络、代理/Nginx、VectorEngine 当时可用性和同一 request_id 日志。当前 `platform-image` 对 `request_send` 的 `timeout` / `connect` 错误最多发送 3 次(即最多重试 2 次),multipart `/v1/images/edits` 每次重试都必须重建 form;看到 `VectorEngine 图片请求发送失败,准备重试` 只是单次 attempt 失败,最终 `external_api_call_failure` 才代表该用户请求整体失败。若记录有 `502` 或 `429 moderation_blocked`,按上游网关或审核失败另行处理,不要归到传输超时。 - 拼图关卡资产生成按 `level_scene -> ui_spritesheet -> level_background` 顺序执行,每个资产会输出 `slot`、`asset_kind`、`elapsed_ms`;排查拼图草稿失败时优先看同一 request_id 下最后一个失败 slot。 -- 验证:`cargo check -p api-server --manifest-path server-rs/Cargo.toml`;查询 `tracking_event` 时失败记录应能看到触发者 `user_id` 和可用的 `profile_id`。 -- 关联:`server-rs/crates/api-server/src/external_api_audit.rs`、`server-rs/crates/api-server/src/openai_image_generation.rs`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。 +- 验证:`cargo test -p platform-image --manifest-path server-rs/Cargo.toml vector_engine_image_edit_retries_send_timeout_once_and_succeeds`、`cargo 
check -p api-server --manifest-path server-rs/Cargo.toml`;查询 `tracking_event` 时失败记录应能看到触发者 `user_id` 和可用的 `profile_id`。 +- 关联:`server-rs/crates/platform-image/src/vector_engine/client.rs`、`server-rs/crates/api-server/src/external_api_audit.rs`、`server-rs/crates/api-server/src/openai_image_generation.rs`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。 ## “我的”页每日任务卡不要硬编码进度 diff --git a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md index c22cfd7c..79cc0c29 100644 --- a/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md +++ b/docs/【开发运维】本地开发验证与生产运维-2026-05-15.md @@ -1,6 +1,6 @@ # 本地开发验证与生产运维 -更新时间:`2026-05-15` +更新时间:`2026-06-05` ## 标准开发流程 @@ -69,6 +69,8 @@ spacetime sql "SELECT * FROM puzzle_gallery_card_view LIMIT 1" --serv 本地 `.env`、`.env.local` 或 `.env.secrets.local` 修改后必须重启 `api-server` 才会生效;若已经通过 `npm run dev` 启动完整联调,可在该终端输入 `rs api-server`。排查 RPG / 拼图 / 抓大鹅等 VectorEngine 生图链路时,确认 `VECTOR_ENGINE_BASE_URL`、`VECTOR_ENGINE_API_KEY` 和 `VECTOR_ENGINE_IMAGE_REQUEST_TIMEOUT_MS` 只在本地或服务器密钥文件中配置,不能写入 Git。VectorEngine `gpt-image-2` 图片协议、URL / base64 响应解析、远端图片下载和 provider 侧结构化日志在 `server-rs/crates/platform-image`;`api-server` 只做配置、玩法编排、OSS / asset 持久化、计费和失败审计落库。开局 CG 故事板、首图、背景和图集都属于长耗时图片请求;后端默认会把 `VECTOR_ENGINE_IMAGE_REQUEST_TIMEOUT_MS` 下限收口到 `1000000`,旧进程仍可能沿用重启前的短超时。若 VectorEngine 在 `send()` 阶段失败且日志显示 `SendRequest`,先看同一 `request_id` 的 provider 日志字段 `source`、`source_chain`、`source_chain_depth`,再查 `external_api_call_failure.metadata_json.errorSource`;当前 multipart `/v1/images/edits` 单独强制 HTTP/1.1。拼图关卡资产按 `level_scene -> ui_spritesheet -> level_background` 顺序生成,日志会带 `slot`、`asset_kind` 和 `elapsed_ms`。 +VectorEngine 图片生成 / 编辑在 `request_send` 阶段出现 `timeout` 或 `connect` 错误时,`platform-image` 会对同一请求最多发送 3 次;multipart 图片编辑每次重试都会重新构造 form,避免复用已消费的 body。日志中 `VectorEngine 图片请求发送失败,准备重试` 表示本次失败已进入下一次尝试;最终仍失败时才会写入 `external_api_call_failure` 并返回 504。排查生产失败时应同时统计 retry 前的尝试日志和最终 audit,避免把一次用户请求内的多次发送误判成多个用户请求。 + 查看本地 Rust / SpacetimeDB 日志: ```bash diff --git 
a/server-rs/crates/platform-image/Cargo.toml b/server-rs/crates/platform-image/Cargo.toml index f71fe161..b5a6feca 100644 --- a/server-rs/crates/platform-image/Cargo.toml +++ b/server-rs/crates/platform-image/Cargo.toml @@ -9,6 +9,6 @@ base64 = { workspace = true } image = { workspace = true, features = ["jpeg", "png", "webp"] } reqwest = { workspace = true, features = ["json", "multipart", "rustls-tls"] } serde_json = { workspace = true } -tokio = { workspace = true, features = ["time"] } +tokio = { workspace = true, features = ["io-util", "macros", "net", "time"] } tracing = { workspace = true } platform-oss = { workspace = true } diff --git a/server-rs/crates/platform-image/src/vector_engine/client.rs b/server-rs/crates/platform-image/src/vector_engine/client.rs index ee5524a0..afae18da 100644 --- a/server-rs/crates/platform-image/src/vector_engine/client.rs +++ b/server-rs/crates/platform-image/src/vector_engine/client.rs @@ -1,4 +1,7 @@ -use reqwest::header; +use reqwest::{header, multipart}; + +const VECTOR_ENGINE_SEND_MAX_ATTEMPTS: u32 = 3; +const VECTOR_ENGINE_SEND_RETRY_BASE_DELAY_MS: u64 = 500; use super::{ constants::{GPT_IMAGE_2_MODEL, VECTOR_ENGINE_PROVIDER}, @@ -50,30 +53,49 @@ pub async fn create_vector_engine_image_generation( reference_images, ); let started_at = std::time::Instant::now(); - let response = match http_client - .post(request_url.as_str()) - .header( - header::AUTHORIZATION, - format!("Bearer {}", settings.api_key), - ) - .header(header::ACCEPT, "application/json") - .header(header::CONTENT_TYPE, "application/json") - .json(&request_body) - .send() - .await - { - Ok(response) => response, - Err(error) => { - return Err(map_reqwest_error( - format!("{failure_context}:创建图片生成任务失败").as_str(), - request_url.as_str(), - "request_send", - error, - started_at.elapsed().as_millis() as u64, - Some(prompt.chars().count()), - Some(reference_images.len()), - Some(&request_body), - )); + let mut attempt = 1; + let response = loop { + match 
http_client + .post(request_url.as_str()) + .header( + header::AUTHORIZATION, + format!("Bearer {}", settings.api_key), + ) + .header(header::ACCEPT, "application/json") + .header(header::CONTENT_TYPE, "application/json") + .json(&request_body) + .send() + .await + { + Ok(response) => break response, + Err(error) => { + if should_retry_vector_engine_send_error(&error, attempt) { + retry_vector_engine_send_after_delay( + "generation", + request_url.as_str(), + "request_send", + attempt, + &error, + started_at.elapsed().as_millis() as u64, + Some(prompt.chars().count()), + Some(reference_images.len()), + Some(&request_body), + ) + .await; + attempt += 1; + continue; + } + return Err(map_reqwest_error( + format!("{failure_context}:创建图片生成任务失败").as_str(), + request_url.as_str(), + "request_send", + error, + started_at.elapsed().as_millis() as u64, + Some(prompt.chars().count()), + Some(reference_images.len()), + Some(&request_body), + )); + } } }; let response_status = response.status(); @@ -84,6 +106,7 @@ pub async fn create_vector_engine_image_generation( prompt_chars = prompt.chars().count(), size = %normalized_size, reference_image_count = reference_images.len(), + attempt, elapsed_ms = started_at.elapsed().as_millis() as u64, failure_context, "VectorEngine 图片生成 HTTP 返回" @@ -167,26 +190,6 @@ pub async fn create_vector_engine_image_edit_with_references( reference_images, ); - let mut form = reqwest::multipart::Form::new() - .text("model", GPT_IMAGE_2_MODEL.to_string()) - .text( - "prompt", - build_prompt_with_negative(prompt, negative_prompt), - ) - .text("n", candidate_count.clamp(1, 4).to_string()) - .text("size", normalized_size.clone()); - - for reference_image in reference_images.iter().take(5) { - let image_part = reqwest::multipart::Part::bytes(reference_image.bytes.clone()) - .file_name(reference_image.file_name.clone()) - .mime_str(reference_image.mime_type.as_str()) - .map_err(|error| PlatformImageError::InvalidRequest { - provider: VECTOR_ENGINE_PROVIDER, 
- message: format!("{failure_context}:构造参考图失败:{error}"), - })?; - form = form.part("image", image_part); - } - let reference_image_count = reference_images.iter().take(5).count(); let reference_image_bytes_total: usize = reference_images .iter() @@ -214,29 +217,56 @@ pub async fn create_vector_engine_image_edit_with_references( failure_context, "VectorEngine 图片编辑请求参数" ); - let response = match http_client - .post(request_url.as_str()) - .header( - header::AUTHORIZATION, - format!("Bearer {}", settings.api_key), - ) - .header(header::ACCEPT, "application/json") - .multipart(form) - .send() - .await - { - Ok(response) => response, - Err(error) => { - return Err(map_reqwest_error( - format!("{failure_context}:创建图片编辑任务失败").as_str(), - request_url.as_str(), - "request_send", - error, - started_at.elapsed().as_millis() as u64, - Some(prompt.chars().count()), - Some(reference_image_count), - Some(&request_params), - )); + let mut attempt = 1; + let response = loop { + let form = build_vector_engine_image_edit_form( + prompt, + negative_prompt, + normalized_size.as_str(), + candidate_count, + reference_images, + failure_context, + )?; + match http_client + .post(request_url.as_str()) + .header( + header::AUTHORIZATION, + format!("Bearer {}", settings.api_key), + ) + .header(header::ACCEPT, "application/json") + .multipart(form) + .send() + .await + { + Ok(response) => break response, + Err(error) => { + if should_retry_vector_engine_send_error(&error, attempt) { + retry_vector_engine_send_after_delay( + "edit", + request_url.as_str(), + "request_send", + attempt, + &error, + started_at.elapsed().as_millis() as u64, + Some(prompt.chars().count()), + Some(reference_image_count), + Some(&request_params), + ) + .await; + attempt += 1; + continue; + } + return Err(map_reqwest_error( + format!("{failure_context}:创建图片编辑任务失败").as_str(), + request_url.as_str(), + "request_send", + error, + started_at.elapsed().as_millis() as u64, + Some(prompt.chars().count()), + 
Some(reference_image_count), + Some(&request_params), + )); + } } }; let response_status = response.status(); @@ -249,6 +279,7 @@ pub async fn create_vector_engine_image_edit_with_references( reference_image_count, reference_image_bytes_total, request_params = %request_params, + attempt, elapsed_ms = started_at.elapsed().as_millis() as u64, failure_context, "VectorEngine 图片编辑 HTTP 返回" @@ -282,3 +313,75 @@ pub async fn create_vector_engine_image_edit_with_references( ) .await } + +fn build_vector_engine_image_edit_form( + prompt: &str, + negative_prompt: Option<&str>, + normalized_size: &str, + candidate_count: u32, + reference_images: &[ReferenceImage], + failure_context: &str, +) -> Result { + let mut form = multipart::Form::new() + .text("model", GPT_IMAGE_2_MODEL.to_string()) + .text( + "prompt", + build_prompt_with_negative(prompt, negative_prompt), + ) + .text("n", candidate_count.clamp(1, 4).to_string()) + .text("size", normalized_size.to_string()); + + for reference_image in reference_images.iter().take(5) { + let image_part = multipart::Part::bytes(reference_image.bytes.clone()) + .file_name(reference_image.file_name.clone()) + .mime_str(reference_image.mime_type.as_str()) + .map_err(|error| PlatformImageError::InvalidRequest { + provider: VECTOR_ENGINE_PROVIDER, + message: format!("{failure_context}:构造参考图失败:{error}"), + })?; + form = form.part("image", image_part); + } + + Ok(form) +} + +fn should_retry_vector_engine_send_error(error: &reqwest::Error, attempt: u32) -> bool { + attempt < VECTOR_ENGINE_SEND_MAX_ATTEMPTS && (error.is_timeout() || error.is_connect()) +} + +async fn retry_vector_engine_send_after_delay( + request_kind: &'static str, + request_url: &str, + failure_stage: &'static str, + attempt: u32, + error: &reqwest::Error, + elapsed_ms: u64, + prompt_chars: Option, + reference_image_count: Option, + request_params: Option<&serde_json::Value>, +) { + let delay_ms = VECTOR_ENGINE_SEND_RETRY_BASE_DELAY_MS * u64::from(attempt); + tracing::warn!( 
+ provider = VECTOR_ENGINE_PROVIDER, + endpoint = %request_url, + request_kind, + failure_stage, + attempt, + max_attempts = VECTOR_ENGINE_SEND_MAX_ATTEMPTS, + retry_delay_ms = delay_ms, + timeout = error.is_timeout(), + connect = error.is_connect(), + request = error.is_request(), + body = error.is_body(), + status = error.status().map(|status| status.as_u16()).unwrap_or_default(), + error = %error, + elapsed_ms, + prompt_chars, + reference_image_count, + request_params = %request_params + .map(|value| value.to_string()) + .unwrap_or_default(), + "VectorEngine 图片请求发送失败,准备重试" + ); + tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; +} diff --git a/server-rs/crates/platform-image/tests/vector_engine.rs b/server-rs/crates/platform-image/tests/vector_engine.rs index e9bfb1e0..c53d63c2 100644 --- a/server-rs/crates/platform-image/tests/vector_engine.rs +++ b/server-rs/crates/platform-image/tests/vector_engine.rs @@ -1,8 +1,20 @@ use platform_image::vector_engine::{ - GPT_IMAGE_2_MODEL, VECTOR_ENGINE_PROVIDER, VectorEngineImageSettings, - build_vector_engine_image_request_body, vector_engine_images_edit_url, + GPT_IMAGE_2_MODEL, ReferenceImage, VECTOR_ENGINE_PROVIDER, VectorEngineImageSettings, + build_vector_engine_image_http_client, build_vector_engine_image_request_body, + create_vector_engine_image_edit, vector_engine_images_edit_url, vector_engine_images_generation_url, }; +use std::{ + sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, + }, + time::Duration, +}; +use tokio::{ + io::{AsyncReadExt, AsyncWriteExt}, + net::TcpListener, +}; #[test] fn vector_engine_module_exposes_provider_protocol_helpers() { @@ -30,3 +42,70 @@ fn vector_engine_module_exposes_provider_protocol_helpers() { "https://vector.example/v1/images/edits" ); } + +#[tokio::test] +async fn vector_engine_image_edit_retries_send_timeout_once_and_succeeds() { + let listener = TcpListener::bind("127.0.0.1:0") + .await + .expect("mock server should bind"); + let server_addr = 
listener + .local_addr() + .expect("mock server address should be readable"); + let request_count = Arc::new(AtomicUsize::new(0)); + let request_count_for_server = Arc::clone(&request_count); + + let server = tokio::spawn(async move { + loop { + let Ok((mut stream, _)) = listener.accept().await else { + break; + }; + let request_index = request_count_for_server.fetch_add(1, Ordering::SeqCst); + tokio::spawn(async move { + let mut buffer = [0_u8; 4096]; + let _ = stream.read(&mut buffer).await; + if request_index == 0 { + tokio::time::sleep(Duration::from_millis(120)).await; + return; + } + + let body = r#"{"data":[{"b64_json":"iVBORw0KGgpyZXN0"}]}"#; + let response = format!( + "HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\n\r\n{}", + body.len(), + body + ); + let _ = stream.write_all(response.as_bytes()).await; + }); + } + }); + + let settings = VectorEngineImageSettings { + base_url: format!("http://{server_addr}/v1"), + api_key: "test-key".to_string(), + request_timeout_ms: 40, + }; + let http_client = + build_vector_engine_image_http_client(&settings).expect("client should build"); + let reference_image = ReferenceImage { + bytes: b"reference".to_vec(), + mime_type: "image/png".to_string(), + file_name: "reference.png".to_string(), + }; + + let generated = create_vector_engine_image_edit( + &http_client, + &settings, + "测试提示词", + None, + "1024x1024", + &reference_image, + "测试 VectorEngine 图片编辑失败", + ) + .await + .expect("second attempt should return generated image"); + + assert_eq!(generated.images.len(), 1); + assert_eq!(generated.images[0].mime_type, "image/png"); + assert_eq!(request_count.load(Ordering::SeqCst), 2); + server.abort(); +}