fix: retry VectorEngine image send timeouts

This commit is contained in:
kdletters
2026-06-05 11:37:53 +08:00
parent 0b07161034
commit 9ab353926e
5 changed files with 259 additions and 75 deletions

View File

@@ -35,10 +35,10 @@
- 现象:`external_api_call_failure` 里看到 `failureStage=request_send``timeout=true``statusCode=null``errorSource` 可能是 `client error (SendRequest)` 或更完整的 reqwest 底层错误链,前端只知道图片生成失败。
- 原因:`timeout=true` 来自 `reqwest::Error::is_timeout()`,不是业务代码固定写死;`SendRequest` 是 Hyper 发送请求阶段的错误来源标签,只说明请求未拿到可归类的 HTTP 响应,不会包含上游 JSON 错误体。
- 处理:先按 `provider/failureStage/statusClass` 聚合,再用 `user_id` / `profile_id``metadata_json.userId/profileId/requestId` 定位触发者、草稿 / 作品和同一次 HTTP 请求;`request_send + timeout=true` 优先查 provider 日志的 `source_chain`、请求体大小、参考图数量、出口网络、代理/Nginx、VectorEngine 当时可用性和同一 request_id 日志。若记录有 `502``429 moderation_blocked`,按上游网关或审核失败另行处理,不要归到传输超时。
- 处理:先按 `provider/failureStage/statusClass` 聚合,再用 `user_id` / `profile_id``metadata_json.userId/profileId/requestId` 定位触发者、草稿 / 作品和同一次 HTTP 请求;`request_send + timeout=true` 优先查 provider 日志的 `source_chain`、请求体大小、参考图数量、出口网络、代理/Nginx、VectorEngine 当时可用性和同一 request_id 日志。当前 `platform-image``request_send``timeout` / `connect` 错误最多重试 3 次multipart `/v1/images/edits` 每次重试都必须重建 form看到 `VectorEngine 图片请求发送失败,准备重试` 只是单次 attempt 失败,最终 `external_api_call_failure` 才代表该用户请求整体失败。若记录有 `502``429 moderation_blocked`,按上游网关或审核失败另行处理,不要归到传输超时。
- 拼图关卡资产生成按 `level_scene -> ui_spritesheet -> level_background` 顺序执行,每个资产会输出 `slot``asset_kind``elapsed_ms`;排查拼图草稿失败时优先看同一 request_id 下最后一个失败 slot。
- 验证:`cargo check -p api-server --manifest-path server-rs/Cargo.toml`;查询 `tracking_event` 时失败记录应能看到触发者 `user_id` 和可用的 `profile_id`
- 关联:`server-rs/crates/api-server/src/external_api_audit.rs``server-rs/crates/api-server/src/openai_image_generation.rs``docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`
- 验证:`cargo test -p platform-image --manifest-path server-rs/Cargo.toml vector_engine_image_edit_retries_send_timeout_once_and_succeeds``cargo check -p api-server --manifest-path server-rs/Cargo.toml`;查询 `tracking_event` 时失败记录应能看到触发者 `user_id` 和可用的 `profile_id`
- 关联:`server-rs/crates/platform-image/src/vector_engine/client.rs``server-rs/crates/api-server/src/external_api_audit.rs``server-rs/crates/api-server/src/openai_image_generation.rs``docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`
## “我的”页每日任务卡不要硬编码进度

View File

@@ -1,6 +1,6 @@
# 本地开发验证与生产运维
更新时间:`2026-05-15`
更新时间:`2026-06-05`
## 标准开发流程
@@ -69,6 +69,8 @@ spacetime sql <database> "SELECT * FROM puzzle_gallery_card_view LIMIT 1" --serv
本地 `.env``.env.local``.env.secrets.local` 修改后必须重启 `api-server` 才会生效;若已经通过 `npm run dev` 启动完整联调,可在该终端输入 `rs api-server`。排查 RPG / 拼图 / 抓大鹅等 VectorEngine 生图链路时,确认 `VECTOR_ENGINE_BASE_URL``VECTOR_ENGINE_API_KEY``VECTOR_ENGINE_IMAGE_REQUEST_TIMEOUT_MS` 只在本地或服务器密钥文件中配置,不能写入 Git。VectorEngine `gpt-image-2` 图片协议、URL / base64 响应解析、远端图片下载和 provider 侧结构化日志在 `server-rs/crates/platform-image``api-server` 只做配置、玩法编排、OSS / asset 持久化、计费和失败审计落库。开局 CG 故事板、首图、背景和图集都属于长耗时图片请求;后端默认会把 `VECTOR_ENGINE_IMAGE_REQUEST_TIMEOUT_MS` 下限收口到 `1000000`,旧进程仍可能沿用重启前的短超时。若 VectorEngine 在 `send()` 阶段失败且日志显示 `SendRequest`,先看同一 `request_id` 的 provider 日志字段 `source``source_chain``source_chain_depth`,再查 `external_api_call_failure.metadata_json.errorSource`;当前 multipart `/v1/images/edits` 单独强制 HTTP/1.1。拼图关卡资产按 `level_scene -> ui_spritesheet -> level_background` 顺序生成,日志会带 `slot``asset_kind``elapsed_ms`
VectorEngine 图片生成 / 编辑在 `request_send` 阶段出现 `timeout``connect` 错误时,`platform-image` 会对同一请求最多发送 3 次multipart 图片编辑每次重试都会重新构造 form避免复用已消费的 body。日志中 `VectorEngine 图片请求发送失败,准备重试` 表示本次失败已进入下一次尝试;最终仍失败时才会写入 `external_api_call_failure` 并返回 504。排查生产失败时应同时统计 retry 前的尝试日志和最终 audit避免把一次用户请求内的多次发送误判成多个用户请求。
查看本地 Rust / SpacetimeDB 日志:
```bash

View File

@@ -9,6 +9,6 @@ base64 = { workspace = true }
image = { workspace = true, features = ["jpeg", "png", "webp"] }
reqwest = { workspace = true, features = ["json", "multipart", "rustls-tls"] }
serde_json = { workspace = true }
tokio = { workspace = true, features = ["time"] }
tokio = { workspace = true, features = ["io-util", "macros", "net", "time"] }
tracing = { workspace = true }
platform-oss = { workspace = true }

View File

@@ -1,4 +1,7 @@
use reqwest::header;
use reqwest::{header, multipart};
const VECTOR_ENGINE_SEND_MAX_ATTEMPTS: u32 = 3;
const VECTOR_ENGINE_SEND_RETRY_BASE_DELAY_MS: u64 = 500;
use super::{
constants::{GPT_IMAGE_2_MODEL, VECTOR_ENGINE_PROVIDER},
@@ -50,30 +53,49 @@ pub async fn create_vector_engine_image_generation(
reference_images,
);
let started_at = std::time::Instant::now();
let response = match http_client
.post(request_url.as_str())
.header(
header::AUTHORIZATION,
format!("Bearer {}", settings.api_key),
)
.header(header::ACCEPT, "application/json")
.header(header::CONTENT_TYPE, "application/json")
.json(&request_body)
.send()
.await
{
Ok(response) => response,
Err(error) => {
return Err(map_reqwest_error(
format!("{failure_context}:创建图片生成任务失败").as_str(),
request_url.as_str(),
"request_send",
error,
started_at.elapsed().as_millis() as u64,
Some(prompt.chars().count()),
Some(reference_images.len()),
Some(&request_body),
));
let mut attempt = 1;
let response = loop {
match http_client
.post(request_url.as_str())
.header(
header::AUTHORIZATION,
format!("Bearer {}", settings.api_key),
)
.header(header::ACCEPT, "application/json")
.header(header::CONTENT_TYPE, "application/json")
.json(&request_body)
.send()
.await
{
Ok(response) => break response,
Err(error) => {
if should_retry_vector_engine_send_error(&error, attempt) {
retry_vector_engine_send_after_delay(
"generation",
request_url.as_str(),
"request_send",
attempt,
&error,
started_at.elapsed().as_millis() as u64,
Some(prompt.chars().count()),
Some(reference_images.len()),
Some(&request_body),
)
.await;
attempt += 1;
continue;
}
return Err(map_reqwest_error(
format!("{failure_context}:创建图片生成任务失败").as_str(),
request_url.as_str(),
"request_send",
error,
started_at.elapsed().as_millis() as u64,
Some(prompt.chars().count()),
Some(reference_images.len()),
Some(&request_body),
));
}
}
};
let response_status = response.status();
@@ -84,6 +106,7 @@ pub async fn create_vector_engine_image_generation(
prompt_chars = prompt.chars().count(),
size = %normalized_size,
reference_image_count = reference_images.len(),
attempt,
elapsed_ms = started_at.elapsed().as_millis() as u64,
failure_context,
"VectorEngine 图片生成 HTTP 返回"
@@ -167,26 +190,6 @@ pub async fn create_vector_engine_image_edit_with_references(
reference_images,
);
let mut form = reqwest::multipart::Form::new()
.text("model", GPT_IMAGE_2_MODEL.to_string())
.text(
"prompt",
build_prompt_with_negative(prompt, negative_prompt),
)
.text("n", candidate_count.clamp(1, 4).to_string())
.text("size", normalized_size.clone());
for reference_image in reference_images.iter().take(5) {
let image_part = reqwest::multipart::Part::bytes(reference_image.bytes.clone())
.file_name(reference_image.file_name.clone())
.mime_str(reference_image.mime_type.as_str())
.map_err(|error| PlatformImageError::InvalidRequest {
provider: VECTOR_ENGINE_PROVIDER,
message: format!("{failure_context}:构造参考图失败:{error}"),
})?;
form = form.part("image", image_part);
}
let reference_image_count = reference_images.iter().take(5).count();
let reference_image_bytes_total: usize = reference_images
.iter()
@@ -214,29 +217,56 @@ pub async fn create_vector_engine_image_edit_with_references(
failure_context,
"VectorEngine 图片编辑请求参数"
);
let response = match http_client
.post(request_url.as_str())
.header(
header::AUTHORIZATION,
format!("Bearer {}", settings.api_key),
)
.header(header::ACCEPT, "application/json")
.multipart(form)
.send()
.await
{
Ok(response) => response,
Err(error) => {
return Err(map_reqwest_error(
format!("{failure_context}:创建图片编辑任务失败").as_str(),
request_url.as_str(),
"request_send",
error,
started_at.elapsed().as_millis() as u64,
Some(prompt.chars().count()),
Some(reference_image_count),
Some(&request_params),
));
let mut attempt = 1;
let response = loop {
let form = build_vector_engine_image_edit_form(
prompt,
negative_prompt,
normalized_size.as_str(),
candidate_count,
reference_images,
failure_context,
)?;
match http_client
.post(request_url.as_str())
.header(
header::AUTHORIZATION,
format!("Bearer {}", settings.api_key),
)
.header(header::ACCEPT, "application/json")
.multipart(form)
.send()
.await
{
Ok(response) => break response,
Err(error) => {
if should_retry_vector_engine_send_error(&error, attempt) {
retry_vector_engine_send_after_delay(
"edit",
request_url.as_str(),
"request_send",
attempt,
&error,
started_at.elapsed().as_millis() as u64,
Some(prompt.chars().count()),
Some(reference_image_count),
Some(&request_params),
)
.await;
attempt += 1;
continue;
}
return Err(map_reqwest_error(
format!("{failure_context}:创建图片编辑任务失败").as_str(),
request_url.as_str(),
"request_send",
error,
started_at.elapsed().as_millis() as u64,
Some(prompt.chars().count()),
Some(reference_image_count),
Some(&request_params),
));
}
}
};
let response_status = response.status();
@@ -249,6 +279,7 @@ pub async fn create_vector_engine_image_edit_with_references(
reference_image_count,
reference_image_bytes_total,
request_params = %request_params,
attempt,
elapsed_ms = started_at.elapsed().as_millis() as u64,
failure_context,
"VectorEngine 图片编辑 HTTP 返回"
@@ -282,3 +313,75 @@ pub async fn create_vector_engine_image_edit_with_references(
)
.await
}
fn build_vector_engine_image_edit_form(
prompt: &str,
negative_prompt: Option<&str>,
normalized_size: &str,
candidate_count: u32,
reference_images: &[ReferenceImage],
failure_context: &str,
) -> Result<multipart::Form, PlatformImageError> {
let mut form = multipart::Form::new()
.text("model", GPT_IMAGE_2_MODEL.to_string())
.text(
"prompt",
build_prompt_with_negative(prompt, negative_prompt),
)
.text("n", candidate_count.clamp(1, 4).to_string())
.text("size", normalized_size.to_string());
for reference_image in reference_images.iter().take(5) {
let image_part = multipart::Part::bytes(reference_image.bytes.clone())
.file_name(reference_image.file_name.clone())
.mime_str(reference_image.mime_type.as_str())
.map_err(|error| PlatformImageError::InvalidRequest {
provider: VECTOR_ENGINE_PROVIDER,
message: format!("{failure_context}:构造参考图失败:{error}"),
})?;
form = form.part("image", image_part);
}
Ok(form)
}
fn should_retry_vector_engine_send_error(error: &reqwest::Error, attempt: u32) -> bool {
attempt < VECTOR_ENGINE_SEND_MAX_ATTEMPTS && (error.is_timeout() || error.is_connect())
}
async fn retry_vector_engine_send_after_delay(
request_kind: &'static str,
request_url: &str,
failure_stage: &'static str,
attempt: u32,
error: &reqwest::Error,
elapsed_ms: u64,
prompt_chars: Option<usize>,
reference_image_count: Option<usize>,
request_params: Option<&serde_json::Value>,
) {
let delay_ms = VECTOR_ENGINE_SEND_RETRY_BASE_DELAY_MS * u64::from(attempt);
tracing::warn!(
provider = VECTOR_ENGINE_PROVIDER,
endpoint = %request_url,
request_kind,
failure_stage,
attempt,
max_attempts = VECTOR_ENGINE_SEND_MAX_ATTEMPTS,
retry_delay_ms = delay_ms,
timeout = error.is_timeout(),
connect = error.is_connect(),
request = error.is_request(),
body = error.is_body(),
status = error.status().map(|status| status.as_u16()).unwrap_or_default(),
error = %error,
elapsed_ms,
prompt_chars,
reference_image_count,
request_params = %request_params
.map(|value| value.to_string())
.unwrap_or_default(),
"VectorEngine 图片请求发送失败,准备重试"
);
tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
}

View File

@@ -1,8 +1,20 @@
use platform_image::vector_engine::{
GPT_IMAGE_2_MODEL, VECTOR_ENGINE_PROVIDER, VectorEngineImageSettings,
build_vector_engine_image_request_body, vector_engine_images_edit_url,
GPT_IMAGE_2_MODEL, ReferenceImage, VECTOR_ENGINE_PROVIDER, VectorEngineImageSettings,
build_vector_engine_image_http_client, build_vector_engine_image_request_body,
create_vector_engine_image_edit, vector_engine_images_edit_url,
vector_engine_images_generation_url,
};
use std::{
sync::{
Arc,
atomic::{AtomicUsize, Ordering},
},
time::Duration,
};
use tokio::{
io::{AsyncReadExt, AsyncWriteExt},
net::TcpListener,
};
#[test]
fn vector_engine_module_exposes_provider_protocol_helpers() {
@@ -30,3 +42,70 @@ fn vector_engine_module_exposes_provider_protocol_helpers() {
"https://vector.example/v1/images/edits"
);
}
#[tokio::test]
async fn vector_engine_image_edit_retries_send_timeout_once_and_succeeds() {
let listener = TcpListener::bind("127.0.0.1:0")
.await
.expect("mock server should bind");
let server_addr = listener
.local_addr()
.expect("mock server address should be readable");
let request_count = Arc::new(AtomicUsize::new(0));
let request_count_for_server = Arc::clone(&request_count);
let server = tokio::spawn(async move {
loop {
let Ok((mut stream, _)) = listener.accept().await else {
break;
};
let request_index = request_count_for_server.fetch_add(1, Ordering::SeqCst);
tokio::spawn(async move {
let mut buffer = [0_u8; 4096];
let _ = stream.read(&mut buffer).await;
if request_index == 0 {
tokio::time::sleep(Duration::from_millis(120)).await;
return;
}
let body = r#"{"data":[{"b64_json":"iVBORw0KGgpyZXN0"}]}"#;
let response = format!(
"HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\n\r\n{}",
body.len(),
body
);
let _ = stream.write_all(response.as_bytes()).await;
});
}
});
let settings = VectorEngineImageSettings {
base_url: format!("http://{server_addr}/v1"),
api_key: "test-key".to_string(),
request_timeout_ms: 40,
};
let http_client =
build_vector_engine_image_http_client(&settings).expect("client should build");
let reference_image = ReferenceImage {
bytes: b"reference".to_vec(),
mime_type: "image/png".to_string(),
file_name: "reference.png".to_string(),
};
let generated = create_vector_engine_image_edit(
&http_client,
&settings,
"测试提示词",
None,
"1024x1024",
&reference_image,
"测试 VectorEngine 图片编辑失败",
)
.await
.expect("second attempt should return generated image");
assert_eq!(generated.images.len(), 1);
assert_eq!(generated.images[0].mime_type, "image/png");
assert_eq!(request_count.load(Ordering::SeqCst), 2);
server.abort();
}