fix: retry VectorEngine image send timeouts
This commit is contained in:
@@ -35,10 +35,10 @@
|
||||
|
||||
- 现象:`external_api_call_failure` 里看到 `failureStage=request_send`、`timeout=true`、`statusCode=null`,`errorSource` 可能是 `client error (SendRequest)` 或更完整的 reqwest 底层错误链,前端只知道图片生成失败。
|
||||
- 原因:`timeout=true` 来自 `reqwest::Error::is_timeout()`,不是业务代码固定写死;`SendRequest` 是 Hyper 发送请求阶段的错误来源标签,只说明请求未拿到可归类的 HTTP 响应,不会包含上游 JSON 错误体。
|
||||
- 处理:先按 `provider/failureStage/statusClass` 聚合,再用 `user_id` / `profile_id` 和 `metadata_json.userId/profileId/requestId` 定位触发者、草稿 / 作品和同一次 HTTP 请求;`request_send + timeout=true` 优先查 provider 日志的 `source_chain`、请求体大小、参考图数量、出口网络、代理/Nginx、VectorEngine 当时可用性和同一 request_id 日志。若记录有 `502` 或 `429 moderation_blocked`,按上游网关或审核失败另行处理,不要归到传输超时。
|
||||
- 处理:先按 `provider/failureStage/statusClass` 聚合,再用 `user_id` / `profile_id` 和 `metadata_json.userId/profileId/requestId` 定位触发者、草稿 / 作品和同一次 HTTP 请求;`request_send + timeout=true` 优先查 provider 日志的 `source_chain`、请求体大小、参考图数量、出口网络、代理/Nginx、VectorEngine 当时可用性和同一 request_id 日志。当前 `platform-image` 对 `request_send` 的 `timeout` / `connect` 错误最多重试 3 次,multipart `/v1/images/edits` 每次重试都必须重建 form;看到 `VectorEngine 图片请求发送失败,准备重试` 只是单次 attempt 失败,最终 `external_api_call_failure` 才代表该用户请求整体失败。若记录有 `502` 或 `429 moderation_blocked`,按上游网关或审核失败另行处理,不要归到传输超时。
|
||||
- 拼图关卡资产生成按 `level_scene -> ui_spritesheet -> level_background` 顺序执行,每个资产会输出 `slot`、`asset_kind`、`elapsed_ms`;排查拼图草稿失败时优先看同一 request_id 下最后一个失败 slot。
|
||||
- 验证:`cargo check -p api-server --manifest-path server-rs/Cargo.toml`;查询 `tracking_event` 时失败记录应能看到触发者 `user_id` 和可用的 `profile_id`。
|
||||
- 关联:`server-rs/crates/api-server/src/external_api_audit.rs`、`server-rs/crates/api-server/src/openai_image_generation.rs`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。
|
||||
- 验证:`cargo test -p platform-image --manifest-path server-rs/Cargo.toml vector_engine_image_edit_retries_send_timeout_once_and_succeeds`、`cargo check -p api-server --manifest-path server-rs/Cargo.toml`;查询 `tracking_event` 时失败记录应能看到触发者 `user_id` 和可用的 `profile_id`。
|
||||
- 关联:`server-rs/crates/platform-image/src/vector_engine/client.rs`、`server-rs/crates/api-server/src/external_api_audit.rs`、`server-rs/crates/api-server/src/openai_image_generation.rs`、`docs/【开发运维】本地开发验证与生产运维-2026-05-15.md`。
|
||||
|
||||
## “我的”页每日任务卡不要硬编码进度
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# 本地开发验证与生产运维
|
||||
|
||||
更新时间:`2026-05-15`
|
||||
更新时间:`2026-06-05`
|
||||
|
||||
## 标准开发流程
|
||||
|
||||
@@ -69,6 +69,8 @@ spacetime sql <database> "SELECT * FROM puzzle_gallery_card_view LIMIT 1" --serv
|
||||
|
||||
本地 `.env`、`.env.local` 或 `.env.secrets.local` 修改后必须重启 `api-server` 才会生效;若已经通过 `npm run dev` 启动完整联调,可在该终端输入 `rs api-server`。排查 RPG / 拼图 / 抓大鹅等 VectorEngine 生图链路时,确认 `VECTOR_ENGINE_BASE_URL`、`VECTOR_ENGINE_API_KEY` 和 `VECTOR_ENGINE_IMAGE_REQUEST_TIMEOUT_MS` 只在本地或服务器密钥文件中配置,不能写入 Git。VectorEngine `gpt-image-2` 图片协议、URL / base64 响应解析、远端图片下载和 provider 侧结构化日志在 `server-rs/crates/platform-image`;`api-server` 只做配置、玩法编排、OSS / asset 持久化、计费和失败审计落库。开局 CG 故事板、首图、背景和图集都属于长耗时图片请求;后端默认会把 `VECTOR_ENGINE_IMAGE_REQUEST_TIMEOUT_MS` 下限收口到 `1000000`,旧进程仍可能沿用重启前的短超时。若 VectorEngine 在 `send()` 阶段失败且日志显示 `SendRequest`,先看同一 `request_id` 的 provider 日志字段 `source`、`source_chain`、`source_chain_depth`,再查 `external_api_call_failure.metadata_json.errorSource`;当前 multipart `/v1/images/edits` 单独强制 HTTP/1.1。拼图关卡资产按 `level_scene -> ui_spritesheet -> level_background` 顺序生成,日志会带 `slot`、`asset_kind` 和 `elapsed_ms`。
|
||||
|
||||
VectorEngine 图片生成 / 编辑在 `request_send` 阶段出现 `timeout` 或 `connect` 错误时,`platform-image` 会对同一请求最多发送 3 次;multipart 图片编辑每次重试都会重新构造 form,避免复用已消费的 body。日志中 `VectorEngine 图片请求发送失败,准备重试` 表示本次失败已进入下一次尝试;最终仍失败时才会写入 `external_api_call_failure` 并返回 504。排查生产失败时应同时统计 retry 前的尝试日志和最终 audit,避免把一次用户请求内的多次发送误判成多个用户请求。
|
||||
|
||||
查看本地 Rust / SpacetimeDB 日志:
|
||||
|
||||
```bash
|
||||
|
||||
@@ -9,6 +9,6 @@ base64 = { workspace = true }
|
||||
image = { workspace = true, features = ["jpeg", "png", "webp"] }
|
||||
reqwest = { workspace = true, features = ["json", "multipart", "rustls-tls"] }
|
||||
serde_json = { workspace = true }
|
||||
tokio = { workspace = true, features = ["time"] }
|
||||
tokio = { workspace = true, features = ["io-util", "macros", "net", "time"] }
|
||||
tracing = { workspace = true }
|
||||
platform-oss = { workspace = true }
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
use reqwest::header;
|
||||
use reqwest::{header, multipart};
|
||||
|
||||
const VECTOR_ENGINE_SEND_MAX_ATTEMPTS: u32 = 3;
|
||||
const VECTOR_ENGINE_SEND_RETRY_BASE_DELAY_MS: u64 = 500;
|
||||
|
||||
use super::{
|
||||
constants::{GPT_IMAGE_2_MODEL, VECTOR_ENGINE_PROVIDER},
|
||||
@@ -50,30 +53,49 @@ pub async fn create_vector_engine_image_generation(
|
||||
reference_images,
|
||||
);
|
||||
let started_at = std::time::Instant::now();
|
||||
let response = match http_client
|
||||
.post(request_url.as_str())
|
||||
.header(
|
||||
header::AUTHORIZATION,
|
||||
format!("Bearer {}", settings.api_key),
|
||||
)
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.header(header::CONTENT_TYPE, "application/json")
|
||||
.json(&request_body)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => response,
|
||||
Err(error) => {
|
||||
return Err(map_reqwest_error(
|
||||
format!("{failure_context}:创建图片生成任务失败").as_str(),
|
||||
request_url.as_str(),
|
||||
"request_send",
|
||||
error,
|
||||
started_at.elapsed().as_millis() as u64,
|
||||
Some(prompt.chars().count()),
|
||||
Some(reference_images.len()),
|
||||
Some(&request_body),
|
||||
));
|
||||
let mut attempt = 1;
|
||||
let response = loop {
|
||||
match http_client
|
||||
.post(request_url.as_str())
|
||||
.header(
|
||||
header::AUTHORIZATION,
|
||||
format!("Bearer {}", settings.api_key),
|
||||
)
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.header(header::CONTENT_TYPE, "application/json")
|
||||
.json(&request_body)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => break response,
|
||||
Err(error) => {
|
||||
if should_retry_vector_engine_send_error(&error, attempt) {
|
||||
retry_vector_engine_send_after_delay(
|
||||
"generation",
|
||||
request_url.as_str(),
|
||||
"request_send",
|
||||
attempt,
|
||||
&error,
|
||||
started_at.elapsed().as_millis() as u64,
|
||||
Some(prompt.chars().count()),
|
||||
Some(reference_images.len()),
|
||||
Some(&request_body),
|
||||
)
|
||||
.await;
|
||||
attempt += 1;
|
||||
continue;
|
||||
}
|
||||
return Err(map_reqwest_error(
|
||||
format!("{failure_context}:创建图片生成任务失败").as_str(),
|
||||
request_url.as_str(),
|
||||
"request_send",
|
||||
error,
|
||||
started_at.elapsed().as_millis() as u64,
|
||||
Some(prompt.chars().count()),
|
||||
Some(reference_images.len()),
|
||||
Some(&request_body),
|
||||
));
|
||||
}
|
||||
}
|
||||
};
|
||||
let response_status = response.status();
|
||||
@@ -84,6 +106,7 @@ pub async fn create_vector_engine_image_generation(
|
||||
prompt_chars = prompt.chars().count(),
|
||||
size = %normalized_size,
|
||||
reference_image_count = reference_images.len(),
|
||||
attempt,
|
||||
elapsed_ms = started_at.elapsed().as_millis() as u64,
|
||||
failure_context,
|
||||
"VectorEngine 图片生成 HTTP 返回"
|
||||
@@ -167,26 +190,6 @@ pub async fn create_vector_engine_image_edit_with_references(
|
||||
reference_images,
|
||||
);
|
||||
|
||||
let mut form = reqwest::multipart::Form::new()
|
||||
.text("model", GPT_IMAGE_2_MODEL.to_string())
|
||||
.text(
|
||||
"prompt",
|
||||
build_prompt_with_negative(prompt, negative_prompt),
|
||||
)
|
||||
.text("n", candidate_count.clamp(1, 4).to_string())
|
||||
.text("size", normalized_size.clone());
|
||||
|
||||
for reference_image in reference_images.iter().take(5) {
|
||||
let image_part = reqwest::multipart::Part::bytes(reference_image.bytes.clone())
|
||||
.file_name(reference_image.file_name.clone())
|
||||
.mime_str(reference_image.mime_type.as_str())
|
||||
.map_err(|error| PlatformImageError::InvalidRequest {
|
||||
provider: VECTOR_ENGINE_PROVIDER,
|
||||
message: format!("{failure_context}:构造参考图失败:{error}"),
|
||||
})?;
|
||||
form = form.part("image", image_part);
|
||||
}
|
||||
|
||||
let reference_image_count = reference_images.iter().take(5).count();
|
||||
let reference_image_bytes_total: usize = reference_images
|
||||
.iter()
|
||||
@@ -214,29 +217,56 @@ pub async fn create_vector_engine_image_edit_with_references(
|
||||
failure_context,
|
||||
"VectorEngine 图片编辑请求参数"
|
||||
);
|
||||
let response = match http_client
|
||||
.post(request_url.as_str())
|
||||
.header(
|
||||
header::AUTHORIZATION,
|
||||
format!("Bearer {}", settings.api_key),
|
||||
)
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.multipart(form)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => response,
|
||||
Err(error) => {
|
||||
return Err(map_reqwest_error(
|
||||
format!("{failure_context}:创建图片编辑任务失败").as_str(),
|
||||
request_url.as_str(),
|
||||
"request_send",
|
||||
error,
|
||||
started_at.elapsed().as_millis() as u64,
|
||||
Some(prompt.chars().count()),
|
||||
Some(reference_image_count),
|
||||
Some(&request_params),
|
||||
));
|
||||
let mut attempt = 1;
|
||||
let response = loop {
|
||||
let form = build_vector_engine_image_edit_form(
|
||||
prompt,
|
||||
negative_prompt,
|
||||
normalized_size.as_str(),
|
||||
candidate_count,
|
||||
reference_images,
|
||||
failure_context,
|
||||
)?;
|
||||
match http_client
|
||||
.post(request_url.as_str())
|
||||
.header(
|
||||
header::AUTHORIZATION,
|
||||
format!("Bearer {}", settings.api_key),
|
||||
)
|
||||
.header(header::ACCEPT, "application/json")
|
||||
.multipart(form)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(response) => break response,
|
||||
Err(error) => {
|
||||
if should_retry_vector_engine_send_error(&error, attempt) {
|
||||
retry_vector_engine_send_after_delay(
|
||||
"edit",
|
||||
request_url.as_str(),
|
||||
"request_send",
|
||||
attempt,
|
||||
&error,
|
||||
started_at.elapsed().as_millis() as u64,
|
||||
Some(prompt.chars().count()),
|
||||
Some(reference_image_count),
|
||||
Some(&request_params),
|
||||
)
|
||||
.await;
|
||||
attempt += 1;
|
||||
continue;
|
||||
}
|
||||
return Err(map_reqwest_error(
|
||||
format!("{failure_context}:创建图片编辑任务失败").as_str(),
|
||||
request_url.as_str(),
|
||||
"request_send",
|
||||
error,
|
||||
started_at.elapsed().as_millis() as u64,
|
||||
Some(prompt.chars().count()),
|
||||
Some(reference_image_count),
|
||||
Some(&request_params),
|
||||
));
|
||||
}
|
||||
}
|
||||
};
|
||||
let response_status = response.status();
|
||||
@@ -249,6 +279,7 @@ pub async fn create_vector_engine_image_edit_with_references(
|
||||
reference_image_count,
|
||||
reference_image_bytes_total,
|
||||
request_params = %request_params,
|
||||
attempt,
|
||||
elapsed_ms = started_at.elapsed().as_millis() as u64,
|
||||
failure_context,
|
||||
"VectorEngine 图片编辑 HTTP 返回"
|
||||
@@ -282,3 +313,75 @@ pub async fn create_vector_engine_image_edit_with_references(
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
fn build_vector_engine_image_edit_form(
|
||||
prompt: &str,
|
||||
negative_prompt: Option<&str>,
|
||||
normalized_size: &str,
|
||||
candidate_count: u32,
|
||||
reference_images: &[ReferenceImage],
|
||||
failure_context: &str,
|
||||
) -> Result<multipart::Form, PlatformImageError> {
|
||||
let mut form = multipart::Form::new()
|
||||
.text("model", GPT_IMAGE_2_MODEL.to_string())
|
||||
.text(
|
||||
"prompt",
|
||||
build_prompt_with_negative(prompt, negative_prompt),
|
||||
)
|
||||
.text("n", candidate_count.clamp(1, 4).to_string())
|
||||
.text("size", normalized_size.to_string());
|
||||
|
||||
for reference_image in reference_images.iter().take(5) {
|
||||
let image_part = multipart::Part::bytes(reference_image.bytes.clone())
|
||||
.file_name(reference_image.file_name.clone())
|
||||
.mime_str(reference_image.mime_type.as_str())
|
||||
.map_err(|error| PlatformImageError::InvalidRequest {
|
||||
provider: VECTOR_ENGINE_PROVIDER,
|
||||
message: format!("{failure_context}:构造参考图失败:{error}"),
|
||||
})?;
|
||||
form = form.part("image", image_part);
|
||||
}
|
||||
|
||||
Ok(form)
|
||||
}
|
||||
|
||||
fn should_retry_vector_engine_send_error(error: &reqwest::Error, attempt: u32) -> bool {
|
||||
attempt < VECTOR_ENGINE_SEND_MAX_ATTEMPTS && (error.is_timeout() || error.is_connect())
|
||||
}
|
||||
|
||||
async fn retry_vector_engine_send_after_delay(
|
||||
request_kind: &'static str,
|
||||
request_url: &str,
|
||||
failure_stage: &'static str,
|
||||
attempt: u32,
|
||||
error: &reqwest::Error,
|
||||
elapsed_ms: u64,
|
||||
prompt_chars: Option<usize>,
|
||||
reference_image_count: Option<usize>,
|
||||
request_params: Option<&serde_json::Value>,
|
||||
) {
|
||||
let delay_ms = VECTOR_ENGINE_SEND_RETRY_BASE_DELAY_MS * u64::from(attempt);
|
||||
tracing::warn!(
|
||||
provider = VECTOR_ENGINE_PROVIDER,
|
||||
endpoint = %request_url,
|
||||
request_kind,
|
||||
failure_stage,
|
||||
attempt,
|
||||
max_attempts = VECTOR_ENGINE_SEND_MAX_ATTEMPTS,
|
||||
retry_delay_ms = delay_ms,
|
||||
timeout = error.is_timeout(),
|
||||
connect = error.is_connect(),
|
||||
request = error.is_request(),
|
||||
body = error.is_body(),
|
||||
status = error.status().map(|status| status.as_u16()).unwrap_or_default(),
|
||||
error = %error,
|
||||
elapsed_ms,
|
||||
prompt_chars,
|
||||
reference_image_count,
|
||||
request_params = %request_params
|
||||
.map(|value| value.to_string())
|
||||
.unwrap_or_default(),
|
||||
"VectorEngine 图片请求发送失败,准备重试"
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
|
||||
}
|
||||
|
||||
@@ -1,8 +1,20 @@
|
||||
use platform_image::vector_engine::{
|
||||
GPT_IMAGE_2_MODEL, VECTOR_ENGINE_PROVIDER, VectorEngineImageSettings,
|
||||
build_vector_engine_image_request_body, vector_engine_images_edit_url,
|
||||
GPT_IMAGE_2_MODEL, ReferenceImage, VECTOR_ENGINE_PROVIDER, VectorEngineImageSettings,
|
||||
build_vector_engine_image_http_client, build_vector_engine_image_request_body,
|
||||
create_vector_engine_image_edit, vector_engine_images_edit_url,
|
||||
vector_engine_images_generation_url,
|
||||
};
|
||||
use std::{
|
||||
sync::{
|
||||
Arc,
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
},
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::{
|
||||
io::{AsyncReadExt, AsyncWriteExt},
|
||||
net::TcpListener,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn vector_engine_module_exposes_provider_protocol_helpers() {
|
||||
@@ -30,3 +42,70 @@ fn vector_engine_module_exposes_provider_protocol_helpers() {
|
||||
"https://vector.example/v1/images/edits"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn vector_engine_image_edit_retries_send_timeout_once_and_succeeds() {
|
||||
let listener = TcpListener::bind("127.0.0.1:0")
|
||||
.await
|
||||
.expect("mock server should bind");
|
||||
let server_addr = listener
|
||||
.local_addr()
|
||||
.expect("mock server address should be readable");
|
||||
let request_count = Arc::new(AtomicUsize::new(0));
|
||||
let request_count_for_server = Arc::clone(&request_count);
|
||||
|
||||
let server = tokio::spawn(async move {
|
||||
loop {
|
||||
let Ok((mut stream, _)) = listener.accept().await else {
|
||||
break;
|
||||
};
|
||||
let request_index = request_count_for_server.fetch_add(1, Ordering::SeqCst);
|
||||
tokio::spawn(async move {
|
||||
let mut buffer = [0_u8; 4096];
|
||||
let _ = stream.read(&mut buffer).await;
|
||||
if request_index == 0 {
|
||||
tokio::time::sleep(Duration::from_millis(120)).await;
|
||||
return;
|
||||
}
|
||||
|
||||
let body = r#"{"data":[{"b64_json":"iVBORw0KGgpyZXN0"}]}"#;
|
||||
let response = format!(
|
||||
"HTTP/1.1 200 OK\r\nContent-Type: application/json\r\nContent-Length: {}\r\n\r\n{}",
|
||||
body.len(),
|
||||
body
|
||||
);
|
||||
let _ = stream.write_all(response.as_bytes()).await;
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
let settings = VectorEngineImageSettings {
|
||||
base_url: format!("http://{server_addr}/v1"),
|
||||
api_key: "test-key".to_string(),
|
||||
request_timeout_ms: 40,
|
||||
};
|
||||
let http_client =
|
||||
build_vector_engine_image_http_client(&settings).expect("client should build");
|
||||
let reference_image = ReferenceImage {
|
||||
bytes: b"reference".to_vec(),
|
||||
mime_type: "image/png".to_string(),
|
||||
file_name: "reference.png".to_string(),
|
||||
};
|
||||
|
||||
let generated = create_vector_engine_image_edit(
|
||||
&http_client,
|
||||
&settings,
|
||||
"测试提示词",
|
||||
None,
|
||||
"1024x1024",
|
||||
&reference_image,
|
||||
"测试 VectorEngine 图片编辑失败",
|
||||
)
|
||||
.await
|
||||
.expect("second attempt should return generated image");
|
||||
|
||||
assert_eq!(generated.images.len(), 1);
|
||||
assert_eq!(generated.images[0].mime_type, "image/png");
|
||||
assert_eq!(request_count.load(Ordering::SeqCst), 2);
|
||||
server.abort();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user