feat(api-server): audit external api failures

This commit is contained in:
kdletters
2026-05-21 16:33:13 +08:00
parent 487efff9c4
commit cc23b6020d
19 changed files with 2266 additions and 56 deletions

View File

@@ -1049,6 +1049,7 @@ mod tests {
base_url: "https://vector.example".to_string(),
api_key: "secret".to_string(),
request_timeout_ms: 180_000,
external_api_audit_state: None,
});
assert_eq!(

View File

@@ -0,0 +1,372 @@
use axum::http::StatusCode;
use module_runtime::RuntimeTrackingScopeKind;
use serde_json::{Value, json};
use time::OffsetDateTime;
use uuid::Uuid;
use crate::{state::AppState, tracking::TrackingEventDraft};
/// Event key given to every tracking draft built for an external API failure.
pub(crate) const EXTERNAL_API_FAILURE_EVENT_KEY: &str = "external_api_call_failure";
/// Module key passed to `TrackingEventDraft::new` for external API failure audit events.
pub(crate) const EXTERNAL_API_AUDIT_MODULE_KEY: &str = "external-api";
/// Description of one failed call to an external provider.
///
/// Create it with [`ExternalApiFailureDraft::new`] (mandatory fields) and
/// refine it with the chained `with_*` setters; every optional field starts
/// out as `None` / `false`.
#[derive(Clone, Debug)]
pub(crate) struct ExternalApiFailureDraft {
    /// Identifier of the external provider that was called.
    pub(crate) provider: &'static str,
    /// Endpoint (URL) of the call that failed.
    pub(crate) endpoint: String,
    /// Caller-supplied description of the operation being attempted.
    pub(crate) operation: String,
    /// Label of the stage at which the call failed.
    pub(crate) failure_stage: &'static str,
    /// HTTP status code associated with the failure, when there is one.
    pub(crate) status_code: Option<u16>,
    /// Explicit status class; when `None` the class is derived from `status_code`.
    pub(crate) status_class: Option<&'static str>,
    /// Whether the failure was a timeout.
    pub(crate) timeout: bool,
    /// Whether the failure is considered worth retrying.
    pub(crate) retryable: bool,
    /// Human-readable error message.
    pub(crate) error_message: String,
    /// Text of the underlying error source, if any.
    pub(crate) error_source: Option<String>,
    /// Excerpt of the raw upstream response, if any.
    pub(crate) raw_excerpt: Option<String>,
    /// Measured latency in milliseconds, if known.
    pub(crate) latency_ms: Option<u64>,
    /// Number of characters in the prompt, if applicable.
    pub(crate) prompt_chars: Option<usize>,
    /// Number of reference images sent with the request, if applicable.
    pub(crate) reference_image_count: Option<usize>,
    /// Image model name, if applicable.
    pub(crate) image_model: Option<&'static str>,
}

impl ExternalApiFailureDraft {
    /// Builds a draft from the mandatory fields, leaving every optional
    /// field unset (`None`) and both flags `false`.
    pub(crate) fn new(
        provider: &'static str,
        endpoint: impl Into<String>,
        operation: impl Into<String>,
        failure_stage: &'static str,
        error_message: impl Into<String>,
    ) -> Self {
        let endpoint = endpoint.into();
        let operation = operation.into();
        let error_message = error_message.into();
        Self {
            provider,
            endpoint,
            operation,
            failure_stage,
            error_message,
            timeout: false,
            retryable: false,
            status_code: None,
            status_class: None,
            error_source: None,
            raw_excerpt: None,
            latency_ms: None,
            prompt_chars: None,
            reference_image_count: None,
            image_model: None,
        }
    }

    /// Replaces the HTTP status code.
    pub(crate) fn with_status_code(self, status_code: Option<u16>) -> Self {
        Self { status_code, ..self }
    }

    /// Replaces the explicit status class override.
    pub(crate) fn with_optional_status_class(self, status_class: Option<&'static str>) -> Self {
        Self { status_class, ..self }
    }

    /// Replaces the timeout flag.
    pub(crate) fn with_timeout(self, timeout: bool) -> Self {
        Self { timeout, ..self }
    }

    /// Replaces the retryable flag.
    pub(crate) fn with_retryable(self, retryable: bool) -> Self {
        Self { retryable, ..self }
    }

    /// Replaces the underlying error source text.
    pub(crate) fn with_error_source(self, error_source: Option<String>) -> Self {
        Self { error_source, ..self }
    }

    /// Replaces the raw response excerpt.
    pub(crate) fn with_raw_excerpt(self, raw_excerpt: Option<String>) -> Self {
        Self { raw_excerpt, ..self }
    }

    /// Replaces the measured latency in milliseconds.
    pub(crate) fn with_latency_ms(self, latency_ms: Option<u64>) -> Self {
        Self { latency_ms, ..self }
    }

    /// Replaces the prompt character count.
    pub(crate) fn with_prompt_chars(self, prompt_chars: Option<usize>) -> Self {
        Self { prompt_chars, ..self }
    }

    /// Replaces the reference image count.
    pub(crate) fn with_reference_image_count(
        self,
        reference_image_count: Option<usize>,
    ) -> Self {
        Self {
            reference_image_count,
            ..self
        }
    }

    /// Replaces the image model name.
    pub(crate) fn with_image_model(self, image_model: Option<&'static str>) -> Self {
        Self { image_model, ..self }
    }
}
/// Maps an application-level status code to its status class label.
///
/// Failures that have no standard HTTP status of their own (image downloads,
/// OSS reads/writes, ...) are classified explicitly through this helper so
/// the low-cardinality OTLP label does not wrongly fall into `transport`.
pub(crate) fn app_error_status_class(status_code: StatusCode) -> &'static str {
    let numeric_code = status_code.as_u16();
    status_class(Some(numeric_code))
}
/// 中文注释:外部供应商失败同时进入 OTLP 和 tracking_event失败审计不能反向阻断主业务错误返回。
pub(crate) async fn record_external_api_failure(state: &AppState, draft: ExternalApiFailureDraft) {
record_external_api_failure_otlp(&draft);
let tracking_event = build_external_api_failure_tracking_draft(&draft);
if let Some(outbox) = state.tracking_outbox() {
match outbox
.enqueue(crate::tracking::build_tracking_event_input(
tracking_event.clone(),
))
.await
{
Ok(crate::tracking_outbox::TrackingOutboxEnqueueOutcome::Enqueued) => {}
Ok(crate::tracking_outbox::TrackingOutboxEnqueueOutcome::Dropped { reason }) => {
tracing::warn!(
provider = draft.provider,
endpoint = %draft.endpoint,
operation = %draft.operation,
failure_stage = draft.failure_stage,
reason,
"外部 API 失败审计写入 outbox 被保护阈值拒绝,回退同步直写 SpacetimeDB"
);
crate::tracking::record_tracking_event_after_success(
state,
&audit_request_context(),
tracking_event,
)
.await;
}
Err(error) => {
tracing::warn!(
provider = draft.provider,
endpoint = %draft.endpoint,
operation = %draft.operation,
failure_stage = draft.failure_stage,
error = %error,
"外部 API 失败审计写入 outbox 失败,回退同步直写 SpacetimeDB"
);
crate::tracking::record_tracking_event_after_success(
state,
&audit_request_context(),
tracking_event,
)
.await;
}
}
return;
}
crate::tracking::record_tracking_event_after_success(
state,
&audit_request_context(),
tracking_event,
)
.await;
}
/// Converts a failure draft into a module-scoped tracking event draft.
///
/// The event uses `EXTERNAL_API_FAILURE_EVENT_KEY` / `EXTERNAL_API_AUDIT_MODULE_KEY`,
/// the provider name as scope id, and the failure details as JSON metadata.
pub(crate) fn build_external_api_failure_tracking_draft(
    failure: &ExternalApiFailureDraft,
) -> TrackingEventDraft {
    let mut tracking_draft = TrackingEventDraft::new(
        EXTERNAL_API_FAILURE_EVENT_KEY,
        EXTERNAL_API_AUDIT_MODULE_KEY,
    );
    tracking_draft.scope_kind = RuntimeTrackingScopeKind::Module;
    tracking_draft.scope_id = String::from(failure.provider);
    tracking_draft.metadata = build_external_api_failure_metadata(failure);
    tracking_draft
}
fn build_external_api_failure_metadata(failure: &ExternalApiFailureDraft) -> Value {
let mut metadata = json!({
"provider": failure.provider,
"endpoint": failure.endpoint,
"operation": failure.operation,
"failureStage": failure.failure_stage,
"statusCode": failure.status_code,
"statusClass": failure.status_class.unwrap_or_else(|| status_class(failure.status_code)),
"timeout": failure.timeout,
"retryable": failure.retryable,
"errorMessage": truncate_field(failure.error_message.as_str(), 1_000),
"occurredAt": current_utc_iso_text(),
});
if let Some(latency_ms) = failure.latency_ms {
metadata["latencyMs"] = json!(latency_ms);
}
if let Some(prompt_chars) = failure.prompt_chars {
metadata["promptChars"] = json!(prompt_chars);
}
if let Some(reference_image_count) = failure.reference_image_count {
metadata["referenceImageCount"] = json!(reference_image_count);
}
if let Some(image_model) = failure.image_model {
metadata["imageModel"] = json!(image_model);
}
if let Some(source) = failure
.error_source
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
{
metadata["errorSource"] = json!(truncate_field(source, 1_000));
}
if let Some(excerpt) = failure
.raw_excerpt
.as_deref()
.map(str::trim)
.filter(|value| !value.is_empty())
{
metadata["rawExcerpt"] = json!(truncate_field(excerpt, 800));
}
metadata
}
/// Decides whether an external API failure is worth retrying.
///
/// Timeouts and connection failures are always retryable. Otherwise the
/// failure is retryable only for HTTP 429, HTTP 408, or any status `>= 500`.
pub(crate) fn is_retryable_external_api_failure(
    status_code: Option<u16>,
    timeout: bool,
    connect: bool,
) -> bool {
    if timeout || connect {
        return true;
    }
    match status_code {
        Some(status) => {
            status == StatusCode::TOO_MANY_REQUESTS.as_u16()
                || status == StatusCode::REQUEST_TIMEOUT.as_u16()
                || status >= 500
        }
        None => false,
    }
}
/// Emits the telemetry side of a failure: one counter increment through
/// `crate::telemetry::record_external_api_failure` and one `error`-level log line.
fn record_external_api_failure_otlp(failure: &ExternalApiFailureDraft) {
    // Resolve the class once; both the metric and the log line use it.
    let resolved_class = match failure.status_class {
        Some(explicit_class) => explicit_class,
        None => status_class(failure.status_code),
    };

    crate::telemetry::record_external_api_failure(
        failure.provider,
        failure.failure_stage,
        resolved_class,
        failure.retryable,
    );

    tracing::error!(
        provider = failure.provider,
        endpoint = %failure.endpoint,
        operation = %failure.operation,
        failure_stage = failure.failure_stage,
        status_code = failure.status_code,
        status_class = resolved_class,
        timeout = failure.timeout,
        retryable = failure.retryable,
        latency_ms = failure.latency_ms,
        prompt_chars = failure.prompt_chars,
        reference_image_count = failure.reference_image_count,
        image_model = failure.image_model,
        error = %failure.error_message,
        "外部 API 调用失败"
    );
}
/// Maps an optional HTTP status code to a low-cardinality class label.
///
/// `None` (no status at all) is `"transport"`; codes 100-599 map to
/// `"1xx"`..`"5xx"`; any other code is `"unknown"`.
fn status_class(status_code: Option<u16>) -> &'static str {
    let Some(code) = status_code else {
        return "transport";
    };
    // The hundreds digit identifies the class; everything outside 1..=5 is unknown.
    match code / 100 {
        1 => "1xx",
        2 => "2xx",
        3 => "3xx",
        4 => "4xx",
        5 => "5xx",
        _ => "unknown",
    }
}
/// Builds a synthetic `RequestContext` for audit writes that fall back to
/// the direct tracking write path.
///
/// The first argument is a fresh `external-api-audit-<uuid>` string on every
/// call. NOTE(review): the meaning of the remaining `RequestContext::new`
/// arguments is not visible from this file — confirm against its definition.
fn audit_request_context() -> crate::request_context::RequestContext {
    let synthetic_id = format!("external-api-audit-{}", Uuid::new_v4());
    crate::request_context::RequestContext::new(
        synthetic_id,
        String::from("external-api audit"),
        std::time::Duration::ZERO,
        false,
    )
}
/// Returns at most the first `max_chars` characters of `value`.
///
/// The limit counts Unicode scalar values, not bytes, so multi-byte text is
/// never cut in the middle of a character.
fn truncate_field(value: &str, max_chars: usize) -> String {
    // The byte offset of the character at index `max_chars` is exactly where
    // the kept prefix ends; no such character means the whole text fits.
    match value.char_indices().nth(max_chars) {
        Some((prefix_end, _)) => value[..prefix_end].to_string(),
        None => value.to_string(),
    }
}
/// Formats the current UTC time with `shared_kernel::format_rfc3339`,
/// falling back to the Unix epoch text if formatting fails.
fn current_utc_iso_text() -> String {
    let now = OffsetDateTime::now_utc();
    match shared_kernel::format_rfc3339(now) {
        Ok(formatted) => formatted,
        Err(_) => "1970-01-01T00:00:00Z".to_string(),
    }
}
// Unit tests for the audit helpers: draft-to-tracking conversion, retry
// classification and the explicit status class override.
#[cfg(test)]
mod tests {
use serde_json::Value;
use super::*;
// A fully populated draft becomes a module-scoped tracking draft whose
// metadata carries every supplied value; the class is derived from 429.
#[test]
fn external_api_failure_tracking_draft_uses_module_scope_and_safe_metadata() {
let draft = build_external_api_failure_tracking_draft(
&ExternalApiFailureDraft::new(
"vector-engine",
"https://vector.example/v1/images/generations",
"拼图 UI 背景图生成失败",
"upstream_status",
"上游 429",
)
.with_status_code(Some(429))
.with_retryable(true)
.with_latency_ms(Some(1234))
.with_prompt_chars(Some(88))
.with_reference_image_count(Some(2))
.with_image_model(Some("gpt-image-2-all")),
);
assert_eq!(draft.event_key, EXTERNAL_API_FAILURE_EVENT_KEY);
assert_eq!(draft.scope_kind, RuntimeTrackingScopeKind::Module);
assert_eq!(draft.scope_id, "vector-engine");
assert_eq!(draft.module_key, Some(EXTERNAL_API_AUDIT_MODULE_KEY));
let metadata = draft.metadata;
assert_eq!(metadata["provider"], "vector-engine");
assert_eq!(metadata["statusCode"], 429);
assert_eq!(metadata["statusClass"], "4xx");
assert_eq!(metadata["retryable"], true);
assert_eq!(metadata["latencyMs"], 1234);
assert_eq!(metadata["promptChars"], 88);
assert_eq!(metadata["referenceImageCount"], 2);
assert_eq!(metadata["imageModel"], "gpt-image-2-all");
// The timestamp value varies per run, so only its JSON type is checked.
assert!(matches!(metadata["occurredAt"], Value::String(_)));
}
// Timeouts, connection failures, 429 and 5xx are retryable; a plain 400 is not.
#[test]
fn retryable_classification_keeps_transport_and_overload_failures_actionable() {
assert!(is_retryable_external_api_failure(None, true, false));
assert!(is_retryable_external_api_failure(None, false, true));
assert!(is_retryable_external_api_failure(Some(429), false, false));
assert!(is_retryable_external_api_failure(Some(502), false, false));
assert!(!is_retryable_external_api_failure(Some(400), false, false));
}
// An explicit status class takes precedence over the class derived from
// the (successful) upstream status code.
#[test]
fn app_error_status_class_can_override_successful_upstream_status() {
let draft = build_external_api_failure_tracking_draft(
&ExternalApiFailureDraft::new(
"vector-engine",
"https://cdn.example/generated.png",
"下载生成图片",
"image_download",
"下载生成图片失败",
)
.with_status_code(Some(200))
.with_optional_status_class(Some(app_error_status_class(StatusCode::BAD_GATEWAY))),
);
assert_eq!(draft.metadata["statusCode"], 200);
assert_eq!(draft.metadata["statusClass"], "5xx");
}
}

View File

@@ -39,6 +39,7 @@ mod custom_world_rpg_draft_prompts;
mod edutainment_baby_drawing;
mod edutainment_baby_object;
mod error_middleware;
mod external_api_audit;
pub(crate) mod generated_asset_sheets;
mod generated_image_assets;
mod health;

View File

@@ -1,21 +1,44 @@
use std::time::Duration;
use std::{error::Error, time::Duration};
use axum::http::StatusCode;
use base64::{Engine as _, engine::general_purpose::STANDARD as BASE64_STANDARD};
use reqwest::header;
use serde_json::{Map, Value, json};
use crate::{http_error::AppError, state::AppState};
use crate::{
external_api_audit::{
ExternalApiFailureDraft, app_error_status_class, is_retryable_external_api_failure,
record_external_api_failure,
},
http_error::AppError,
state::AppState,
};
/// Model name string `gpt-image-2`.
pub(crate) const GPT_IMAGE_2_MODEL: &str = "gpt-image-2";
/// Model name recorded as `image_model` on VectorEngine failure audit drafts.
pub(crate) const VECTOR_ENGINE_GPT_IMAGE_2_MODEL: &str = "gpt-image-2-all";
/// Provider label used in VectorEngine logs, error details and audit drafts.
const VECTOR_ENGINE_PROVIDER: &str = "vector-engine";
#[derive(Clone, Debug)]
#[derive(Clone)]
/// Settings used by the image generation/edit helpers in this file.
///
/// `Debug` is implemented by hand so the API key is never printed.
pub(crate) struct OpenAiImageSettings {
// Base URL from which the image endpoints are derived.
pub base_url: String,
// API key sent as the `Authorization: Bearer` token.
pub api_key: String,
// Request timeout in milliseconds.
pub request_timeout_ms: u64,
// When `Some`, failures are recorded through the external API audit using
// this state; `None` disables auditing.
pub external_api_audit_state: Option<AppState>,
}
/// Hand-written `Debug` that redacts the API key and reports only whether
/// auditing is enabled instead of printing the audit state.
impl std::fmt::Debug for OpenAiImageSettings {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let audit_enabled = self.external_api_audit_state.is_some();
        let mut view = f.debug_struct("OpenAiImageSettings");
        view.field("base_url", &self.base_url);
        // Never print the real key.
        view.field("api_key", &"<redacted>");
        view.field("request_timeout_ms", &self.request_timeout_ms);
        view.field("external_api_audit_enabled", &audit_enabled);
        view.finish()
    }
}
#[derive(Clone, Debug)]
@@ -74,6 +97,7 @@ pub(crate) fn require_openai_image_settings(
base_url: base_url.to_string(),
api_key: api_key.to_string(),
request_timeout_ms: state.config.vector_engine_image_request_timeout_ms.max(1),
external_api_audit_state: Some(state.clone()),
})
}
@@ -103,15 +127,18 @@ pub(crate) async fn create_openai_image_generation(
reference_images: &[String],
failure_context: &str,
) -> Result<OpenAiGeneratedImages, AppError> {
let request_url = vector_engine_images_generation_url(settings);
let normalized_size = normalize_image_size(size);
let request_body = build_openai_image_request_body(
prompt,
negative_prompt,
size,
normalized_size.as_str(),
candidate_count,
reference_images,
);
let response = http_client
.post(vector_engine_images_generation_url(settings))
let started_at = std::time::Instant::now();
let response = match http_client
.post(request_url.as_str())
.header(
header::AUTHORIZATION,
format!("Bearer {}", settings.api_key),
@@ -121,16 +148,106 @@ pub(crate) async fn create_openai_image_generation(
.json(&request_body)
.send()
.await
.map_err(|error| {
map_openai_image_request_error(format!(
"{failure_context}:创建图片生成任务失败:{error}"
))
})?;
{
Ok(response) => response,
Err(error) => {
let latency_ms = started_at.elapsed().as_millis() as u64;
let timeout = error.is_timeout();
let connect = error.is_connect();
let source = error.source().map(ToString::to_string);
let message = format!("{failure_context}:创建图片生成任务失败:{error}");
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"request_send",
None,
None,
timeout,
connect,
message.as_str(),
source,
None,
Some(latency_ms),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
return Err(map_openai_image_reqwest_error(
format!("{failure_context}:创建图片生成任务失败").as_str(),
request_url.as_str(),
error,
));
}
};
let response_status = response.status();
let response_text = response.text().await.map_err(|error| {
map_openai_image_request_error(format!("{failure_context}:读取图片生成响应失败:{error}"))
})?;
tracing::info!(
provider = VECTOR_ENGINE_PROVIDER,
endpoint = %request_url,
status = response_status.as_u16(),
prompt_chars = prompt.chars().count(),
size = %normalized_size,
reference_image_count = reference_images.len(),
elapsed_ms = started_at.elapsed().as_millis() as u64,
failure_context,
"VectorEngine 图片生成 HTTP 返回"
);
let response_text = match response.text().await {
Ok(response_text) => response_text,
Err(error) => {
let latency_ms = started_at.elapsed().as_millis() as u64;
let timeout = error.is_timeout();
let connect = error.is_connect();
let source = error.source().map(ToString::to_string);
let message = format!("{failure_context}:读取图片生成响应失败:{error}");
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"response_body",
Some(response_status.as_u16()),
None,
timeout,
connect,
message.as_str(),
source,
None,
Some(latency_ms),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
return Err(map_openai_image_reqwest_error(
format!("{failure_context}:读取图片生成响应失败").as_str(),
request_url.as_str(),
error,
));
}
};
if !response_status.is_success() {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"upstream_status",
Some(response_status.as_u16()),
None,
false,
false,
parse_api_error_message(response_text.as_str(), failure_context).as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
return Err(map_openai_image_upstream_error(
response_status.as_u16(),
response_text.as_str(),
@@ -138,26 +255,114 @@ pub(crate) async fn create_openai_image_generation(
));
}
let response_json = parse_json_payload(response_text.as_str(), failure_context)?;
let response_json = match parse_json_payload(response_text.as_str(), failure_context) {
Ok(response_json) => response_json,
Err(error) => {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"response_parse",
Some(response_status.as_u16()),
None,
false,
false,
error.body_text().as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
return Err(error);
}
};
let generation_id = extract_generation_id(&response_json.payload)
.unwrap_or_else(|| format!("vector-engine-{}", current_utc_micros()));
let actual_prompt = find_first_string_by_key(&response_json.payload, "revised_prompt")
.or_else(|| find_first_string_by_key(&response_json.payload, "actual_prompt"));
let image_urls = extract_image_urls(&response_json.payload);
if !image_urls.is_empty() {
let mut generated =
download_images_from_urls(http_client, generation_id, image_urls, candidate_count)
.await?;
let download_started_at = std::time::Instant::now();
let mut generated = match download_images_from_urls(
http_client,
generation_id,
image_urls,
candidate_count,
)
.await
{
Ok(generated) => generated,
Err(error) => {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"image_download",
Some(response_status.as_u16()),
Some(app_error_status_class(error.status_code())),
false,
false,
error.body_text().as_str(),
None,
None,
Some(download_started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
return Err(error);
}
};
generated.actual_prompt = actual_prompt;
tracing::info!(
provider = VECTOR_ENGINE_PROVIDER,
endpoint = %request_url,
image_count = generated.images.len(),
elapsed_ms = download_started_at.elapsed().as_millis() as u64,
failure_context,
"VectorEngine 图片下载完成"
);
return Ok(generated);
}
let b64_images = extract_b64_images(&response_json.payload);
if !b64_images.is_empty() {
let mut generated = images_from_base64(generation_id, b64_images, candidate_count);
generated.actual_prompt = actual_prompt;
tracing::info!(
provider = VECTOR_ENGINE_PROVIDER,
endpoint = %request_url,
image_count = generated.images.len(),
failure_context,
"VectorEngine 图片 base64 解码完成"
);
return Ok(generated);
}
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"missing_image",
Some(response_status.as_u16()),
None,
false,
false,
format!("{failure_context}VectorEngine 未返回图片地址").as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(reference_images.len()),
),
)
.await;
Err(
AppError::from_status(StatusCode::BAD_GATEWAY).with_details(json!({
"provider": VECTOR_ENGINE_PROVIDER,
@@ -176,6 +381,8 @@ pub(crate) async fn create_openai_image_edit(
failure_context: &str,
) -> Result<OpenAiGeneratedImages, AppError> {
let task_id = format!("vector-engine-edit-{}", current_utc_micros());
let request_url = vector_engine_images_edit_url(settings);
let normalized_size = normalize_image_size(size);
let image_part = reqwest::multipart::Part::bytes(reference_image.bytes.clone())
.file_name(reference_image.file_name.clone())
.mime_str(reference_image.mime_type.as_str())
@@ -190,9 +397,10 @@ pub(crate) async fn create_openai_image_edit(
build_prompt_with_negative(prompt, negative_prompt),
)
.text("n", "1")
.text("size", normalize_image_size(size));
let response = http_client
.post(vector_engine_images_edit_url(settings).as_str())
.text("size", normalized_size.clone());
let started_at = std::time::Instant::now();
let response = match http_client
.post(request_url.as_str())
.header(
header::AUTHORIZATION,
format!("Bearer {}", settings.api_key),
@@ -201,16 +409,106 @@ pub(crate) async fn create_openai_image_edit(
.multipart(form)
.send()
.await
.map_err(|error| {
map_openai_image_request_error(format!(
"{failure_context}:创建图片编辑任务失败:{error}"
))
})?;
{
Ok(response) => response,
Err(error) => {
let latency_ms = started_at.elapsed().as_millis() as u64;
let timeout = error.is_timeout();
let connect = error.is_connect();
let source = error.source().map(ToString::to_string);
let message = format!("{failure_context}:创建图片编辑任务失败:{error}");
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"request_send",
None,
None,
timeout,
connect,
message.as_str(),
source,
None,
Some(latency_ms),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
return Err(map_openai_image_reqwest_error(
format!("{failure_context}:创建图片编辑任务失败").as_str(),
request_url.as_str(),
error,
));
}
};
let response_status = response.status();
let response_text = response.text().await.map_err(|error| {
map_openai_image_request_error(format!("{failure_context}:读取图片编辑响应失败:{error}"))
})?;
tracing::info!(
provider = VECTOR_ENGINE_PROVIDER,
endpoint = %request_url,
status = response_status.as_u16(),
prompt_chars = prompt.chars().count(),
size = %normalized_size,
reference_image_count = 1usize,
elapsed_ms = started_at.elapsed().as_millis() as u64,
failure_context,
"VectorEngine 图片编辑 HTTP 返回"
);
let response_text = match response.text().await {
Ok(response_text) => response_text,
Err(error) => {
let latency_ms = started_at.elapsed().as_millis() as u64;
let timeout = error.is_timeout();
let connect = error.is_connect();
let source = error.source().map(ToString::to_string);
let message = format!("{failure_context}:读取图片编辑响应失败:{error}");
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"response_body",
Some(response_status.as_u16()),
None,
timeout,
connect,
message.as_str(),
source,
None,
Some(latency_ms),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
return Err(map_openai_image_reqwest_error(
format!("{failure_context}:读取图片编辑响应失败").as_str(),
request_url.as_str(),
error,
));
}
};
if !response_status.is_success() {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"upstream_status",
Some(response_status.as_u16()),
None,
false,
false,
parse_api_error_message(response_text.as_str(), failure_context).as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
return Err(map_openai_image_upstream_error(
response_status.as_u16(),
response_text.as_str(),
@@ -218,12 +516,62 @@ pub(crate) async fn create_openai_image_edit(
));
}
let response_json = parse_json_payload(response_text.as_str(), failure_context)?;
let response_json = match parse_json_payload(response_text.as_str(), failure_context) {
Ok(response_json) => response_json,
Err(error) => {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"response_parse",
Some(response_status.as_u16()),
None,
false,
false,
error.body_text().as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
return Err(error);
}
};
let actual_prompt = find_first_string_by_key(&response_json.payload, "revised_prompt")
.or_else(|| find_first_string_by_key(&response_json.payload, "actual_prompt"));
let image_urls = extract_image_urls(&response_json.payload);
if !image_urls.is_empty() {
let mut generated = download_images_from_urls(http_client, task_id, image_urls, 1).await?;
let download_started_at = std::time::Instant::now();
let mut generated =
match download_images_from_urls(http_client, task_id, image_urls, 1).await {
Ok(generated) => generated,
Err(error) => {
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"image_download",
Some(response_status.as_u16()),
Some(app_error_status_class(error.status_code())),
false,
false,
error.body_text().as_str(),
None,
None,
Some(download_started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
return Err(error);
}
};
generated.actual_prompt = actual_prompt;
return Ok(generated);
}
@@ -234,6 +582,25 @@ pub(crate) async fn create_openai_image_edit(
return Ok(generated);
}
record_openai_image_failure_if_configured(
settings,
build_openai_image_failure_audit_draft(
request_url.as_str(),
failure_context,
"missing_image",
Some(response_status.as_u16()),
None,
false,
false,
format!("{failure_context}VectorEngine 未返回编辑图片").as_str(),
None,
Some(truncate_raw(response_text.as_str())),
Some(started_at.elapsed().as_millis() as u64),
Some(prompt.chars().count()),
Some(1),
),
)
.await;
Err(
AppError::from_status(StatusCode::BAD_GATEWAY).with_details(json!({
"provider": VECTOR_ENGINE_PROVIDER,
@@ -402,6 +769,44 @@ fn map_openai_image_request_error(message: String) -> AppError {
}))
}
/// Converts a `reqwest::Error` from a VectorEngine image call into an `AppError`.
///
/// * `context` - description of what was being attempted; callers pass it
///   without a trailing separator.
/// * `request_url` - endpoint that was called, echoed into the log and details.
/// * `error` - the transport error to classify.
///
/// Timeouts map to `504 Gateway Timeout`, everything else to `502 Bad Gateway`.
/// The classification flags, the error source text (empty when absent) and the
/// combined message are logged at `warn` level and attached as error details.
fn map_openai_image_reqwest_error(
    context: &str,
    request_url: &str,
    error: reqwest::Error,
) -> AppError {
    let is_timeout = error.is_timeout();
    let is_connect = error.is_connect();
    let is_request = error.is_request();
    let is_body = error.is_body();
    let source = error.source().map(ToString::to_string).unwrap_or_default();
    // Callers supply the context without a trailing separator, so add the
    // same full-width colon the audit messages use between context and error.
    let message = format!("{context}:{error}");
    let status = if is_timeout {
        StatusCode::GATEWAY_TIMEOUT
    } else {
        StatusCode::BAD_GATEWAY
    };

    tracing::warn!(
        provider = VECTOR_ENGINE_PROVIDER,
        endpoint = %request_url,
        timeout = is_timeout,
        connect = is_connect,
        request = is_request,
        body = is_body,
        source = %source,
        message = %message,
        "VectorEngine 图片请求发送失败"
    );

    AppError::from_status(status).with_details(json!({
        "provider": VECTOR_ENGINE_PROVIDER,
        "message": message,
        "endpoint": request_url,
        "timeout": is_timeout,
        "connect": is_connect,
        "request": is_request,
        "body": is_body,
        "source": source,
    }))
}
fn map_openai_image_upstream_error(
upstream_status: u16,
raw_text: &str,
@@ -423,6 +828,53 @@ fn map_openai_image_upstream_error(
}))
}
/// Records the failure draft through the external API audit when the
/// settings carry an audit state; does nothing otherwise.
async fn record_openai_image_failure_if_configured(
    settings: &OpenAiImageSettings,
    draft: ExternalApiFailureDraft,
) {
    let Some(audit_state) = settings.external_api_audit_state.as_ref() else {
        return;
    };
    record_external_api_failure(audit_state, draft).await;
}
/// Assembles the audit draft for a failed VectorEngine image call.
///
/// The provider is always `VECTOR_ENGINE_PROVIDER` and the image model is
/// always `VECTOR_ENGINE_GPT_IMAGE_2_MODEL`. `failure_context` is stored as
/// the draft's operation. `connect` is not stored on the draft; it only
/// feeds the retryable classification together with `status_code` and
/// `timeout`.
fn build_openai_image_failure_audit_draft(
    request_url: &str,
    failure_context: &str,
    failure_stage: &'static str,
    status_code: Option<u16>,
    status_class: Option<&'static str>,
    timeout: bool,
    connect: bool,
    error_message: &str,
    error_source: Option<String>,
    raw_excerpt: Option<String>,
    latency_ms: Option<u64>,
    prompt_chars: Option<usize>,
    reference_image_count: Option<usize>,
) -> ExternalApiFailureDraft {
    let retryable = is_retryable_external_api_failure(status_code, timeout, connect);
    let base_draft = ExternalApiFailureDraft::new(
        VECTOR_ENGINE_PROVIDER,
        request_url,
        failure_context,
        failure_stage,
        error_message,
    );
    base_draft
        .with_status_code(status_code)
        .with_optional_status_class(status_class)
        .with_timeout(timeout)
        .with_retryable(retryable)
        .with_error_source(error_source)
        .with_raw_excerpt(raw_excerpt)
        .with_latency_ms(latency_ms)
        .with_prompt_chars(prompt_chars)
        .with_reference_image_count(reference_image_count)
        .with_image_model(Some(VECTOR_ENGINE_GPT_IMAGE_2_MODEL))
}
fn parse_api_error_message(raw_text: &str, fallback_message: &str) -> String {
if raw_text.trim().is_empty() {
return fallback_message.to_string();
@@ -629,11 +1081,13 @@ mod tests {
base_url: "https://vector.example".to_string(),
api_key: "test-key".to_string(),
request_timeout_ms: 1_000_000,
external_api_audit_state: None,
};
let v1_settings = OpenAiImageSettings {
base_url: "https://vector.example/v1".to_string(),
api_key: "test-key".to_string(),
request_timeout_ms: 1_000_000,
external_api_audit_state: None,
};
assert_eq!(
@@ -658,4 +1112,41 @@ mod tests {
assert_eq!(images.images[0].mime_type, "image/png");
assert_eq!(images.images[0].extension, "png");
}
// An upstream 429 built through the VectorEngine helper yields a tracking
// draft with the provider scope, a derived `4xx` class, `retryable = true`
// and the VectorEngine image model.
#[test]
fn vector_engine_upstream_failure_builds_tracking_ready_audit_event() {
let audit = build_openai_image_failure_audit_draft(
"https://vector.example/v1/images/generations",
"拼图 UI 背景图生成失败",
"upstream_status",
Some(429),
None,
false,
false,
"上游限流",
None,
Some("{\"error\":\"rate limited\"}".to_string()),
Some(321),
Some(42),
Some(1),
);
let tracking = crate::external_api_audit::build_external_api_failure_tracking_draft(&audit);
assert_eq!(
tracking.event_key,
crate::external_api_audit::EXTERNAL_API_FAILURE_EVENT_KEY
);
assert_eq!(tracking.scope_id, VECTOR_ENGINE_PROVIDER);
assert_eq!(tracking.metadata["provider"], VECTOR_ENGINE_PROVIDER);
assert_eq!(tracking.metadata["statusCode"], 429);
assert_eq!(tracking.metadata["statusClass"], "4xx");
assert_eq!(tracking.metadata["failureStage"], "upstream_status");
// No timeout or connect flag was passed, so retryability comes from the 429 alone.
assert_eq!(tracking.metadata["retryable"], true);
assert_eq!(tracking.metadata["promptChars"], 42);
assert_eq!(tracking.metadata["referenceImageCount"], 1);
assert_eq!(
tracking.metadata["imageModel"],
VECTOR_ENGINE_GPT_IMAGE_2_MODEL
);
}
}

View File

@@ -172,6 +172,23 @@ pub(crate) fn update_tracking_outbox_pending_files(files: usize) {
TRACKING_OUTBOX_PENDING_FILES.store(files.min(i64::MAX as usize) as i64, Ordering::Relaxed);
}
/// Adds one to the external API failure counter, labelled with the provider,
/// failure stage, status class and retryable flag.
pub(crate) fn record_external_api_failure(
    provider: &'static str,
    failure_stage: &'static str,
    status_class: &'static str,
    retryable: bool,
) {
    let metrics = external_api_metrics();
    let attributes = [
        KeyValue::new("provider", provider),
        KeyValue::new("failure_stage", failure_stage),
        KeyValue::new("status_class", status_class),
        KeyValue::new("retryable", retryable),
    ];
    metrics.failures.add(1, &attributes);
}
fn track_response_body_in_flight(response: Response<Body>) -> Response<Body> {
response.map(|body| {
HTTP_RESPONSE_BODY_IN_FLIGHT.fetch_add(1, Ordering::Relaxed);
@@ -211,6 +228,10 @@ struct TrackingOutboxMetrics {
flushed_bytes: Counter<u64>,
}
/// Metric instruments for external API calls.
struct ExternalApiMetrics {
// Counter of failed external API calls (`genarrative.external_api.failures`).
failures: Counter<u64>,
}
struct HttpRequestPermitsAvailableGauges {
default: Arc<AtomicI64>,
gallery: Arc<AtomicI64>,
@@ -359,6 +380,21 @@ fn tracking_outbox_metrics() -> &'static TrackingOutboxMetrics {
})
}
/// Returns the process-wide external API metric instruments, creating them
/// from the `genarrative-api` meter on first use.
fn external_api_metrics() -> &'static ExternalApiMetrics {
    static METRICS: std::sync::OnceLock<ExternalApiMetrics> = std::sync::OnceLock::new();

    /// Builds the instruments; run at most once through the `OnceLock`.
    fn build_metrics() -> ExternalApiMetrics {
        let failures = global::meter("genarrative-api")
            .u64_counter("genarrative.external_api.failures")
            .with_description("External API call failures grouped by provider and failure stage")
            .build();
        ExternalApiMetrics { failures }
    }

    METRICS.get_or_init(build_metrics)
}
fn register_http_request_permits_available_metric() -> HttpRequestPermitsAvailableGauges {
let gauges = HttpRequestPermitsAvailableGauges::new();
let meter = global::meter("genarrative-api");

View File

@@ -584,6 +584,26 @@ async fn record_route_tracking_event_via_outbox_after_success(
record_tracking_event_input_after_success(state, request_context, event).await;
}
/// Converts a tracking draft into the runtime tracking event input.
///
/// The event is stamped with the current UTC time in microseconds, and the
/// same timestamp is passed to `build_tracking_event_id` to derive the id.
pub(crate) fn build_tracking_event_input(
    draft: TrackingEventDraft,
) -> module_runtime::RuntimeTrackingEventInput {
    let now_micros = OffsetDateTime::now_utc().unix_timestamp_nanos() / 1_000;
    let event_id = build_tracking_event_id(&draft, now_micros);
    let event_key = draft.event_key.to_string();
    let module_key = draft.module_key.map(str::to_string);
    let metadata_json = draft.metadata.to_string();

    module_runtime::RuntimeTrackingEventInput {
        event_id,
        event_key,
        scope_kind: draft.scope_kind,
        scope_id: draft.scope_id,
        user_id: draft.user_id,
        owner_user_id: draft.owner_user_id,
        profile_id: draft.profile_id,
        module_key,
        metadata_json,
        occurred_at_micros: now_micros as i64,
    }
}
async fn record_tracking_event_input_after_success(
state: &AppState,
request_context: &RequestContext,
@@ -642,26 +662,6 @@ async fn record_tracking_event_input_after_success(
}
}
/// Converts a tracking draft into the runtime tracking event input, stamping
/// it with the current UTC time in microseconds.
fn build_tracking_event_input(
draft: TrackingEventDraft,
) -> module_runtime::RuntimeTrackingEventInput {
// Nanoseconds since the Unix epoch, reduced to microseconds.
let occurred_at_micros = OffsetDateTime::now_utc().unix_timestamp_nanos() / 1_000;
let event_id = build_tracking_event_id(&draft, occurred_at_micros);
module_runtime::RuntimeTrackingEventInput {
event_id,
event_key: draft.event_key.to_string(),
scope_kind: draft.scope_kind,
scope_id: draft.scope_id,
user_id: draft.user_id,
owner_user_id: draft.owner_user_id,
profile_id: draft.profile_id,
module_key: draft.module_key.map(str::to_string),
metadata_json: draft.metadata.to_string(),
occurred_at_micros: occurred_at_micros as i64,
}
}
fn build_tracking_event_id(draft: &TrackingEventDraft, occurred_at_micros: i128) -> String {
if draft.event_key == "daily_login"
&& draft.scope_kind == RuntimeTrackingScopeKind::User