补充 release SpacetimeDB 健康检查与巡检防回退

增加 SpacetimeDB 阶段化健康检查与 /readyz 阶段输出
记录 procedure/reducer/read 失败的阶段和耗时
补充 release 健康巡检 systemd timer 与生产 ops 预检
同步 API 构建部署、provision 脚本和运维文档
This commit is contained in:
kdletters
2026-06-10 11:35:39 +08:00
parent 7aafb37f04
commit 9db467d23f
17 changed files with 1147 additions and 70 deletions

View File

@@ -269,6 +269,7 @@ mod tests {
};
use reqwest::Client;
use serde_json::Value;
use spacetime_client::{SpacetimeClientHealthSnapshot, SpacetimeClientStage};
use time::OffsetDateTime;
use tokio::net::TcpListener;
use tower::ServiceExt;
@@ -724,6 +725,45 @@ mod tests {
);
}
/// `/readyz` must answer 503 and surface the failing SpacetimeDB stage and
/// timeout when the injected health snapshot reports `ok: false`.
#[tokio::test]
async fn readyz_reports_spacetime_health_stage() {
    let state = AppState::new(AppConfig::default()).expect("state should build");

    // Inject an unhealthy snapshot so the readiness handler takes the failure path.
    let timeout_message = "SpacetimeDB procedure 调用超时".to_string();
    let unhealthy_snapshot = SpacetimeClientHealthSnapshot {
        ok: false,
        stage: SpacetimeClientStage::ProcedureResult,
        checked_at_micros: 1_713_680_000_000_000,
        elapsed_ms: 2_000,
        timeout_ms: 2_000,
        error: Some(timeout_message.clone()),
        last_success_at_micros: Some(1_713_679_999_000_000),
        last_error: Some(timeout_message),
    };
    state.set_test_spacetime_health(unhealthy_snapshot);

    let readyz_request = Request::builder()
        .uri("/readyz")
        .header("x-request-id", "req-ready-spacetime")
        .body(Body::empty())
        .expect("readyz request should build");
    let response = build_router(state)
        .oneshot(readyz_request)
        .await
        .expect("readyz request should succeed");
    assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE);

    // The error details carry the reason plus the serialized snapshot.
    let body = read_json_response(response).await;
    let details = &body["error"]["details"];
    assert_eq!(details["reason"], "spacetime_unhealthy");
    assert_eq!(details["spacetime"]["stage"], "procedure_result");
    assert_eq!(details["spacetime"]["timeoutMs"], Value::from(2_000));
}
#[tokio::test]
async fn creative_agent_draft_edit_rejects_unconfirmed_template_session() {
let app = build_internal_creative_agent_app();

View File

@@ -12,6 +12,7 @@ use platform_speech::{
const DEFAULT_INTERNAL_API_SECRET: &str = "genarrative-dev-internal-bridge";
const SPACETIME_LOCAL_CONFIG_FILE: &str = "spacetime.local.json";
const DEFAULT_SPACETIME_HEALTH_CHECK_TIMEOUT_SECONDS: u64 = 2;
pub(crate) const DEFAULT_VECTOR_ENGINE_IMAGE_REQUEST_TIMEOUT_MS: u64 = 1_000_000;
// 集中管理 api-server 的启动配置,避免入口层直接散落环境变量解析逻辑。
@@ -118,6 +119,7 @@ pub struct AppConfig {
pub spacetime_token: Option<String>,
pub spacetime_pool_size: u32,
pub spacetime_procedure_timeout: Duration,
pub spacetime_health_check_timeout: Duration,
pub llm_provider: LlmProvider,
pub llm_base_url: String,
pub llm_api_key: Option<String>,
@@ -276,6 +278,9 @@ impl Default for AppConfig {
spacetime_token: None,
spacetime_pool_size: 4,
spacetime_procedure_timeout: Duration::from_secs(30),
spacetime_health_check_timeout: Duration::from_secs(
DEFAULT_SPACETIME_HEALTH_CHECK_TIMEOUT_SECONDS,
),
llm_provider: LlmProvider::Ark,
llm_base_url: String::new(),
llm_api_key: None,
@@ -704,6 +709,12 @@ impl AppConfig {
config.spacetime_procedure_timeout =
Duration::from_secs(spacetime_procedure_timeout_seconds);
}
if let Some(spacetime_health_check_timeout_seconds) =
read_first_duration_seconds_env(&["GENARRATIVE_SPACETIME_HEALTH_CHECK_TIMEOUT_SECONDS"])
{
config.spacetime_health_check_timeout =
Duration::from_secs(spacetime_health_check_timeout_seconds);
}
if let Some(llm_provider) =
read_first_llm_provider_env(&["GENARRATIVE_LLM_PROVIDER", "LLM_PROVIDER"])
@@ -1610,6 +1621,26 @@ mod tests {
}
}
/// `AppConfig::from_env` must honour the
/// `GENARRATIVE_SPACETIME_HEALTH_CHECK_TIMEOUT_SECONDS` override.
#[test]
fn from_env_reads_spacetime_health_check_timeout() {
    const TIMEOUT_ENV_VAR: &str = "GENARRATIVE_SPACETIME_HEALTH_CHECK_TIMEOUT_SECONDS";

    let _guard = ENV_LOCK
        .get_or_init(|| Mutex::new(()))
        .lock()
        .expect("env lock should not poison");

    // SAFETY: the environment is only mutated while the ENV_LOCK guard above is
    // held, which serializes this with every other test taking the same lock.
    // NOTE(review): assumes no unlocked code reads the environment concurrently.
    unsafe {
        // `set_var` overwrites any previous value, so no prior removal is needed.
        std::env::set_var(TIMEOUT_ENV_VAR, "3");
    }

    let config = AppConfig::from_env();

    // Restore the environment BEFORE asserting: a failed assertion panics, and
    // cleanup placed after it would never run, leaking the override into the
    // rest of the test process.
    // SAFETY: same invariant as above; the ENV_LOCK guard is still held.
    unsafe {
        std::env::remove_var(TIMEOUT_ENV_VAR);
    }

    assert_eq!(config.spacetime_health_check_timeout.as_secs(), 3);
}
#[test]
fn default_keeps_structured_llm_web_search_disabled() {
let config = AppConfig::default();

View File

@@ -10,6 +10,7 @@ use crate::{
api_response::json_success_body, http_error::AppError, request_context::RequestContext,
state::AppState,
};
use spacetime_client::SpacetimeClientHealthSnapshot;
pub async fn health_check(Extension(request_context): Extension<RequestContext>) -> Json<Value> {
json_success_body(
@@ -25,23 +26,49 @@ pub async fn readiness_check(
State(state): State<AppState>,
Extension(request_context): Extension<RequestContext>,
) -> Response {
if state.is_ready() {
if !state.is_ready() {
return AppError::from_status(StatusCode::SERVICE_UNAVAILABLE)
.with_message("api-server 正在退出,不再接收新流量")
.with_details(json!({
"reason": "api_server_draining",
"ready": false,
}))
.into_response_with_context(Some(&request_context));
}
let spacetime_health = state.spacetime_health_check().await;
if spacetime_health.ok {
return json_success_body(
Some(&request_context),
json!({
"ok": true,
"ready": true,
"service": "genarrative-api-server",
"spacetime": spacetime_health_to_json(&spacetime_health),
}),
)
.into_response();
}
AppError::from_status(StatusCode::SERVICE_UNAVAILABLE)
.with_message("api-server 正在退出,不再接收新流量")
.with_message("SpacetimeDB 连接健康检查失败,api-server 暂不接收新流量")
.with_details(json!({
"reason": "api_server_draining",
"reason": "spacetime_unhealthy",
"ready": false,
"spacetime": spacetime_health_to_json(&spacetime_health),
}))
.into_response_with_context(Some(&request_context))
}
fn spacetime_health_to_json(snapshot: &SpacetimeClientHealthSnapshot) -> Value {
json!({
"ok": snapshot.ok,
"stage": snapshot.stage.as_str(),
"checkedAtMicros": snapshot.checked_at_micros,
"elapsedMs": snapshot.elapsed_ms,
"timeoutMs": snapshot.timeout_ms,
"error": snapshot.error,
"lastSuccessAtMicros": snapshot.last_success_at_micros,
"lastError": snapshot.last_error,
})
}

View File

@@ -31,7 +31,9 @@ use platform_wechat::{WechatClient, WechatConfig, pay::WechatPayClient};
use serde_json::Value;
use shared_contracts::creation_entry_config::CreationEntryConfigResponse;
use shared_contracts::creative_agent::CreativeAgentSessionSnapshot;
use spacetime_client::{SpacetimeClient, SpacetimeClientConfig, SpacetimeClientError};
use spacetime_client::{
SpacetimeClient, SpacetimeClientConfig, SpacetimeClientError, SpacetimeClientHealthSnapshot,
};
use time::OffsetDateTime;
use tokio::sync::{Semaphore, broadcast};
use tracing::{info, warn};
@@ -242,6 +244,8 @@ pub struct AppStateInner {
refresh_cookie_config: RefreshCookieConfig,
#[cfg(test)]
test_creation_entry_config: Arc<Mutex<Option<CreationEntryConfigResponse>>>,
#[cfg(test)]
test_spacetime_health: Arc<Mutex<Option<SpacetimeClientHealthSnapshot>>>,
oss_client: Option<OssClient>,
#[cfg_attr(test, allow(dead_code))]
auth_store: InMemoryAuthStore,
@@ -418,6 +422,10 @@ impl AppState {
test_creation_entry_config: Arc::new(Mutex::new(Some(
crate::creation_entry_config::test_creation_entry_config_response(),
))),
#[cfg(test)]
test_spacetime_health: Arc::new(Mutex::new(Some(
SpacetimeClientHealthSnapshot::healthy_for_test(),
))),
oss_client,
auth_store,
password_entry_service,
@@ -467,6 +475,30 @@ impl AppState {
self.ready.store(false, Ordering::Release);
}
/// Returns the current SpacetimeDB health snapshot.
///
/// In test builds an injected snapshot, when present, is returned without
/// touching the client. Otherwise the client's `health_check` is awaited with
/// the configured `spacetime_health_check_timeout`.
pub async fn spacetime_health_check(&self) -> SpacetimeClientHealthSnapshot {
    #[cfg(test)]
    {
        // Clone the override out of the mutex, then decide outside the lock expression.
        let injected = self
            .test_spacetime_health
            .lock()
            .expect("test spacetime health should lock")
            .clone();
        if let Some(snapshot) = injected {
            return snapshot;
        }
    }

    let timeout = self.config.spacetime_health_check_timeout;
    self.spacetime_client.health_check(timeout).await
}
/// Test-only hook: stores `snapshot` as the override that
/// `spacetime_health_check` returns in test builds.
#[cfg(test)]
pub(crate) fn set_test_spacetime_health(&self, snapshot: SpacetimeClientHealthSnapshot) {
    let mut slot = self
        .test_spacetime_health
        .lock()
        .expect("test spacetime health should lock");
    *slot = Some(snapshot);
}
pub async fn upsert_creation_entry_type_config(
&self,
input: module_runtime::CreationEntryTypeAdminUpsertInput,