补充 release SpacetimeDB 健康检查与巡检防回退
增加 SpacetimeDB 阶段化健康检查与 /readyz 阶段输出；记录 procedure/reducer/read 失败的阶段和耗时；补充 release 健康巡检 systemd timer 与生产 ops 预检；同步 API 构建部署、provision 脚本和运维文档。
This commit is contained in:
@@ -269,6 +269,7 @@ mod tests {
|
||||
};
|
||||
use reqwest::Client;
|
||||
use serde_json::Value;
|
||||
use spacetime_client::{SpacetimeClientHealthSnapshot, SpacetimeClientStage};
|
||||
use time::OffsetDateTime;
|
||||
use tokio::net::TcpListener;
|
||||
use tower::ServiceExt;
|
||||
@@ -724,6 +725,45 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn readyz_reports_spacetime_health_stage() {
|
||||
let state = AppState::new(AppConfig::default()).expect("state should build");
|
||||
state.set_test_spacetime_health(SpacetimeClientHealthSnapshot {
|
||||
ok: false,
|
||||
stage: SpacetimeClientStage::ProcedureResult,
|
||||
checked_at_micros: 1_713_680_000_000_000,
|
||||
elapsed_ms: 2_000,
|
||||
timeout_ms: 2_000,
|
||||
error: Some("SpacetimeDB procedure 调用超时".to_string()),
|
||||
last_success_at_micros: Some(1_713_679_999_000_000),
|
||||
last_error: Some("SpacetimeDB procedure 调用超时".to_string()),
|
||||
});
|
||||
let app = build_router(state);
|
||||
|
||||
let response = app
|
||||
.oneshot(
|
||||
Request::builder()
|
||||
.uri("/readyz")
|
||||
.header("x-request-id", "req-ready-spacetime")
|
||||
.body(Body::empty())
|
||||
.expect("readyz request should build"),
|
||||
)
|
||||
.await
|
||||
.expect("readyz request should succeed");
|
||||
|
||||
assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE);
|
||||
let body = read_json_response(response).await;
|
||||
assert_eq!(body["error"]["details"]["reason"], "spacetime_unhealthy");
|
||||
assert_eq!(
|
||||
body["error"]["details"]["spacetime"]["stage"],
|
||||
"procedure_result"
|
||||
);
|
||||
assert_eq!(
|
||||
body["error"]["details"]["spacetime"]["timeoutMs"],
|
||||
Value::from(2_000)
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn creative_agent_draft_edit_rejects_unconfirmed_template_session() {
|
||||
let app = build_internal_creative_agent_app();
|
||||
|
||||
@@ -12,6 +12,7 @@ use platform_speech::{
|
||||
|
||||
const DEFAULT_INTERNAL_API_SECRET: &str = "genarrative-dev-internal-bridge";
|
||||
const SPACETIME_LOCAL_CONFIG_FILE: &str = "spacetime.local.json";
|
||||
const DEFAULT_SPACETIME_HEALTH_CHECK_TIMEOUT_SECONDS: u64 = 2;
|
||||
pub(crate) const DEFAULT_VECTOR_ENGINE_IMAGE_REQUEST_TIMEOUT_MS: u64 = 1_000_000;
|
||||
|
||||
// 集中管理 api-server 的启动配置,避免入口层直接散落环境变量解析逻辑。
|
||||
@@ -118,6 +119,7 @@ pub struct AppConfig {
|
||||
pub spacetime_token: Option<String>,
|
||||
pub spacetime_pool_size: u32,
|
||||
pub spacetime_procedure_timeout: Duration,
|
||||
pub spacetime_health_check_timeout: Duration,
|
||||
pub llm_provider: LlmProvider,
|
||||
pub llm_base_url: String,
|
||||
pub llm_api_key: Option<String>,
|
||||
@@ -276,6 +278,9 @@ impl Default for AppConfig {
|
||||
spacetime_token: None,
|
||||
spacetime_pool_size: 4,
|
||||
spacetime_procedure_timeout: Duration::from_secs(30),
|
||||
spacetime_health_check_timeout: Duration::from_secs(
|
||||
DEFAULT_SPACETIME_HEALTH_CHECK_TIMEOUT_SECONDS,
|
||||
),
|
||||
llm_provider: LlmProvider::Ark,
|
||||
llm_base_url: String::new(),
|
||||
llm_api_key: None,
|
||||
@@ -704,6 +709,12 @@ impl AppConfig {
|
||||
config.spacetime_procedure_timeout =
|
||||
Duration::from_secs(spacetime_procedure_timeout_seconds);
|
||||
}
|
||||
if let Some(spacetime_health_check_timeout_seconds) =
|
||||
read_first_duration_seconds_env(&["GENARRATIVE_SPACETIME_HEALTH_CHECK_TIMEOUT_SECONDS"])
|
||||
{
|
||||
config.spacetime_health_check_timeout =
|
||||
Duration::from_secs(spacetime_health_check_timeout_seconds);
|
||||
}
|
||||
|
||||
if let Some(llm_provider) =
|
||||
read_first_llm_provider_env(&["GENARRATIVE_LLM_PROVIDER", "LLM_PROVIDER"])
|
||||
@@ -1610,6 +1621,26 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
/// `AppConfig::from_env` must take the SpacetimeDB health-check timeout from
/// `GENARRATIVE_SPACETIME_HEALTH_CHECK_TIMEOUT_SECONDS`.
#[test]
fn from_env_reads_spacetime_health_check_timeout() {
    const TIMEOUT_ENV: &str = "GENARRATIVE_SPACETIME_HEALTH_CHECK_TIMEOUT_SECONDS";

    // Held for the whole test so other tests that take the same lock cannot
    // observe or clobber the variable while it is set.
    let _guard = ENV_LOCK
        .get_or_init(|| Mutex::new(()))
        .lock()
        .expect("env lock should not poison");

    // SAFETY: environment mutation is serialized through `ENV_LOCK`, which is
    // held until the end of this test. `set_var` overwrites any existing value,
    // so no prior `remove_var` is needed.
    unsafe {
        std::env::set_var(TIMEOUT_ENV, "3");
    }

    let config = AppConfig::from_env();

    // Clean up BEFORE asserting: a failed assertion must not leave the variable
    // set for tests that run afterwards.
    // SAFETY: still under `ENV_LOCK`, see above.
    unsafe {
        std::env::remove_var(TIMEOUT_ENV);
    }

    assert_eq!(config.spacetime_health_check_timeout.as_secs(), 3);
}
|
||||
|
||||
#[test]
|
||||
fn default_keeps_structured_llm_web_search_disabled() {
|
||||
let config = AppConfig::default();
|
||||
|
||||
@@ -10,6 +10,7 @@ use crate::{
|
||||
api_response::json_success_body, http_error::AppError, request_context::RequestContext,
|
||||
state::AppState,
|
||||
};
|
||||
use spacetime_client::SpacetimeClientHealthSnapshot;
|
||||
|
||||
pub async fn health_check(Extension(request_context): Extension<RequestContext>) -> Json<Value> {
|
||||
json_success_body(
|
||||
@@ -25,23 +26,49 @@ pub async fn readiness_check(
|
||||
State(state): State<AppState>,
|
||||
Extension(request_context): Extension<RequestContext>,
|
||||
) -> Response {
|
||||
if state.is_ready() {
|
||||
if !state.is_ready() {
|
||||
return AppError::from_status(StatusCode::SERVICE_UNAVAILABLE)
|
||||
.with_message("api-server 正在退出,不再接收新流量")
|
||||
.with_details(json!({
|
||||
"reason": "api_server_draining",
|
||||
"ready": false,
|
||||
}))
|
||||
.into_response_with_context(Some(&request_context));
|
||||
}
|
||||
|
||||
let spacetime_health = state.spacetime_health_check().await;
|
||||
if spacetime_health.ok {
|
||||
return json_success_body(
|
||||
Some(&request_context),
|
||||
json!({
|
||||
"ok": true,
|
||||
"ready": true,
|
||||
"service": "genarrative-api-server",
|
||||
"spacetime": spacetime_health_to_json(&spacetime_health),
|
||||
}),
|
||||
)
|
||||
.into_response();
|
||||
}
|
||||
|
||||
AppError::from_status(StatusCode::SERVICE_UNAVAILABLE)
|
||||
.with_message("api-server 正在退出,不再接收新流量")
|
||||
.with_message("SpacetimeDB 连接健康检查失败,api-server 暂不接收新流量")
|
||||
.with_details(json!({
|
||||
"reason": "api_server_draining",
|
||||
"reason": "spacetime_unhealthy",
|
||||
"ready": false,
|
||||
"spacetime": spacetime_health_to_json(&spacetime_health),
|
||||
}))
|
||||
.into_response_with_context(Some(&request_context))
|
||||
}
|
||||
|
||||
fn spacetime_health_to_json(snapshot: &SpacetimeClientHealthSnapshot) -> Value {
|
||||
json!({
|
||||
"ok": snapshot.ok,
|
||||
"stage": snapshot.stage.as_str(),
|
||||
"checkedAtMicros": snapshot.checked_at_micros,
|
||||
"elapsedMs": snapshot.elapsed_ms,
|
||||
"timeoutMs": snapshot.timeout_ms,
|
||||
"error": snapshot.error,
|
||||
"lastSuccessAtMicros": snapshot.last_success_at_micros,
|
||||
"lastError": snapshot.last_error,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -31,7 +31,9 @@ use platform_wechat::{WechatClient, WechatConfig, pay::WechatPayClient};
|
||||
use serde_json::Value;
|
||||
use shared_contracts::creation_entry_config::CreationEntryConfigResponse;
|
||||
use shared_contracts::creative_agent::CreativeAgentSessionSnapshot;
|
||||
use spacetime_client::{SpacetimeClient, SpacetimeClientConfig, SpacetimeClientError};
|
||||
use spacetime_client::{
|
||||
SpacetimeClient, SpacetimeClientConfig, SpacetimeClientError, SpacetimeClientHealthSnapshot,
|
||||
};
|
||||
use time::OffsetDateTime;
|
||||
use tokio::sync::{Semaphore, broadcast};
|
||||
use tracing::{info, warn};
|
||||
@@ -242,6 +244,8 @@ pub struct AppStateInner {
|
||||
refresh_cookie_config: RefreshCookieConfig,
|
||||
#[cfg(test)]
|
||||
test_creation_entry_config: Arc<Mutex<Option<CreationEntryConfigResponse>>>,
|
||||
#[cfg(test)]
|
||||
test_spacetime_health: Arc<Mutex<Option<SpacetimeClientHealthSnapshot>>>,
|
||||
oss_client: Option<OssClient>,
|
||||
#[cfg_attr(test, allow(dead_code))]
|
||||
auth_store: InMemoryAuthStore,
|
||||
@@ -418,6 +422,10 @@ impl AppState {
|
||||
test_creation_entry_config: Arc::new(Mutex::new(Some(
|
||||
crate::creation_entry_config::test_creation_entry_config_response(),
|
||||
))),
|
||||
#[cfg(test)]
|
||||
test_spacetime_health: Arc::new(Mutex::new(Some(
|
||||
SpacetimeClientHealthSnapshot::healthy_for_test(),
|
||||
))),
|
||||
oss_client,
|
||||
auth_store,
|
||||
password_entry_service,
|
||||
@@ -467,6 +475,30 @@ impl AppState {
|
||||
self.ready.store(false, Ordering::Release);
|
||||
}
|
||||
|
||||
/// Probes SpacetimeDB health and returns the resulting snapshot.
///
/// In test builds, a snapshot stored in `test_spacetime_health` takes
/// precedence and is returned without calling the client. Otherwise the call is
/// delegated to the SpacetimeDB client's `health_check`, using the configured
/// `spacetime_health_check_timeout`.
pub async fn spacetime_health_check(&self) -> SpacetimeClientHealthSnapshot {
    #[cfg(test)]
    {
        // Copy the override out first so the mutex guard is released before
        // any early return or await.
        let injected = self
            .test_spacetime_health
            .lock()
            .expect("test spacetime health should lock")
            .clone();
        if let Some(snapshot) = injected {
            return snapshot;
        }
    }

    let timeout = self.config.spacetime_health_check_timeout;
    self.spacetime_client.health_check(timeout).await
}
|
||||
|
||||
/// Test-only hook: stores `snapshot` as the value that
/// `spacetime_health_check` returns instead of calling the real client.
#[cfg(test)]
pub(crate) fn set_test_spacetime_health(&self, snapshot: SpacetimeClientHealthSnapshot) {
    let mut slot = self
        .test_spacetime_health
        .lock()
        .expect("test spacetime health should lock");
    *slot = Some(snapshot);
}
|
||||
|
||||
pub async fn upsert_creation_entry_type_config(
|
||||
&self,
|
||||
input: module_runtime::CreationEntryTypeAdminUpsertInput,
|
||||
|
||||
Reference in New Issue
Block a user