补充 release SpacetimeDB 健康检查与巡检防回退

增加 SpacetimeDB 阶段化健康检查与 /readyz 阶段输出
记录 procedure/reducer/read 失败的阶段和耗时
补充 release 健康巡检 systemd timer 与生产 ops 预检
同步 API 构建部署、provision 脚本和运维文档
This commit is contained in:
kdletters
2026-06-10 11:35:39 +08:00
parent 7aafb37f04
commit 9db467d23f
17 changed files with 1147 additions and 70 deletions

View File

@@ -105,8 +105,8 @@ pub mod auth;
pub mod bark_battle;
pub use bark_battle::{
BarkBattleDraftConfigUpsertRecordInput, BarkBattleDraftCreateRecordInput,
BarkBattleRunFinishRecordInput, BarkBattleRunStartRecordInput,
BarkBattleWorkDeleteRecordInput, BarkBattleWorkPublishRecordInput,
BarkBattleRunFinishRecordInput, BarkBattleRunStartRecordInput, BarkBattleWorkDeleteRecordInput,
BarkBattleWorkPublishRecordInput,
};
pub mod big_fish;
pub mod combat;
@@ -132,7 +132,7 @@ use std::{
sync::atomic::{AtomicBool, Ordering},
sync::{Arc, Mutex},
thread::JoinHandle,
time::Duration,
time::{Duration, Instant},
};
use module_ai::{
@@ -241,6 +241,7 @@ use tokio::{
sync::{OwnedSemaphorePermit, RwLock, Semaphore, oneshot},
time::timeout,
};
use tracing::warn;
use crate::module_bindings::*;
@@ -253,6 +254,60 @@ pub struct SpacetimeClientConfig {
pub procedure_timeout: Duration,
}
/// Stage of a SpacetimeDB client operation, attached to failures and to
/// health snapshots so the failing step can be identified.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SpacetimeClientStage {
    /// No failing stage; reported by a successful health check.
    Ready,
    /// Waiting for a permit from the connection pool.
    PoolAcquire,
    /// Building the connection object.
    ConnectBuild,
    /// Waiting for the connect callback after the connection was built.
    ConnectHandshake,
    /// Waiting for a cached read-model subscription to be applied.
    ReadModelSubscribe,
    /// Waiting for a procedure call's result.
    ProcedureResult,
    /// Waiting for a reducer call's result.
    ReducerResult,
    /// Reading from the locally cached read models.
    ReadCache,
}

impl SpacetimeClientStage {
    /// Returns the fixed snake_case label for this stage, as emitted in the
    /// `spacetime_stage` log field.
    pub fn as_str(self) -> &'static str {
        match self {
            SpacetimeClientStage::Ready => "ready",
            SpacetimeClientStage::PoolAcquire => "pool_acquire",
            SpacetimeClientStage::ConnectBuild => "connect_build",
            SpacetimeClientStage::ConnectHandshake => "connect_handshake",
            SpacetimeClientStage::ReadModelSubscribe => "read_model_subscribe",
            SpacetimeClientStage::ProcedureResult => "procedure_result",
            SpacetimeClientStage::ReducerResult => "reducer_result",
            SpacetimeClientStage::ReadCache => "read_cache",
        }
    }
}
/// Point-in-time result of a SpacetimeDB client health probe.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct SpacetimeClientHealthSnapshot {
    /// Whether the probe succeeded.
    pub ok: bool,
    /// Stage at which the probe failed, or `Ready` when it succeeded.
    pub stage: SpacetimeClientStage,
    /// Timestamp (from `current_unix_micros`) captured when the probe started.
    pub checked_at_micros: i64,
    /// Time the probe took, in milliseconds.
    pub elapsed_ms: u64,
    /// Timeout the probe ran with, in milliseconds.
    pub timeout_ms: u64,
    /// Error message of this probe; `None` when it succeeded.
    pub error: Option<String>,
    /// `checked_at_micros` of the most recent successful probe, if any.
    pub last_success_at_micros: Option<i64>,
    /// Message of the most recent failed probe; cleared by a later success.
    pub last_error: Option<String>,
}

impl SpacetimeClientHealthSnapshot {
    /// Builds a snapshot describing a successful probe, for use in tests.
    ///
    /// The clock is read once so that `last_success_at_micros` always equals
    /// `checked_at_micros`, matching what a real successful health check
    /// reports.
    pub fn healthy_for_test() -> Self {
        let now_micros = current_unix_micros();
        Self {
            ok: true,
            stage: SpacetimeClientStage::Ready,
            checked_at_micros: now_micros,
            elapsed_ms: 0,
            timeout_ms: 0,
            error: None,
            last_success_at_micros: Some(now_micros),
            last_error: None,
        }
    }
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct AuthStoreSnapshotRecord {
pub snapshot_json: Option<String>,
@@ -270,6 +325,7 @@ pub struct AuthStoreSnapshotImportRecord {
/// SpacetimeDB client: owns its configuration, a shared connection pool, and
/// the bookkeeping written by health checks.
pub struct SpacetimeClient {
/// Client configuration; its `procedure_timeout` bounds pooled operations.
config: SpacetimeClientConfig,
/// Connection pool shared through an `Arc`.
pool: Arc<SpacetimeConnectionPool>,
/// Last success timestamp and last error recorded by `health_check`.
health_state: Arc<RwLock<SpacetimeClientHealthState>>,
/// Cached creation-entry config; `None` until a value is stored.
creation_entry_config_cache: Arc<RwLock<Option<CreationEntryConfigRecord>>>,
/// NOTE(review): presumably set once a legacy custom-world gallery sync has
/// been attempted — its usage is outside this view; confirm.
custom_world_gallery_legacy_sync_attempted: Arc<AtomicBool>,
}
@@ -296,6 +352,24 @@ struct SpacetimeConnectionPool {
permits: Arc<Semaphore>,
}
/// Health bookkeeping stored behind `SpacetimeClient::health_state`.
/// Both fields start as `None` via `Default`.
#[derive(Debug, Default)]
struct SpacetimeClientHealthState {
/// Check timestamp of the most recent successful health check, if any.
last_success_at_micros: Option<i64>,
/// Message of the most recent failed health check; reset to `None` when a
/// later check succeeds.
last_error: Option<String>,
}
/// A client error paired with the stage in which it occurred.
#[derive(Debug)]
struct SpacetimeStageError {
/// Stage that was running when the error was produced.
stage: SpacetimeClientStage,
/// The underlying client error.
error: SpacetimeClientError,
}
impl SpacetimeStageError {
/// Tags `error` with the `stage` it was raised in.
fn new(stage: SpacetimeClientStage, error: SpacetimeClientError) -> Self {
Self { stage, error }
}
}
struct PooledConnectionSlot {
connection: Option<PooledConnection>,
in_use: bool,
@@ -341,6 +415,7 @@ impl SpacetimeClient {
Self {
config,
pool,
health_state: Arc::new(RwLock::new(SpacetimeClientHealthState::default())),
creation_entry_config_cache: Arc::new(RwLock::new(None)),
custom_world_gallery_legacy_sync_attempted: Arc::new(AtomicBool::new(false)),
}
@@ -354,29 +429,58 @@ impl SpacetimeClient {
where
T: Send + 'static,
{
let started_at = Instant::now();
let metrics_guard = telemetry::begin_procedure(procedure);
let (sender, receiver) = oneshot::channel();
let result_sender = Arc::new(Mutex::new(Some(sender)));
let final_result = match self.acquire_connection().await {
let final_result = match self
.acquire_connection_with_timeout(self.config.procedure_timeout)
.await
{
Ok(lease) => {
let result = if let Some(connection) = lease.connection.as_ref() {
let (result, failed_stage) = if let Some(connection) = lease.connection.as_ref() {
call(&connection.connection, result_sender.clone());
match timeout(self.config.procedure_timeout, receiver).await {
Ok(inner) => match inner {
Ok(value) => value,
Err(_) => Err(SpacetimeClientError::ConnectDropped),
let stage = SpacetimeClientStage::ProcedureResult;
(
match timeout(self.config.procedure_timeout, receiver).await {
Ok(inner) => match inner {
Ok(value) => value,
Err(_) => Err(SpacetimeClientError::ConnectDropped),
},
Err(_) => Err(Self::resolve_timeout_error(Some(connection), stage)),
},
Err(_) => Err(Self::resolve_timeout_error(Some(connection))),
}
stage,
)
} else {
Err(SpacetimeClientError::Runtime(
"SpacetimeDB 连接租约缺少连接".to_string(),
))
(
Err(SpacetimeClientError::Runtime(
"SpacetimeDB 连接租约缺少连接".to_string(),
)),
SpacetimeClientStage::ProcedureResult,
)
};
self.release_connection(lease).await;
if let Err(error) = &result {
log_spacetime_client_failure(
"procedure",
procedure,
failed_stage,
started_at,
error,
);
}
result
}
Err(error) => Err(error),
Err(error) => {
log_spacetime_client_failure(
"procedure",
procedure,
error.stage,
started_at,
&error.error,
);
Err(error.error)
}
};
metrics_guard.finish(&final_result);
@@ -388,29 +492,58 @@ impl SpacetimeClient {
procedure: &'static str,
call: impl FnOnce(&DbConnection, ReducerResultSender) + Send + 'static,
) -> Result<(), SpacetimeClientError> {
let started_at = Instant::now();
let metrics_guard = telemetry::begin_procedure(procedure);
let (sender, receiver) = oneshot::channel();
let result_sender = Arc::new(Mutex::new(Some(sender)));
let final_result = match self.acquire_connection().await {
let final_result = match self
.acquire_connection_with_timeout(self.config.procedure_timeout)
.await
{
Ok(lease) => {
let result = if let Some(connection) = lease.connection.as_ref() {
let (result, failed_stage) = if let Some(connection) = lease.connection.as_ref() {
call(&connection.connection, result_sender.clone());
match timeout(self.config.procedure_timeout, receiver).await {
Ok(inner) => match inner {
Ok(value) => value,
Err(_) => Err(SpacetimeClientError::ConnectDropped),
let stage = SpacetimeClientStage::ReducerResult;
(
match timeout(self.config.procedure_timeout, receiver).await {
Ok(inner) => match inner {
Ok(value) => value,
Err(_) => Err(SpacetimeClientError::ConnectDropped),
},
Err(_) => Err(Self::resolve_timeout_error(Some(connection), stage)),
},
Err(_) => Err(Self::resolve_timeout_error(Some(connection))),
}
stage,
)
} else {
Err(SpacetimeClientError::Runtime(
"SpacetimeDB 连接租约缺少连接".to_string(),
))
(
Err(SpacetimeClientError::Runtime(
"SpacetimeDB 连接租约缺少连接".to_string(),
)),
SpacetimeClientStage::ReducerResult,
)
};
self.release_connection(lease).await;
if let Err(error) = &result {
log_spacetime_client_failure(
"reducer",
procedure,
failed_stage,
started_at,
error,
);
}
result
}
Err(error) => Err(error),
Err(error) => {
log_spacetime_client_failure(
"reducer",
procedure,
error.stage,
started_at,
&error.error,
);
Err(error.error)
}
};
metrics_guard.finish(&final_result);
@@ -425,11 +558,22 @@ impl SpacetimeClient {
where
T: Send + 'static,
{
let started_at = Instant::now();
let metrics_guard = telemetry::begin_read(read_name);
let lease = match self.acquire_connection().await {
let lease = match self
.acquire_connection_with_timeout(self.config.procedure_timeout)
.await
{
Ok(lease) => lease,
Err(error) => {
let final_result = Err(error);
log_spacetime_client_failure(
"read",
read_name,
error.stage,
started_at,
&error.error,
);
let final_result = Err(error.error);
metrics_guard.finish(&final_result);
return final_result;
}
@@ -443,6 +587,15 @@ impl SpacetimeClient {
};
self.release_connection(lease).await;
if let Err(error) = &final_result {
log_spacetime_client_failure(
"read",
read_name,
SpacetimeClientStage::ReadCache,
started_at,
error,
);
}
metrics_guard.finish(&final_result);
final_result
}
@@ -455,14 +608,75 @@ impl SpacetimeClient {
self.creation_entry_config_cache.read().await.clone()
}
async fn acquire_connection(&self) -> Result<PooledConnectionLease, SpacetimeClientError> {
let permit = timeout(
self.config.procedure_timeout,
self.pool.permits.clone().acquire_owned(),
)
.await
.map_err(|_| SpacetimeClientError::Timeout)?
.map_err(|error| SpacetimeClientError::Runtime(error.to_string()))?;
pub async fn health_check(&self, probe_timeout: Duration) -> SpacetimeClientHealthSnapshot {
let timeout = if probe_timeout.is_zero() {
DEFAULT_PROCEDURE_TIMEOUT
} else {
probe_timeout
};
let started_at = Instant::now();
let checked_at_micros = current_unix_micros();
let result = self.acquire_connection_with_timeout(timeout).await;
match result {
Ok(lease) => {
self.release_connection(lease).await;
let mut health_state = self.health_state.write().await;
health_state.last_success_at_micros = Some(checked_at_micros);
health_state.last_error = None;
SpacetimeClientHealthSnapshot {
ok: true,
stage: SpacetimeClientStage::Ready,
checked_at_micros,
elapsed_ms: duration_millis_u64(started_at.elapsed()),
timeout_ms: duration_millis_u64(timeout),
error: None,
last_success_at_micros: health_state.last_success_at_micros,
last_error: health_state.last_error.clone(),
}
}
Err(error) => {
log_spacetime_client_failure(
"health_check",
"spacetime_connection",
error.stage,
started_at,
&error.error,
);
let mut health_state = self.health_state.write().await;
let error_message = error.error.to_string();
health_state.last_error = Some(error_message.clone());
SpacetimeClientHealthSnapshot {
ok: false,
stage: error.stage,
checked_at_micros,
elapsed_ms: duration_millis_u64(started_at.elapsed()),
timeout_ms: duration_millis_u64(timeout),
error: Some(error_message),
last_success_at_micros: health_state.last_success_at_micros,
last_error: health_state.last_error.clone(),
}
}
}
}
async fn acquire_connection_with_timeout(
&self,
operation_timeout: Duration,
) -> Result<PooledConnectionLease, SpacetimeStageError> {
let permit = timeout(operation_timeout, self.pool.permits.clone().acquire_owned())
.await
.map_err(|_| {
SpacetimeStageError::new(
SpacetimeClientStage::PoolAcquire,
SpacetimeClientError::Timeout,
)
})?
.map_err(|error| {
SpacetimeStageError::new(
SpacetimeClientStage::PoolAcquire,
SpacetimeClientError::Runtime(error.to_string()),
)
})?;
loop {
for (slot_index, slot) in self.pool.slots.iter().enumerate() {
@@ -480,7 +694,7 @@ impl SpacetimeClient {
let connection = if let Some(connection) = reusable_connection {
connection
} else {
match self.build_pooled_connection().await {
match self.build_pooled_connection(operation_timeout).await {
Ok(connection) => connection,
Err(error) => {
let mut slot_guard = self.pool.slots[slot_index].lock().await;
@@ -502,7 +716,10 @@ impl SpacetimeClient {
}
}
async fn build_pooled_connection(&self) -> Result<PooledConnection, SpacetimeClientError> {
async fn build_pooled_connection(
&self,
operation_timeout: Duration,
) -> Result<PooledConnection, SpacetimeStageError> {
let config = self.config.clone();
let broken = Arc::new(AtomicBool::new(false));
let (sender, receiver) = oneshot::channel::<Result<(), SpacetimeClientError>>();
@@ -510,7 +727,7 @@ impl SpacetimeClient {
let broken_flag = broken.clone();
let disconnect_sender = connect_sender.clone();
let connection = timeout(
self.config.procedure_timeout,
operation_timeout,
tokio::task::spawn_blocking(move || {
DbConnection::builder()
.with_uri(config.server_url)
@@ -534,17 +751,41 @@ impl SpacetimeClient {
}),
)
.await
.map_err(|_| SpacetimeClientError::Timeout)?
.map_err(|error| SpacetimeClientError::Runtime(error.to_string()))??;
.map_err(|_| {
SpacetimeStageError::new(
SpacetimeClientStage::ConnectBuild,
SpacetimeClientError::Timeout,
)
})?
.map_err(|error| {
SpacetimeStageError::new(
SpacetimeClientStage::ConnectBuild,
SpacetimeClientError::Runtime(error.to_string()),
)
})?
.map_err(|error| SpacetimeStageError::new(SpacetimeClientStage::ConnectBuild, error))?;
let runner = connection.run_threaded();
timeout(self.config.procedure_timeout, receiver)
timeout(operation_timeout, receiver)
.await
.map_err(|_| SpacetimeClientError::Timeout)?
.map_err(|_| SpacetimeClientError::ConnectDropped)??;
.map_err(|_| {
SpacetimeStageError::new(
SpacetimeClientStage::ConnectHandshake,
SpacetimeClientError::Timeout,
)
})?
.map_err(|_| {
SpacetimeStageError::new(
SpacetimeClientStage::ConnectHandshake,
SpacetimeClientError::ConnectDropped,
)
})?
.map_err(|error| {
SpacetimeStageError::new(SpacetimeClientStage::ConnectHandshake, error)
})?;
let read_model_subscriptions = self
.subscribe_cached_read_models(&connection, broken.clone())
.subscribe_cached_read_models(&connection, broken.clone(), operation_timeout)
.await?;
Ok(PooledConnection {
@@ -559,7 +800,8 @@ impl SpacetimeClient {
&self,
connection: &DbConnection,
broken: Arc<AtomicBool>,
) -> Result<Vec<SubscriptionHandle>, SpacetimeClientError> {
operation_timeout: Duration,
) -> Result<Vec<SubscriptionHandle>, SpacetimeStageError> {
let mut subscriptions = Vec::new();
for query in [
"SELECT * FROM public_work_gallery_entry",
@@ -576,7 +818,13 @@ impl SpacetimeClient {
"SELECT * FROM big_fish_gallery_view",
] {
let subscription = self
.subscribe_cached_read_model_query(connection, broken.clone(), query, true)
.subscribe_cached_read_model_query(
connection,
broken.clone(),
query,
true,
operation_timeout,
)
.await?;
subscriptions.push(subscription);
}
@@ -597,7 +845,13 @@ impl SpacetimeClient {
"SELECT * FROM asset_object",
] {
if let Ok(subscription) = self
.subscribe_cached_read_model_query(connection, broken.clone(), query, false)
.subscribe_cached_read_model_query(
connection,
broken.clone(),
query,
false,
operation_timeout,
)
.await
{
subscriptions.push(subscription);
@@ -613,7 +867,8 @@ impl SpacetimeClient {
broken: Arc<AtomicBool>,
query: &'static str,
mark_broken_on_error: bool,
) -> Result<SubscriptionHandle, SpacetimeClientError> {
operation_timeout: Duration,
) -> Result<SubscriptionHandle, SpacetimeStageError> {
let (sender, receiver) = oneshot::channel::<Result<(), SpacetimeClientError>>();
let applied_sender = Arc::new(Mutex::new(Some(sender)));
let on_applied_sender = applied_sender.clone();
@@ -635,10 +890,23 @@ impl SpacetimeClient {
})
.subscribe(query);
timeout(self.config.procedure_timeout, receiver)
timeout(operation_timeout, receiver)
.await
.map_err(|_| SpacetimeClientError::Timeout)?
.map_err(|_| SpacetimeClientError::ConnectDropped)??;
.map_err(|_| {
SpacetimeStageError::new(
SpacetimeClientStage::ReadModelSubscribe,
SpacetimeClientError::Timeout,
)
})?
.map_err(|_| {
SpacetimeStageError::new(
SpacetimeClientStage::ReadModelSubscribe,
SpacetimeClientError::ConnectDropped,
)
})?
.map_err(|error| {
SpacetimeStageError::new(SpacetimeClientStage::ReadModelSubscribe, error)
})?;
Ok(subscription)
}
@@ -658,7 +926,10 @@ impl SpacetimeClient {
}
// 超时后必须统一归还租约;若连接已先一步断开则回传断线,否则标记坏连接并回传超时。
fn resolve_timeout_error(connection: Option<&PooledConnection>) -> SpacetimeClientError {
fn resolve_timeout_error(
connection: Option<&PooledConnection>,
_stage: SpacetimeClientStage,
) -> SpacetimeClientError {
if let Some(connection) = connection {
if connection.is_broken() {
return SpacetimeClientError::ConnectDropped;
@@ -681,6 +952,27 @@ fn current_public_work_day() -> i64 {
current_unix_micros().div_euclid(PUBLIC_WORK_PLAY_DAY_MICROS)
}
/// Converts `duration` to whole milliseconds, saturating at `u64::MAX` when
/// the value does not fit in 64 bits.
fn duration_millis_u64(duration: Duration) -> u64 {
    let millis = duration.as_millis();
    if millis > u128::from(u64::MAX) {
        u64::MAX
    } else {
        // Lossless: the value was just checked to fit in a u64.
        millis as u64
    }
}
fn log_spacetime_client_failure(
operation_kind: &'static str,
operation_name: &'static str,
stage: SpacetimeClientStage,
started_at: Instant,
error: &SpacetimeClientError,
) {
warn!(
operation_kind,
operation_name,
spacetime_stage = stage.as_str(),
elapsed_ms = duration_millis_u64(started_at.elapsed()),
error = %error,
"SpacetimeDB client operation failed"
);
}
fn public_work_recent_play_counts(
connection: &DbConnection,
source_type: &str,