feat: add graceful api shutdown readiness

This commit is contained in:
2026-06-05 11:43:56 +08:00
parent 9ab353926e
commit e5592304a5
19 changed files with 321 additions and 35 deletions

View File

@@ -99,25 +99,35 @@ use shared_logging::{OtelConfig, init_tracing};
use socket2::{Domain, Protocol, Socket, Type};
use std::{
collections::HashSet,
env, fs, io,
env, fs, future, io,
net::{SocketAddr, TcpListener as StdTcpListener},
panic, thread,
panic,
sync::Arc,
thread,
time::Duration,
};
use tokio::net::TcpListener;
use tokio::runtime::Builder as TokioRuntimeBuilder;
use tokio::time::timeout;
use tracing::{error, info};
use tracing::{error, info, warn};
use crate::{
app::{build_router, build_spacetime_unavailable_router},
config::AppConfig,
state::{AppState, AppStateInitError},
tracking_outbox::TrackingOutbox,
};
const API_SERVER_STARTUP_STACK_SIZE_BYTES: usize = 32 * 1024 * 1024;
const AUTH_STORE_STARTUP_RESTORE_TIMEOUT: Duration = Duration::from_secs(8);
/// State carried from server startup into the shutdown path.
///
/// `run_server` builds one instance, hands a clone to the graceful-shutdown
/// signal future and passes the original to `finalize_shutdown` once
/// `axum::serve` has returned.
#[derive(Clone)]
struct ShutdownContext {
/// Application state whose readiness is switched off during shutdown.
/// `None` when the server started with the dependency-unavailable router.
app_state: Option<AppState>,
/// Tracking outbox obtained from the application state, if it has one;
/// it is flushed during shutdown finalization.
tracking_outbox: Option<Arc<TrackingOutbox>>,
/// Upper bound on the shutdown flush of the tracking outbox, taken from
/// `config.shutdown_outbox_flush_timeout`. A zero duration skips the flush.
outbox_flush_timeout: Duration,
}
fn main() -> Result<(), io::Error> {
// Windows 本地调试下 Axum 路由树和启动恢复链较重,显式放大启动线程栈,避免 debug 构建在进入监听前栈溢出。
let server_thread = thread::Builder::new()
@@ -158,19 +168,33 @@ async fn run_server(config: AppConfig) -> Result<(), io::Error> {
let listen_backlog = config.listen_backlog;
let worker_threads = config.worker_threads;
let otel_enabled = config.otel_enabled;
let outbox_flush_timeout = config.shutdown_outbox_flush_timeout;
let listener = build_tcp_listener(bind_address, listen_backlog)?;
let router = match restore_app_state_for_startup(config).await {
let (router, shutdown_context) = match restore_app_state_for_startup(config).await {
Ok(state) => {
state.puzzle_gallery_cache().spawn_cleanup_task();
if let Some(outbox) = state.tracking_outbox() {
let tracking_outbox = state.tracking_outbox();
if let Some(outbox) = tracking_outbox.clone() {
outbox.spawn_worker();
}
build_router(state)
}
Err(AppStateInitError::DependencyUnavailable(message)) => {
build_spacetime_unavailable_router(message)
(
build_router(state.clone()),
ShutdownContext {
app_state: Some(state),
tracking_outbox,
outbox_flush_timeout,
},
)
}
Err(AppStateInitError::DependencyUnavailable(message)) => (
build_spacetime_unavailable_router(message),
ShutdownContext {
app_state: None,
tracking_outbox: None,
outbox_flush_timeout,
},
),
Err(error) => {
return Err(std::io::Error::other(format!(
"初始化应用状态失败:{error}"
@@ -186,7 +210,98 @@ async fn run_server(config: AppConfig) -> Result<(), io::Error> {
"api-server 已完成 tracing 初始化并开始监听"
);
axum::serve(listener, router).await
let result = axum::serve(listener, router)
.with_graceful_shutdown(shutdown_signal(shutdown_context.clone()))
.await;
finalize_shutdown(shutdown_context).await;
result
}
/// Future handed to axum's graceful shutdown hook.
///
/// Resolves after a shutdown signal has been received. Before resolving it
/// marks the application state (when one exists) as not ready and logs which
/// signal triggered the shutdown.
async fn shutdown_signal(context: ShutdownContext) {
    let received = wait_for_shutdown_signal().await;

    // Flip readiness off before the server starts draining in-flight requests.
    if let Some(state) = &context.app_state {
        state.mark_not_ready();
    }

    info!(
        signal = received,
        "api-server 收到退出信号,已标记 readiness 不可用并开始排空 HTTP 请求"
    );
}
/// Waits for the first shutdown signal and returns its label.
///
/// On unix targets Ctrl-C and SIGTERM are raced against each other; on every
/// other target only Ctrl-C is awaited.
async fn wait_for_shutdown_signal() -> &'static str {
    #[cfg(unix)]
    let received = tokio::select! {
        label = wait_for_ctrl_c_signal() => label,
        label = wait_for_sigterm_signal() => label,
    };

    #[cfg(not(unix))]
    let received = wait_for_ctrl_c_signal().await;

    received
}
async fn wait_for_ctrl_c_signal() -> &'static str {
if let Err(error) = tokio::signal::ctrl_c().await {
error!(error = %error, "监听 SIGINT 失败,无法通过 Ctrl-C 触发优雅退出");
future::pending::<()>().await;
}
"sigint"
}
#[cfg(unix)]
async fn wait_for_sigterm_signal() -> &'static str {
let mut signal = match tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())
{
Ok(signal) => signal,
Err(error) => {
error!(error = %error, "监听 SIGTERM 失败,无法通过 systemd terminate 触发优雅退出");
future::pending::<()>().await;
unreachable!("pending future never returns");
}
};
signal.recv().await;
"sigterm"
}
async fn finalize_shutdown(context: ShutdownContext) {
if let Some(state) = context.app_state.as_ref() {
state.mark_not_ready();
}
let Some(outbox) = context.tracking_outbox else {
return;
};
if context.outbox_flush_timeout.is_zero() {
warn!("api-server 退出时 tracking outbox flush timeout 为 0跳过主动 flush");
return;
}
let timeout_ms = context
.outbox_flush_timeout
.as_millis()
.min(u128::from(u64::MAX)) as u64;
info!(timeout_ms, "api-server 退出前封存并 flush tracking outbox");
match timeout(context.outbox_flush_timeout, outbox.flush_for_shutdown()).await {
Ok(Ok(())) => {
info!("api-server 退出前 tracking outbox flush 完成");
}
Ok(Err(error)) => {
warn!(
error = %error,
"api-server 退出前 tracking outbox flush 未完成,已保留本地文件等待下次启动重试"
);
}
Err(_) => {
warn!(
timeout_ms,
"api-server 退出前 tracking outbox flush 超时,已保留本地文件等待下次启动重试"
);
}
}
}
fn build_tcp_listener(