feat: add graceful api shutdown readiness
This commit is contained in:
@@ -99,25 +99,35 @@ use shared_logging::{OtelConfig, init_tracing};
|
||||
use socket2::{Domain, Protocol, Socket, Type};
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
env, fs, io,
|
||||
env, fs, future, io,
|
||||
net::{SocketAddr, TcpListener as StdTcpListener},
|
||||
panic, thread,
|
||||
panic,
|
||||
sync::Arc,
|
||||
thread,
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::net::TcpListener;
|
||||
use tokio::runtime::Builder as TokioRuntimeBuilder;
|
||||
use tokio::time::timeout;
|
||||
use tracing::{error, info};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use crate::{
|
||||
app::{build_router, build_spacetime_unavailable_router},
|
||||
config::AppConfig,
|
||||
state::{AppState, AppStateInitError},
|
||||
tracking_outbox::TrackingOutbox,
|
||||
};
|
||||
|
||||
// 32 MiB expressed in bytes. NOTE(review): presumably the stack size given to the dedicated
// startup thread that `main` creates through `thread::Builder` — confirm at the use site.
const API_SERVER_STARTUP_STACK_SIZE_BYTES: usize = 32 * 1024 * 1024;
|
||||
// Eight-second limit. NOTE(review): by its name this bounds the auth-store restore performed
// during startup; the use site is not visible here — confirm.
const AUTH_STORE_STARTUP_RESTORE_TIMEOUT: Duration = Duration::from_secs(8);
|
||||
|
||||
#[derive(Clone)]
|
||||
struct ShutdownContext {
|
||||
app_state: Option<AppState>,
|
||||
tracking_outbox: Option<Arc<TrackingOutbox>>,
|
||||
outbox_flush_timeout: Duration,
|
||||
}
|
||||
|
||||
fn main() -> Result<(), io::Error> {
|
||||
// Windows 本地调试下 Axum 路由树和启动恢复链较重,显式放大启动线程栈,避免 debug 构建在进入监听前栈溢出。
|
||||
let server_thread = thread::Builder::new()
|
||||
@@ -158,19 +168,33 @@ async fn run_server(config: AppConfig) -> Result<(), io::Error> {
|
||||
let listen_backlog = config.listen_backlog;
|
||||
let worker_threads = config.worker_threads;
|
||||
let otel_enabled = config.otel_enabled;
|
||||
let outbox_flush_timeout = config.shutdown_outbox_flush_timeout;
|
||||
let listener = build_tcp_listener(bind_address, listen_backlog)?;
|
||||
|
||||
let router = match restore_app_state_for_startup(config).await {
|
||||
let (router, shutdown_context) = match restore_app_state_for_startup(config).await {
|
||||
Ok(state) => {
|
||||
state.puzzle_gallery_cache().spawn_cleanup_task();
|
||||
if let Some(outbox) = state.tracking_outbox() {
|
||||
let tracking_outbox = state.tracking_outbox();
|
||||
if let Some(outbox) = tracking_outbox.clone() {
|
||||
outbox.spawn_worker();
|
||||
}
|
||||
build_router(state)
|
||||
}
|
||||
Err(AppStateInitError::DependencyUnavailable(message)) => {
|
||||
build_spacetime_unavailable_router(message)
|
||||
(
|
||||
build_router(state.clone()),
|
||||
ShutdownContext {
|
||||
app_state: Some(state),
|
||||
tracking_outbox,
|
||||
outbox_flush_timeout,
|
||||
},
|
||||
)
|
||||
}
|
||||
Err(AppStateInitError::DependencyUnavailable(message)) => (
|
||||
build_spacetime_unavailable_router(message),
|
||||
ShutdownContext {
|
||||
app_state: None,
|
||||
tracking_outbox: None,
|
||||
outbox_flush_timeout,
|
||||
},
|
||||
),
|
||||
Err(error) => {
|
||||
return Err(std::io::Error::other(format!(
|
||||
"初始化应用状态失败:{error}"
|
||||
@@ -186,7 +210,98 @@ async fn run_server(config: AppConfig) -> Result<(), io::Error> {
|
||||
"api-server 已完成 tracing 初始化并开始监听"
|
||||
);
|
||||
|
||||
axum::serve(listener, router).await
|
||||
let result = axum::serve(listener, router)
|
||||
.with_graceful_shutdown(shutdown_signal(shutdown_context.clone()))
|
||||
.await;
|
||||
finalize_shutdown(shutdown_context).await;
|
||||
result
|
||||
}
|
||||
|
||||
async fn shutdown_signal(context: ShutdownContext) {
|
||||
let signal = wait_for_shutdown_signal().await;
|
||||
if let Some(state) = context.app_state.as_ref() {
|
||||
state.mark_not_ready();
|
||||
}
|
||||
info!(
|
||||
signal,
|
||||
"api-server 收到退出信号,已标记 readiness 不可用并开始排空 HTTP 请求"
|
||||
);
|
||||
}
|
||||
|
||||
async fn wait_for_shutdown_signal() -> &'static str {
|
||||
#[cfg(unix)]
|
||||
{
|
||||
tokio::select! {
|
||||
signal = wait_for_ctrl_c_signal() => signal,
|
||||
signal = wait_for_sigterm_signal() => signal,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(unix))]
|
||||
{
|
||||
wait_for_ctrl_c_signal().await
|
||||
}
|
||||
}
|
||||
|
||||
async fn wait_for_ctrl_c_signal() -> &'static str {
|
||||
if let Err(error) = tokio::signal::ctrl_c().await {
|
||||
error!(error = %error, "监听 SIGINT 失败,无法通过 Ctrl-C 触发优雅退出");
|
||||
future::pending::<()>().await;
|
||||
}
|
||||
"sigint"
|
||||
}
|
||||
|
||||
#[cfg(unix)]
|
||||
async fn wait_for_sigterm_signal() -> &'static str {
|
||||
let mut signal = match tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())
|
||||
{
|
||||
Ok(signal) => signal,
|
||||
Err(error) => {
|
||||
error!(error = %error, "监听 SIGTERM 失败,无法通过 systemd terminate 触发优雅退出");
|
||||
future::pending::<()>().await;
|
||||
unreachable!("pending future never returns");
|
||||
}
|
||||
};
|
||||
signal.recv().await;
|
||||
"sigterm"
|
||||
}
|
||||
|
||||
async fn finalize_shutdown(context: ShutdownContext) {
|
||||
if let Some(state) = context.app_state.as_ref() {
|
||||
state.mark_not_ready();
|
||||
}
|
||||
|
||||
let Some(outbox) = context.tracking_outbox else {
|
||||
return;
|
||||
};
|
||||
|
||||
if context.outbox_flush_timeout.is_zero() {
|
||||
warn!("api-server 退出时 tracking outbox flush timeout 为 0,跳过主动 flush");
|
||||
return;
|
||||
}
|
||||
|
||||
let timeout_ms = context
|
||||
.outbox_flush_timeout
|
||||
.as_millis()
|
||||
.min(u128::from(u64::MAX)) as u64;
|
||||
info!(timeout_ms, "api-server 退出前封存并 flush tracking outbox");
|
||||
match timeout(context.outbox_flush_timeout, outbox.flush_for_shutdown()).await {
|
||||
Ok(Ok(())) => {
|
||||
info!("api-server 退出前 tracking outbox flush 完成");
|
||||
}
|
||||
Ok(Err(error)) => {
|
||||
warn!(
|
||||
error = %error,
|
||||
"api-server 退出前 tracking outbox flush 未完成,已保留本地文件等待下次启动重试"
|
||||
);
|
||||
}
|
||||
Err(_) => {
|
||||
warn!(
|
||||
timeout_ms,
|
||||
"api-server 退出前 tracking outbox flush 超时,已保留本地文件等待下次启动重试"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn build_tcp_listener(
|
||||
|
||||
Reference in New Issue
Block a user