perf(api-server): tune gallery load shedding

This commit is contained in:
kdletters
2026-05-19 01:00:33 +08:00
parent 3eb292b403
commit 8038b6a6ee
22 changed files with 1178 additions and 80 deletions

View File

@@ -1,4 +1,7 @@
use std::sync::OnceLock;
use std::{
sync::{Mutex, OnceLock},
time::Instant,
};
use opentelemetry::global;
use tracing::warn;
@@ -52,6 +55,38 @@ fn register_process_metrics_once() {
})
.build();
meter
.f64_observable_counter("process.cpu.time")
.with_unit("s")
.with_description("api-server total user plus system CPU time")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
if let Some(cpu_time_seconds) = snapshot.cpu_time_seconds {
observer.observe(cpu_time_seconds, &[]);
}
})
.build();
meter
.f64_observable_gauge("genarrative.process.cpu.usage_percent")
.with_unit("%")
.with_description("api-server process CPU usage between metric collections")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
if let Some(cpu_time_seconds) = snapshot.cpu_time_seconds {
if let Some(usage_percent) =
process_cpu_usage_percent(cpu_time_seconds, Instant::now())
{
observer.observe(usage_percent, &[]);
}
}
})
.build();
meter
.i64_observable_up_down_counter("process.thread.count")
.with_unit("{thread}")
@@ -97,11 +132,12 @@ fn to_i64(value: u64) -> i64 {
value.min(i64::MAX as u64) as i64
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[derive(Debug, Clone, Copy, PartialEq)]
struct ProcessMetricsSnapshot {
rss_bytes: u64,
private_bytes: Option<u64>,
virtual_bytes: Option<u64>,
cpu_time_seconds: Option<f64>,
thread_count: u64,
windows_handle_count: Option<u64>,
unix_fd_count: Option<u64>,
@@ -111,12 +147,56 @@ impl ProcessMetricsSnapshot {
/// Collects the current process metrics snapshot.
///
/// Delegates to `collect_process_metrics`. When that call fails, the error is
/// logged through `warn!` and then discarded, so the caller only sees `None`.
fn collect() -> Option<Self> {
collect_process_metrics()
.inspect_err(|error| {
warn!(%error, "采集 api-server 进程内存指标失败");
warn!(%error, "采集 api-server 进程指标失败");
})
.ok()
}
}
/// One CPU-time reading paired with the instant at which it was observed.
///
/// `process_cpu_usage_percent` stores the latest sample so that the next call
/// can compute a delta against it.
#[derive(Debug, Clone, Copy)]
struct CpuUsageSample {
/// CPU time, in seconds, reported for the process when the sample was taken.
cpu_time_seconds: f64,
/// Monotonic timestamp supplied by the caller for this sample.
observed_at: Instant,
}
/// Turns a CPU-time reading into a usage percentage relative to the previous call.
///
/// The latest `(cpu_time_seconds, observed_at)` pair is remembered in a
/// function-local static. Every call that manages to take the lock replaces the
/// stored pair, even when the call itself ends up returning `None`.
///
/// Returns `None` when:
/// - the sample mutex is poisoned,
/// - no earlier sample exists yet (first call),
/// - `observed_at` is earlier than the stored instant, or
/// - `cpu_usage_ratio_between_samples` rejects the deltas.
///
/// Otherwise returns the CPU/wall ratio multiplied by 100.
fn process_cpu_usage_percent(cpu_time_seconds: f64, observed_at: Instant) -> Option<f64> {
    static LAST_SAMPLE: OnceLock<Mutex<Option<CpuUsageSample>>> = OnceLock::new();

    let current = CpuUsageSample {
        cpu_time_seconds,
        observed_at,
    };

    // Swap the new sample in and take the old one out in a single step; the
    // guard is a temporary, so the lock is released at the end of this statement.
    let earlier: Option<CpuUsageSample> = LAST_SAMPLE
        .get_or_init(|| Mutex::new(None))
        .lock()
        .ok()?
        .replace(current);
    let earlier = earlier?;

    let elapsed = observed_at.checked_duration_since(earlier.observed_at)?;

    // The wall clock is expressed as "0.0 -> elapsed" so the helper sees the
    // elapsed time as its wall-clock delta.
    let ratio = cpu_usage_ratio_between_samples(
        earlier.cpu_time_seconds,
        current.cpu_time_seconds,
        0.0,
        elapsed.as_secs_f64(),
    )?;

    Some(ratio * 100.0)
}
/// Computes CPU usage as `cpu_delta / wall_delta` between two samples.
///
/// All four arguments are in seconds. A result of `1.0` means the CPU time
/// advanced exactly as fast as the wall clock; values above `1.0` are possible
/// and are returned as-is.
///
/// Returns `None` when the samples cannot produce a meaningful ratio:
/// - the CPU time went backwards (`cpu_delta < 0`),
/// - no wall time elapsed or it went backwards (`wall_delta <= 0`),
/// - either delta is NaN or infinite, or
/// - the division overflows to a non-finite value.
fn cpu_usage_ratio_between_samples(
    previous_cpu_seconds: f64,
    current_cpu_seconds: f64,
    previous_wall_seconds: f64,
    current_wall_seconds: f64,
) -> Option<f64> {
    let cpu_delta_seconds = current_cpu_seconds - previous_cpu_seconds;
    let wall_delta_seconds = current_wall_seconds - previous_wall_seconds;

    // The finiteness checks are needed because every ordered comparison with
    // NaN is false: `NaN < 0.0` alone would let a NaN delta pass as "valid".
    let deltas_are_usable = cpu_delta_seconds.is_finite()
        && wall_delta_seconds.is_finite()
        && cpu_delta_seconds >= 0.0
        && wall_delta_seconds > 0.0;
    if !deltas_are_usable {
        return None;
    }

    // A huge CPU delta over a tiny wall delta can still overflow to infinity.
    Some(cpu_delta_seconds / wall_delta_seconds).filter(|ratio| ratio.is_finite())
}
#[cfg(windows)]
fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
use windows_sys::Win32::{
@@ -149,16 +229,52 @@ fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
Some(u64::from(handle_count))
};
let cpu_time_seconds = windows_process_cpu_time_seconds(handle);
Ok(ProcessMetricsSnapshot {
rss_bytes: counters.WorkingSetSize as u64,
private_bytes: Some(counters.PrivateUsage as u64),
virtual_bytes: Some(counters.PrivateUsage as u64),
cpu_time_seconds,
thread_count: u64::from(unsafe { GetCurrentProcessId() }.thread_count()?),
windows_handle_count: handle_count,
unix_fd_count: None,
})
}
/// Reads the combined kernel + user CPU time of the process behind `handle`,
/// expressed in seconds.
///
/// Returns `None` when `GetProcessTimes` reports failure (a zero return value).
#[cfg(windows)]
fn windows_process_cpu_time_seconds(handle: windows_sys::Win32::Foundation::HANDLE) -> Option<f64> {
    use windows_sys::Win32::{Foundation::FILETIME, System::Threading::GetProcessTimes};

    // Creation and exit times are mandatory out-parameters of the API but are
    // not used by this function.
    let (mut created_at, mut exited_at) = (FILETIME::default(), FILETIME::default());
    let (mut kernel_busy, mut user_busy) = (FILETIME::default(), FILETIME::default());

    // SAFETY: the four out-pointers refer to distinct, live, writable locals for
    // the duration of the call. `handle` is assumed to be a valid process handle
    // provided by the caller.
    let status = unsafe {
        GetProcessTimes(
            handle,
            &mut created_at,
            &mut exited_at,
            &mut kernel_busy,
            &mut user_busy,
        )
    };
    if status == 0 {
        return None;
    }

    // The FILETIME values are treated as counts of 100 ns units, so dividing by
    // 10_000_000 yields seconds.
    let busy_100ns = filetime_100ns(kernel_busy) + filetime_100ns(user_busy);
    Some(busy_100ns as f64 / 10_000_000.0)
}
/// Joins the high and low 32-bit halves of a `FILETIME` into a single `u64`.
#[cfg(windows)]
fn filetime_100ns(filetime: windows_sys::Win32::Foundation::FILETIME) -> u64 {
    let high_half = u64::from(filetime.dwHighDateTime);
    let low_half = u64::from(filetime.dwLowDateTime);
    (high_half << 32) | low_half
}
#[cfg(windows)]
trait WindowsProcessThreadCount {
fn thread_count(self) -> Result<u32, String>;
@@ -207,6 +323,8 @@ fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
.map_err(|error| format!("read /proc/self/status failed: {error}"))?;
let statm = std::fs::read_to_string("/proc/self/statm")
.map_err(|error| format!("read /proc/self/statm failed: {error}"))?;
let stat = std::fs::read_to_string("/proc/self/stat")
.map_err(|error| format!("read /proc/self/stat failed: {error}"))?;
let page_size = linux_page_size_bytes()?;
let rss_bytes = parse_status_kb(&status, "VmRSS:")
@@ -218,6 +336,7 @@ fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
.or_else(|| parse_statm_pages(&statm, 0).map(|value| value * page_size))
.ok_or_else(|| "missing VmSize/statm size field".to_string())?;
let private_bytes = parse_status_kb(&status, "VmData:").map(|value| value * 1024);
let cpu_time_seconds = linux_cpu_time_seconds(&stat)?;
let thread_count = parse_status_u64(&status, "Threads:")
.ok_or_else(|| "missing Threads field".to_string())?;
@@ -225,12 +344,52 @@ fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
rss_bytes,
private_bytes,
virtual_bytes: Some(virtual_bytes),
cpu_time_seconds: Some(cpu_time_seconds),
thread_count,
windows_handle_count: None,
unix_fd_count: linux_fd_count(),
})
}
/// Converts the contents of `/proc/self/stat` into total CPU seconds.
///
/// The tick count comes from `parse_linux_proc_stat_cpu_ticks` and is divided
/// by the value of `linux_clock_ticks_per_second`.
///
/// # Errors
/// Returns an error string when the utime/stime fields cannot be parsed, or
/// when the clock tick rate cannot be determined.
#[cfg(target_os = "linux")]
fn linux_cpu_time_seconds(stat: &str) -> Result<f64, String> {
    let Some(total_ticks) = parse_linux_proc_stat_cpu_ticks(stat) else {
        return Err("missing /proc/self/stat utime/stime fields".to_string());
    };
    linux_clock_ticks_per_second().map(|tick_rate| total_ticks as f64 / tick_rate as f64)
}
/// Returns the number of clock ticks per second as reported by `getconf CLK_TCK`.
///
/// The command is run at most once per process; its outcome, whether success
/// or failure, is cached in a static and cloned out on every call.
///
/// # Errors
/// Returns an error string when `getconf` cannot be spawned, exits
/// unsuccessfully, prints non-UTF-8 output, or prints something that does not
/// parse as `u64`.
#[cfg(target_os = "linux")]
fn linux_clock_ticks_per_second() -> Result<u64, String> {
    static CLOCK_TICKS_PER_SECOND: OnceLock<Result<u64, String>> = OnceLock::new();

    /// Runs `getconf CLK_TCK` once and parses its standard output.
    fn query_getconf() -> Result<u64, String> {
        let finished = std::process::Command::new("getconf")
            .arg("CLK_TCK")
            .output()
            .map_err(|error| format!("getconf CLK_TCK failed: {error}"))?;

        let exit_status = finished.status;
        if !exit_status.success() {
            return Err(format!("getconf CLK_TCK exited with {}", exit_status));
        }

        String::from_utf8(finished.stdout)
            .map_err(|error| format!("getconf CLK_TCK output is not utf8: {error}"))?
            .trim()
            .parse::<u64>()
            .map_err(|error| format!("parse CLK_TCK failed: {error}"))
    }

    CLOCK_TICKS_PER_SECOND.get_or_init(query_getconf).clone()
}
/// Extracts `utime + stime` (in clock ticks) from a `/proc/<pid>/stat` line.
///
/// The process name (`comm`) is wrapped in parentheses and may itself contain
/// spaces or parentheses, so parsing starts after the *last* `") "` in the
/// line. Counting from there, the state character is index 0, `utime` is
/// index 11 and `stime` is index 12.
///
/// Returns `None` when the line has no `") "` separator, has too few fields,
/// either field is not a valid `u64`, or the sum does not fit in `u64`.
#[cfg(target_os = "linux")]
fn parse_linux_proc_stat_cpu_ticks(stat: &str) -> Option<u64> {
    let (_, fields_after_comm) = stat.rsplit_once(") ")?;
    // Skip state..cmajflt (11 fields) so the next two items are utime and stime.
    let mut fields = fields_after_comm.split_whitespace().skip(11);
    let utime = fields.next()?.parse::<u64>().ok()?;
    let stime = fields.next()?.parse::<u64>().ok()?;
    // Checked so that absurd input yields `None` instead of a debug-build
    // panic or a silently wrapped value in release builds.
    utime.checked_add(stime)
}
#[cfg(target_os = "linux")]
fn linux_page_size_bytes() -> Result<u64, String> {
let output = std::process::Command::new("getconf")
@@ -282,8 +441,12 @@ fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
#[cfg(test)]
mod tests {
use super::cpu_usage_ratio_between_samples;
#[cfg(target_os = "linux")]
use super::{parse_statm_pages, parse_status_kb, parse_status_u64};
use super::{
parse_linux_proc_stat_cpu_ticks, parse_statm_pages, parse_status_kb, parse_status_u64,
};
#[cfg(target_os = "linux")]
#[test]
@@ -303,4 +466,28 @@ mod tests {
assert_eq!(parse_statm_pages("100 20 0 0 0 0 0", 1), Some(20));
assert_eq!(parse_statm_pages("100 20", 7), None);
}
#[cfg(target_os = "linux")]
#[test]
fn parses_linux_proc_stat_cpu_ticks_with_space_in_process_name() {
    // The comm field "(api server)" contains a space; utime (120) and
    // stime (30) must still be located and summed.
    let stat_line = "123 (api server) S 1 2 3 4 5 6 7 8 9 10 120 30 0 0 20 0 18 0 12345";
    let parsed_ticks = parse_linux_proc_stat_cpu_ticks(stat_line);
    assert_eq!(parsed_ticks, Some(150));
}
#[test]
fn cpu_usage_ratio_uses_cpu_time_delta_over_wall_time() {
    // Columns: previous cpu, current cpu, previous wall, current wall, expected.
    let cases = [
        // 2.5 CPU-seconds over 1 wall-second.
        (10.0, 12.5, 100.0, 101.0, Some(2.5)),
        // CPU time went backwards.
        (10.0, 9.0, 100.0, 101.0, None),
        // No wall time elapsed.
        (10.0, 11.0, 100.0, 100.0, None),
    ];

    for (previous_cpu, current_cpu, previous_wall, current_wall, expected) in cases {
        assert_eq!(
            cpu_usage_ratio_between_samples(previous_cpu, current_cpu, previous_wall, current_wall),
            expected
        );
    }
}
}