// Source file: Genarrative/server-rs/crates/api-server/src/process_metrics.rs
// (Rust; listed as 307 lines, 10 KiB in the original file browser view.)

use std::sync::OnceLock;
use opentelemetry::global;
use tracing::warn;
// 进程指标只描述 api-server 自身,不携带请求、用户或作品维度,避免 OTLP 指标高基数膨胀。
pub(crate) fn register_process_metrics() {
static REGISTERED: OnceLock<()> = OnceLock::new();
REGISTERED.get_or_init(register_process_metrics_once);
}
fn register_process_metrics_once() {
let meter = global::meter("genarrative-api");
meter
.i64_observable_up_down_counter("process.memory.usage")
.with_unit("By")
.with_description("api-server process physical memory usage")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
observer.observe(to_i64(snapshot.rss_bytes), &[]);
})
.build();
meter
.i64_observable_up_down_counter("process.memory.virtual")
.with_unit("By")
.with_description("api-server committed virtual memory")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
if let Some(virtual_bytes) = snapshot.virtual_bytes {
observer.observe(to_i64(virtual_bytes), &[]);
}
})
.build();
meter
.i64_observable_up_down_counter("genarrative.process.memory.private")
.with_unit("By")
.with_description("api-server private memory for local diagnostics")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
if let Some(private_bytes) = snapshot.private_bytes {
observer.observe(to_i64(private_bytes), &[]);
}
})
.build();
meter
.i64_observable_up_down_counter("process.thread.count")
.with_unit("{thread}")
.with_description("api-server process thread count")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
observer.observe(to_i64(snapshot.thread_count), &[]);
})
.build();
meter
.i64_observable_up_down_counter("process.windows.handle.count")
.with_unit("{handle}")
.with_description("api-server process handle count on Windows")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
if let Some(handle_count) = snapshot.windows_handle_count {
observer.observe(to_i64(handle_count), &[]);
}
})
.build();
meter
.i64_observable_up_down_counter("process.unix.file_descriptor.count")
.with_unit("{file_descriptor}")
.with_description("api-server process file descriptor count on Unix")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
if let Some(fd_count) = snapshot.unix_fd_count {
observer.observe(to_i64(fd_count), &[]);
}
})
.build();
}
/// Converts an unsigned reading to the signed type the instruments record.
///
/// Values above `i64::MAX` saturate to `i64::MAX` instead of wrapping to a negative number.
fn to_i64(value: u64) -> i64 {
    match i64::try_from(value) {
        Ok(signed) => signed,
        Err(_) => i64::MAX,
    }
}
/// One point-in-time reading of the api-server process's resource usage.
///
/// Fields that only exist on some platforms are `Option`s and are `None` where the platform
/// collector does not provide them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ProcessMetricsSnapshot {
/// Physical memory in bytes (`WorkingSetSize` on Windows, `VmRSS` or the statm resident field on Linux).
rss_bytes: u64,
/// Private memory in bytes (`PrivateUsage` on Windows, `VmData` on Linux when present).
private_bytes: Option<u64>,
/// Virtual memory in bytes (`PrivateUsage` on Windows, `VmSize` or the statm size field on Linux).
virtual_bytes: Option<u64>,
/// Number of threads in the process.
thread_count: u64,
/// Handle count reported by `GetProcessHandleCount`; `None` when that call fails or off Windows.
windows_handle_count: Option<u64>,
/// Number of entries in `/proc/self/fd`; `None` when the directory cannot be read or on Windows.
unix_fd_count: Option<u64>,
}
impl ProcessMetricsSnapshot {
    /// Collects a snapshot through the platform-specific `collect_process_metrics`.
    ///
    /// A collection failure is logged as a warning and reported to the caller as `None`.
    fn collect() -> Option<Self> {
        match collect_process_metrics() {
            Ok(snapshot) => Some(snapshot),
            Err(error) => {
                warn!(%error, "采集 api-server 进程内存指标失败");
                None
            }
        }
    }
}
/// Windows collector: memory counters, handle count, and thread count of the current process.
///
/// Returns `Err` when `GetProcessMemoryInfo` fails or the thread count cannot be determined.
/// A failing `GetProcessHandleCount` only leaves `windows_handle_count` empty.
#[cfg(windows)]
fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
    use windows_sys::Win32::System::{
        ProcessStatus::{GetProcessMemoryInfo, PROCESS_MEMORY_COUNTERS_EX},
        Threading::{GetCurrentProcess, GetCurrentProcessId, GetProcessHandleCount},
    };

    // SAFETY: the call takes no arguments and has no preconditions.
    let process = unsafe { GetCurrentProcess() };

    let counters_size = std::mem::size_of::<PROCESS_MEMORY_COUNTERS_EX>() as u32;
    let mut memory = PROCESS_MEMORY_COUNTERS_EX {
        cb: counters_size,
        ..Default::default()
    };
    // SAFETY: the pointer refers to `memory`, a live local of exactly `counters_size` bytes.
    let memory_ok = unsafe {
        GetProcessMemoryInfo(
            process,
            std::ptr::addr_of_mut!(memory).cast(),
            counters_size,
        )
    };
    if memory_ok == 0 {
        return Err("GetProcessMemoryInfo returned false".to_string());
    }

    let mut raw_handle_count = 0_u32;
    // SAFETY: the out-pointer refers to a live, writable local `u32`.
    let handles_ok = unsafe { GetProcessHandleCount(process, &mut raw_handle_count) } != 0;
    let windows_handle_count = handles_ok.then(|| u64::from(raw_handle_count));

    // SAFETY: the call takes no arguments and has no preconditions.
    let process_id = unsafe { GetCurrentProcessId() };
    let thread_count = u64::from(process_id.thread_count()?);

    // `PrivateUsage` feeds both the private and the virtual reading.
    let private_usage = memory.PrivateUsage as u64;
    Ok(ProcessMetricsSnapshot {
        rss_bytes: memory.WorkingSetSize as u64,
        private_bytes: Some(private_usage),
        virtual_bytes: Some(private_usage),
        thread_count,
        windows_handle_count,
        unix_fd_count: None,
    })
}
/// Extension trait that lets a Windows process id be asked for that process's thread count.
#[cfg(windows)]
trait WindowsProcessThreadCount {
/// Returns the number of threads of the process identified by `self`, or a description of why
/// it could not be determined.
fn thread_count(self) -> Result<u32, String>;
}
#[cfg(windows)]
impl WindowsProcessThreadCount for u32 {
    /// Treats `self` as a process id and looks up its thread count in a ToolHelp process
    /// snapshot.
    ///
    /// Fails when the snapshot cannot be created or contains no entry with this process id.
    /// The snapshot handle is closed on every path that created it.
    fn thread_count(self) -> Result<u32, String> {
        use windows_sys::Win32::{
            Foundation::{CloseHandle, INVALID_HANDLE_VALUE},
            System::Diagnostics::ToolHelp::{
                CreateToolhelp32Snapshot, PROCESSENTRY32, Process32First, Process32Next,
                TH32CS_SNAPPROCESS,
            },
        };

        // SAFETY: plain value arguments; the returned handle is validated before use.
        let toolhelp = unsafe { CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0) };
        if toolhelp == INVALID_HANDLE_VALUE {
            return Err("CreateToolhelp32Snapshot returned INVALID_HANDLE_VALUE".to_string());
        }

        let mut process = PROCESSENTRY32 {
            dwSize: std::mem::size_of::<PROCESSENTRY32>() as u32,
            ..Default::default()
        };
        let mut threads = None;

        // SAFETY: `toolhelp` is a valid snapshot handle and `process` is a live entry whose
        // `dwSize` has been initialised.
        let mut has_entry = unsafe { Process32First(toolhelp, &mut process) } != 0;
        while has_entry {
            if process.th32ProcessID == self {
                threads = Some(process.cntThreads);
                break;
            }
            // SAFETY: same handle and entry as for `Process32First` above.
            has_entry = unsafe { Process32Next(toolhelp, &mut process) } != 0;
        }

        // SAFETY: `toolhelp` was created above and has not been closed yet.
        unsafe {
            CloseHandle(toolhelp);
        }

        match threads {
            Some(count) => Ok(count),
            None => Err(format!("process {self} not found in ToolHelp snapshot")),
        }
    }
}
/// Linux collector: reads memory, thread, and file-descriptor figures from `/proc/self`.
///
/// `/proc/self/status` is the primary source. `/proc/self/statm` and the page size are consulted
/// only when `VmRSS` or `VmSize` is missing from `status`, so the usual path neither reads a
/// second file nor depends on the page-size lookup succeeding.
///
/// # Errors
///
/// Returns a description of the failure when `/proc/self/status` cannot be read, when neither
/// source yields the resident or virtual size, or when the `Threads` field is missing.
#[cfg(target_os = "linux")]
fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
    let status = std::fs::read_to_string("/proc/self/status")
        .map_err(|error| format!("read /proc/self/status failed: {error}"))?;

    // Fallback source: the `index`-th statm field, converted from pages to bytes.
    let statm_bytes = |index: usize| -> Result<u64, String> {
        let statm = std::fs::read_to_string("/proc/self/statm")
            .map_err(|error| format!("read /proc/self/statm failed: {error}"))?;
        let pages = parse_statm_pages(&statm, index)
            .ok_or_else(|| format!("missing /proc/self/statm field {index}"))?;
        Ok(pages.saturating_mul(linux_page_size_bytes()?))
    };
    // `status` reports memory in kB; saturate rather than overflow on a nonsensical value.
    let kib_to_bytes = |kib: u64| kib.saturating_mul(1024);

    let rss_bytes = match parse_status_kb(&status, "VmRSS:") {
        Some(kib) => kib_to_bytes(kib),
        None => statm_bytes(1)
            .map_err(|error| format!("missing VmRSS/statm resident field: {error}"))?,
    };
    let virtual_bytes = match parse_status_kb(&status, "VmSize:") {
        Some(kib) => kib_to_bytes(kib),
        None => statm_bytes(0)
            .map_err(|error| format!("missing VmSize/statm size field: {error}"))?,
    };
    let private_bytes = parse_status_kb(&status, "VmData:").map(kib_to_bytes);
    let thread_count = parse_status_u64(&status, "Threads:")
        .ok_or_else(|| "missing Threads field".to_string())?;

    Ok(ProcessMetricsSnapshot {
        rss_bytes,
        private_bytes,
        virtual_bytes: Some(virtual_bytes),
        thread_count,
        windows_handle_count: None,
        unix_fd_count: linux_fd_count(),
    })
}
/// Returns the memory page size in bytes, as reported by `getconf PAGESIZE`.
///
/// The first successful result is cached, so the `getconf` subprocess is spawned at most once
/// per successful lookup instead of on every call. Failures are not cached, so a later call
/// retries.
///
/// # Errors
///
/// Returns a description of the failure when `getconf` cannot be run, exits unsuccessfully, or
/// prints something that is not a UTF-8 unsigned integer.
#[cfg(target_os = "linux")]
fn linux_page_size_bytes() -> Result<u64, String> {
    static PAGE_SIZE: OnceLock<u64> = OnceLock::new();
    if let Some(&cached) = PAGE_SIZE.get() {
        return Ok(cached);
    }

    let output = std::process::Command::new("getconf")
        .arg("PAGESIZE")
        .output()
        .map_err(|error| format!("getconf PAGESIZE failed: {error}"))?;
    if !output.status.success() {
        return Err(format!("getconf PAGESIZE exited with {}", output.status));
    }
    let text = String::from_utf8(output.stdout)
        .map_err(|error| format!("getconf PAGESIZE output is not utf8: {error}"))?;
    let page_size = text
        .trim()
        .parse::<u64>()
        .map_err(|error| format!("parse PAGESIZE failed: {error}"))?;

    // If another thread raced us here, keep whichever value was stored first.
    Ok(*PAGE_SIZE.get_or_init(|| page_size))
}
/// Counts the readable entries of `/proc/self/fd`.
///
/// Returns `None` when the directory cannot be opened. Entries that fail to read are skipped.
#[cfg(target_os = "linux")]
fn linux_fd_count() -> Option<u64> {
    let Ok(descriptors) = std::fs::read_dir("/proc/self/fd") else {
        return None;
    };
    let open = descriptors.filter(|entry| entry.is_ok()).count();
    Some(open as u64)
}
/// Reads the numeric value of a `/proc/self/status` field that callers treat as a kB quantity.
///
/// This delegates to `parse_status_u64` and returns the number as written; it neither checks
/// the unit suffix nor converts to bytes, so callers multiply by 1024 themselves.
#[cfg(target_os = "linux")]
fn parse_status_kb(status: &str, key: &str) -> Option<u64> {
parse_status_u64(status, key)
}
/// Finds the first line of `status` that starts with `key` and whose first whitespace-separated
/// token after the key parses as a `u64`, and returns that number.
///
/// Lines that start with `key` but carry no parsable number are skipped, not treated as an
/// error. Returns `None` when no line qualifies.
#[cfg(target_os = "linux")]
fn parse_status_u64(status: &str, key: &str) -> Option<u64> {
    for line in status.lines() {
        let Some(remainder) = line.strip_prefix(key) else {
            continue;
        };
        let parsed = remainder
            .split_whitespace()
            .next()
            .and_then(|token| token.parse::<u64>().ok());
        if parsed.is_some() {
            return parsed;
        }
    }
    None
}
/// Returns the `index`-th (zero-based) whitespace-separated field of a `/proc/self/statm` line
/// as a page count, or `None` when the field is absent or not a `u64`.
#[cfg(target_os = "linux")]
fn parse_statm_pages(statm: &str, index: usize) -> Option<u64> {
    let field = statm.split_whitespace().nth(index)?;
    match field.parse::<u64>() {
        Ok(pages) => Some(pages),
        Err(_) => None,
    }
}
/// Collector for platforms other than Windows and Linux: always reports that process metrics
/// are not implemented there.
#[cfg(not(any(windows, target_os = "linux")))]
fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
    let reason = "process metrics are only implemented for Windows and Linux";
    Err(String::from(reason))
}
#[cfg(test)]
mod tests {
    #[cfg(target_os = "linux")]
    use super::{parse_statm_pages, parse_status_kb, parse_status_u64};

    // The kB-suffixed memory fields and the plain thread count are all read from a
    // `/proc/self/status`-shaped sample.
    #[cfg(target_os = "linux")]
    #[test]
    fn parses_linux_proc_status_memory_fields() {
        let sample = "Name:\tapi-server\nVmSize:\t 123456 kB\nVmRSS:\t 7890 kB\nVmData:\t 3456 kB\nThreads:\t37\n";

        let memory_fields = [("VmRSS:", 7890), ("VmSize:", 123456), ("VmData:", 3456)];
        for (key, expected_kb) in memory_fields {
            assert_eq!(parse_status_kb(sample, key), Some(expected_kb), "field {key}");
        }
        assert_eq!(parse_status_u64(sample, "Threads:"), Some(37));
    }

    // Fields are addressed by zero-based position; an out-of-range position yields `None`.
    #[cfg(target_os = "linux")]
    #[test]
    fn parses_linux_statm_pages() {
        let line = "100 20 0 0 0 0 0";
        assert_eq!(parse_statm_pages(line, 0), Some(100));
        assert_eq!(parse_statm_pages(line, 1), Some(20));
        assert_eq!(parse_statm_pages("100 20", 7), None);
    }
}