feat(api-server): add container loadtest observability

This commit is contained in:
kdletters
2026-05-17 20:52:15 +08:00
parent 73f937d78a
commit 5a4a8a4892
36 changed files with 1325 additions and 30 deletions

View File

@@ -58,6 +58,9 @@ urlencoding = { workspace = true }
uuid = { workspace = true, features = ["v4"] }
zip = { workspace = true, features = ["deflate"] }
[target.'cfg(windows)'.dependencies]
windows-sys = { workspace = true, features = ["Win32_Foundation", "Win32_System_Diagnostics_ToolHelp", "Win32_System_ProcessStatus", "Win32_System_Threading"] }
[dev-dependencies]
base64 = { workspace = true }
hmac = { workspace = true }

View File

@@ -1,4 +1,13 @@
use axum::Json;
use std::convert::Infallible;
use axum::{
Json,
body::Body,
http::{HeaderValue, header},
response::{IntoResponse, Response},
};
use bytes::Bytes;
use futures_util::stream;
use serde::Serialize;
use serde_json::Value;
#[cfg(test)]
@@ -32,6 +41,30 @@ where
Json(serde_json::to_value(data).unwrap_or(Value::Null))
}
pub fn json_success_data_bytes_response(
request_context: Option<&RequestContext>,
data_json: Bytes,
) -> Response {
if let Some(context) = request_context
&& context.wants_envelope()
{
let meta = serde_json::to_vec(&build_api_response_meta(Some(context)))
.map(Bytes::from)
.unwrap_or_else(|_| Bytes::from_static(b"null"));
let chunks = [
Bytes::from_static(b"{\"ok\":true,\"data\":"),
data_json,
Bytes::from_static(b",\"error\":null,\"meta\":"),
meta,
Bytes::from_static(b"}"),
];
let stream = stream::iter(chunks.into_iter().map(Ok::<Bytes, Infallible>));
return json_body_response(Body::from_stream(stream));
}
json_bytes_response(data_json)
}
pub fn json_error_body(
request_context: Option<&RequestContext>,
error: &ApiErrorPayload,
@@ -65,6 +98,19 @@ fn build_api_response_meta(request_context: Option<&RequestContext>) -> ApiRespo
)
}
fn json_bytes_response(bytes: Bytes) -> Response {
json_body_response(Body::from(bytes))
}
/// Turns `body` into a response and sets its `Content-Type` header to
/// `application/json; charset=utf-8`.
fn json_body_response(body: Body) -> Response {
    let content_type = HeaderValue::from_static("application/json; charset=utf-8");
    let mut json_response = body.into_response();
    json_response
        .headers_mut()
        .insert(header::CONTENT_TYPE, content_type);
    json_response
}
#[cfg(test)]
mod tests {
use super::*;
@@ -106,6 +152,31 @@ mod tests {
assert!(body.get("meta").is_none());
}
/// Collects the streamed body of a bytes-based success response and checks
/// that it parses as JSON exposing `ok`, `data.items` and `meta.requestId`.
#[tokio::test]
async fn success_response_streams_cached_data_inside_standard_envelope() {
    use http_body_util::BodyExt;

    let context = build_request_context(true);
    let cached_data = Bytes::from_static(br#"{"items":[]}"#);

    let collected = json_success_data_bytes_response(Some(&context), cached_data)
        .into_body()
        .collect()
        .await
        .expect("response body should collect");
    let envelope: Value =
        serde_json::from_slice(&collected.to_bytes()).expect("body should be json");

    assert_eq!(envelope["ok"], Value::Bool(true));
    assert_eq!(envelope["data"]["items"], Value::Array(Vec::new()));
    assert_eq!(
        envelope["meta"]["requestId"],
        Value::String("req-test".to_string())
    );
}
#[test]
fn error_body_returns_legacy_shape_without_envelope_header() {
let request_context = build_request_context(false);

View File

@@ -37,13 +37,25 @@ pub async fn limit_concurrent_requests(
fn acquire_http_request_permit(
permit_pool: Arc<HttpRequestPermitPool>,
) -> Result<OwnedSemaphorePermit, TryAcquireError> {
permit_pool.try_acquire_owned()
) -> Result<HttpRequestPermitGuard, TryAcquireError> {
match permit_pool.clone().try_acquire_owned() {
Ok(permit) => {
crate::telemetry::update_http_request_permits_available(permit_pool.available_permits());
Ok(HttpRequestPermitGuard {
permit: Some(permit),
permit_pool,
})
}
Err(error) => {
crate::telemetry::update_http_request_permits_available(permit_pool.available_permits());
Err(error)
}
}
}
fn hold_permit_until_response_body_dropped(
response: Response,
permit: OwnedSemaphorePermit,
permit: HttpRequestPermitGuard,
) -> Response {
response.map(|body| {
Body::new(body.map_frame(move |frame| {
@@ -53,6 +65,18 @@ fn hold_permit_until_response_body_dropped(
})
}
/// Holds one HTTP request permit together with the pool it was taken from.
///
/// `permit` is an `Option` so the `Drop` impl can release it with `take()`
/// before it reads the pool's available-permit count.
struct HttpRequestPermitGuard {
permit: Option<OwnedSemaphorePermit>,
permit_pool: Arc<HttpRequestPermitPool>,
}
impl Drop for HttpRequestPermitGuard {
    /// Releases the permit first, then reports the pool's refreshed
    /// available-permit count to telemetry.
    fn drop(&mut self) {
        // Overwriting the slot drops the held permit (if any) right here,
        // before the available count is read below.
        self.permit = None;
        let available = self.permit_pool.available_permits();
        crate::telemetry::update_http_request_permits_available(available);
    }
}
fn reject_overloaded_request(request: &Request<Body>) -> Response {
let request_context = request.extensions().get::<RequestContext>().cloned();
let mut response = AppError::from_status(StatusCode::TOO_MANY_REQUESTS)

View File

@@ -56,6 +56,7 @@ mod password_management;
mod phone_auth;
mod platform_errors;
mod profile_identity;
mod process_metrics;
mod prompt;
mod puzzle;
mod puzzle_agent_turn;
@@ -140,6 +141,8 @@ async fn run_server(config: AppConfig) -> Result<(), io::Error> {
enabled: config.otel_enabled,
},
)?;
process_metrics::register_process_metrics();
telemetry::register_http_runtime_metrics();
let bind_address = config.bind_socket_addr();
let listen_backlog = config.listen_backlog;

View File

@@ -0,0 +1,306 @@
use std::sync::OnceLock;
use opentelemetry::global;
use tracing::warn;
// Process metrics describe only the api-server process itself and carry no request, user, or
// work dimensions, which keeps OTLP metric cardinality from ballooning.
/// Registers the process-level instruments; once registration has run, later calls do nothing.
pub(crate) fn register_process_metrics() {
    static INSTALLED: OnceLock<()> = OnceLock::new();
    INSTALLED.get_or_init(register_process_metrics_once);
}
fn register_process_metrics_once() {
let meter = global::meter("genarrative-api");
meter
.i64_observable_up_down_counter("process.memory.usage")
.with_unit("By")
.with_description("api-server process physical memory usage")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
observer.observe(to_i64(snapshot.rss_bytes), &[]);
})
.build();
meter
.i64_observable_up_down_counter("process.memory.virtual")
.with_unit("By")
.with_description("api-server committed virtual memory")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
if let Some(virtual_bytes) = snapshot.virtual_bytes {
observer.observe(to_i64(virtual_bytes), &[]);
}
})
.build();
meter
.i64_observable_up_down_counter("genarrative.process.memory.private")
.with_unit("By")
.with_description("api-server private memory for local diagnostics")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
if let Some(private_bytes) = snapshot.private_bytes {
observer.observe(to_i64(private_bytes), &[]);
}
})
.build();
meter
.i64_observable_up_down_counter("process.thread.count")
.with_unit("{thread}")
.with_description("api-server process thread count")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
observer.observe(to_i64(snapshot.thread_count), &[]);
})
.build();
meter
.i64_observable_up_down_counter("process.windows.handle.count")
.with_unit("{handle}")
.with_description("api-server process handle count on Windows")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
if let Some(handle_count) = snapshot.windows_handle_count {
observer.observe(to_i64(handle_count), &[]);
}
})
.build();
meter
.i64_observable_up_down_counter("process.unix.file_descriptor.count")
.with_unit("{file_descriptor}")
.with_description("api-server process file descriptor count on Unix")
.with_callback(|observer| {
let Some(snapshot) = ProcessMetricsSnapshot::collect() else {
return;
};
if let Some(fd_count) = snapshot.unix_fd_count {
observer.observe(to_i64(fd_count), &[]);
}
})
.build();
}
/// Converts an unsigned reading to `i64`, clamping anything above `i64::MAX` to `i64::MAX`.
fn to_i64(value: u64) -> i64 {
    if value > i64::MAX as u64 {
        i64::MAX
    } else {
        value as i64
    }
}
/// One point-in-time reading of the current process's resource usage.
///
/// `Option` fields are `None` when the platform collector does not provide the value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ProcessMetricsSnapshot {
// Resident memory in bytes (`WorkingSetSize` on Windows; `VmRSS` or the statm resident field on Linux).
rss_bytes: u64,
// Private memory in bytes (`PrivateUsage` on Windows; `VmData` on Linux).
private_bytes: Option<u64>,
// Virtual memory in bytes (`PrivateUsage` on Windows; `VmSize` or the statm size field on Linux).
virtual_bytes: Option<u64>,
// Number of threads in the process.
thread_count: u64,
// Handle count; only the Windows collector fills this in.
windows_handle_count: Option<u64>,
// File descriptor count; only the Linux collector fills this in.
unix_fd_count: Option<u64>,
}
impl ProcessMetricsSnapshot {
    /// Collects a snapshot of the current process.
    ///
    /// A collection failure is logged as a warning and reported as `None`.
    fn collect() -> Option<Self> {
        match collect_process_metrics() {
            Ok(snapshot) => Some(snapshot),
            Err(error) => {
                warn!(%error, "采集 api-server 进程内存指标失败");
                None
            }
        }
    }
}
/// Collects process metrics on Windows through the Win32 process-status and threading APIs.
///
/// Returns an error when `GetProcessMemoryInfo` reports failure or when the thread count lookup
/// fails. A failing `GetProcessHandleCount` only leaves `windows_handle_count` as `None`.
#[cfg(windows)]
fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
use windows_sys::Win32::{
System::{
ProcessStatus::{GetProcessMemoryInfo, PROCESS_MEMORY_COUNTERS_EX},
Threading::{GetCurrentProcess, GetCurrentProcessId, GetProcessHandleCount},
},
};
// SAFETY: the call takes no arguments.
// NOTE(review): the handle is never closed here — presumably it is a pseudo-handle that does
// not need `CloseHandle`; confirm against the Win32 documentation.
let handle = unsafe { GetCurrentProcess() };
// `cb` must carry the size of the structure that is passed to the API.
let mut counters = PROCESS_MEMORY_COUNTERS_EX {
cb: std::mem::size_of::<PROCESS_MEMORY_COUNTERS_EX>() as u32,
..Default::default()
};
// SAFETY: the pointer refers to the live local `counters`, and the size argument is the `cb`
// of that same structure.
let ok = unsafe {
GetProcessMemoryInfo(
handle,
std::ptr::addr_of_mut!(counters).cast(),
counters.cb,
)
};
if ok == 0 {
return Err("GetProcessMemoryInfo returned false".to_string());
}
let mut handle_count = 0_u32;
// SAFETY: `&mut handle_count` is a valid, writable `u32` for the duration of the call.
// A zero return is treated as "handle count unavailable" rather than as a hard error.
let handle_count = if unsafe { GetProcessHandleCount(handle, &mut handle_count) } == 0 {
None
} else {
Some(u64::from(handle_count))
};
Ok(ProcessMetricsSnapshot {
rss_bytes: counters.WorkingSetSize as u64,
private_bytes: Some(counters.PrivateUsage as u64),
// NOTE(review): `virtual_bytes` reuses `PrivateUsage`, so on Windows it always equals
// `private_bytes` — confirm this is the intended source.
virtual_bytes: Some(counters.PrivateUsage as u64),
// The thread count is looked up by process id through the `WindowsProcessThreadCount` trait.
thread_count: u64::from(unsafe { GetCurrentProcessId() }.thread_count()?),
windows_handle_count: handle_count,
unix_fd_count: None,
})
}
/// Extension trait for looking up a process's thread count from its process id.
#[cfg(windows)]
trait WindowsProcessThreadCount {
/// Returns the thread count of the process identified by `self`, or a description of the failure.
fn thread_count(self) -> Result<u32, String>;
}
/// Treats the `u32` as a process id and finds its thread count in a ToolHelp process snapshot.
#[cfg(windows)]
impl WindowsProcessThreadCount for u32 {
/// Walks the process snapshot until an entry whose `th32ProcessID` equals `self` is found and
/// returns that entry's `cntThreads`.
///
/// Errors when the snapshot cannot be created or when no entry matches.
fn thread_count(self) -> Result<u32, String> {
use windows_sys::Win32::{
Foundation::{CloseHandle, INVALID_HANDLE_VALUE},
System::Diagnostics::ToolHelp::{
CreateToolhelp32Snapshot, PROCESSENTRY32, Process32First, Process32Next,
TH32CS_SNAPPROCESS,
},
};
// SAFETY: the call only takes plain integer arguments.
let snapshot = unsafe { CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0) };
if snapshot == INVALID_HANDLE_VALUE {
return Err("CreateToolhelp32Snapshot returned INVALID_HANDLE_VALUE".to_string());
}
// `dwSize` is set to the structure size before the entry is handed to the API.
let mut entry = PROCESSENTRY32 {
dwSize: std::mem::size_of::<PROCESSENTRY32>() as u32,
..Default::default()
};
let mut found = None;
// SAFETY: `snapshot` was checked against INVALID_HANDLE_VALUE above and `entry` is a live local.
let mut ok = unsafe { Process32First(snapshot, &mut entry) };
while ok != 0 {
if entry.th32ProcessID == self {
found = Some(entry.cntThreads);
break;
}
// SAFETY: same handle and entry as for `Process32First`.
ok = unsafe { Process32Next(snapshot, &mut entry) };
}
// The snapshot handle is closed on both the found and the not-found path.
// SAFETY: `snapshot` is the handle created above and is not used after this point.
unsafe {
CloseHandle(snapshot);
}
found.ok_or_else(|| format!("process {self} not found in ToolHelp snapshot"))
}
}
/// Collects process metrics on Linux from procfs.
///
/// `/proc/self/status` is the primary source. `/proc/self/statm` and the page size are consulted
/// only when `VmRSS` or `VmSize` is missing from it, so an unreadable statm file or a failing
/// page-size probe cannot discard readings that `status` already provides.
///
/// kB and page counts are converted to bytes with saturating multiplication.
///
/// # Errors
///
/// Returns a description of the failure when `/proc/self/status` cannot be read, when the
/// resident or virtual size is available from neither source, or when the `Threads` field is
/// missing.
#[cfg(target_os = "linux")]
fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
    const BYTES_PER_KB: u64 = 1024;

    /// Reads one page-count field of `/proc/self/statm` and converts it to bytes.
    fn statm_field_bytes(index: usize) -> Result<u64, String> {
        let statm = std::fs::read_to_string("/proc/self/statm")
            .map_err(|error| format!("read /proc/self/statm failed: {error}"))?;
        let pages = parse_statm_pages(&statm, index)
            .ok_or_else(|| format!("statm field {index} is missing or not a number"))?;
        Ok(pages.saturating_mul(linux_page_size_bytes()?))
    }

    let status = std::fs::read_to_string("/proc/self/status")
        .map_err(|error| format!("read /proc/self/status failed: {error}"))?;
    let status_bytes =
        |key: &str| parse_status_kb(&status, key).map(|kb| kb.saturating_mul(BYTES_PER_KB));

    let rss_bytes = match status_bytes("VmRSS:") {
        Some(bytes) => bytes,
        // Fallback: statm field 1 is the resident page count.
        None => statm_field_bytes(1)
            .map_err(|error| format!("missing VmRSS/statm resident field: {error}"))?,
    };
    let virtual_bytes = match status_bytes("VmSize:") {
        Some(bytes) => bytes,
        // Fallback: statm field 0 is the total program size in pages.
        None => statm_field_bytes(0)
            .map_err(|error| format!("missing VmSize/statm size field: {error}"))?,
    };
    let private_bytes = status_bytes("VmData:");
    let thread_count = parse_status_u64(&status, "Threads:")
        .ok_or_else(|| "missing Threads field".to_string())?;

    Ok(ProcessMetricsSnapshot {
        rss_bytes,
        private_bytes,
        virtual_bytes: Some(virtual_bytes),
        thread_count,
        windows_handle_count: None,
        unix_fd_count: linux_fd_count(),
    })
}
/// Returns the memory page size in bytes.
///
/// The value is resolved at most once per process and then served from a cache, so metric
/// callbacks do not spawn a child process on every collection. Resolution first reads the
/// `AT_PAGESZ` entry of `/proc/self/auxv` (no external binary needed) and falls back to running
/// `getconf PAGESIZE`.
///
/// # Errors
///
/// Returns a description of the `getconf` failure when neither source yields a page size.
/// Failures are not cached, so a later call retries.
#[cfg(target_os = "linux")]
fn linux_page_size_bytes() -> Result<u64, String> {
    /// Decodes one native-endian machine word; `bytes` must be exactly one word long.
    fn read_word(bytes: &[u8]) -> usize {
        let mut word = [0_u8; std::mem::size_of::<usize>()];
        word.copy_from_slice(bytes);
        usize::from_ne_bytes(word)
    }

    /// Looks up `AT_PAGESZ` in the auxiliary vector, a sequence of (key, value) word pairs.
    fn page_size_from_auxv() -> Option<u64> {
        const AT_PAGESZ: usize = 6;
        const WORD: usize = std::mem::size_of::<usize>();

        let auxv = std::fs::read("/proc/self/auxv").ok()?;
        auxv.chunks_exact(2 * WORD).find_map(|entry| {
            let (key, value) = entry.split_at(WORD);
            if read_word(key) != AT_PAGESZ {
                return None;
            }
            let size = read_word(value);
            // Reject zero or otherwise implausible values and let the fallback decide.
            if size.is_power_of_two() {
                Some(size as u64)
            } else {
                None
            }
        })
    }

    /// Asks the `getconf` utility for the page size.
    fn page_size_from_getconf() -> Result<u64, String> {
        let output = std::process::Command::new("getconf")
            .arg("PAGESIZE")
            .output()
            .map_err(|error| format!("getconf PAGESIZE failed: {error}"))?;
        if !output.status.success() {
            return Err(format!("getconf PAGESIZE exited with {}", output.status));
        }
        let text = String::from_utf8(output.stdout)
            .map_err(|error| format!("getconf PAGESIZE output is not utf8: {error}"))?;
        text.trim()
            .parse::<u64>()
            .map_err(|error| format!("parse PAGESIZE failed: {error}"))
    }

    static PAGE_SIZE: std::sync::OnceLock<u64> = std::sync::OnceLock::new();

    if let Some(&cached) = PAGE_SIZE.get() {
        return Ok(cached);
    }
    let resolved = match page_size_from_auxv() {
        Some(size) => size,
        None => page_size_from_getconf()?,
    };
    Ok(*PAGE_SIZE.get_or_init(|| resolved))
}
/// Counts the readable entries of `/proc/self/fd`; returns `None` when the directory cannot be
/// opened.
///
/// NOTE(review): the listing is produced while the directory itself is open, so the count
/// presumably includes that descriptor — confirm whether the off-by-one matters to consumers.
#[cfg(target_os = "linux")]
fn linux_fd_count() -> Option<u64> {
    let fd_dir = match std::fs::read_dir("/proc/self/fd") {
        Ok(dir) => dir,
        Err(_) => return None,
    };
    let mut open_fds = 0_u64;
    for entry in fd_dir {
        // Entries that fail to read are skipped rather than counted.
        if entry.is_ok() {
            open_fds += 1;
        }
    }
    Some(open_fds)
}
/// Reads the numeric value of a `/proc/self/status`-style field that is expressed in kB.
///
/// This simply delegates to `parse_status_u64`: only the leading number is parsed, the unit
/// suffix is not checked, and the result is NOT converted to bytes.
#[cfg(target_os = "linux")]
fn parse_status_kb(status: &str, key: &str) -> Option<u64> {
parse_status_u64(status, key)
}
/// Finds the first line of `status` that starts with `key` and whose first whitespace-separated
/// token after the key parses as `u64`, and returns that number.
///
/// Lines that match `key` but carry no parsable number are skipped, not treated as a failure.
#[cfg(target_os = "linux")]
fn parse_status_u64(status: &str, key: &str) -> Option<u64> {
    for line in status.lines() {
        let Some(rest) = line.strip_prefix(key) else {
            continue;
        };
        let Some(token) = rest.split_whitespace().next() else {
            continue;
        };
        if let Ok(value) = token.parse::<u64>() {
            return Some(value);
        }
    }
    None
}
/// Returns the `index`-th (zero-based) whitespace-separated field of a statm line as a number,
/// or `None` when the field is absent or not a valid `u64`.
#[cfg(target_os = "linux")]
fn parse_statm_pages(statm: &str, index: usize) -> Option<u64> {
    let mut fields = statm.split_whitespace();
    fields
        .nth(index)
        .and_then(|field| field.parse::<u64>().ok())
}
/// Collector for platforms other than Windows and Linux: always reports that process metrics
/// are not implemented.
#[cfg(not(any(windows, target_os = "linux")))]
fn collect_process_metrics() -> Result<ProcessMetricsSnapshot, String> {
    let reason = String::from("process metrics are only implemented for Windows and Linux");
    Err(reason)
}
#[cfg(test)]
mod tests {
    #[cfg(target_os = "linux")]
    use super::{parse_statm_pages, parse_status_kb, parse_status_u64};

    /// kB fields and the plain `Threads` field are read from a status-style sample.
    #[cfg(target_os = "linux")]
    #[test]
    fn parses_linux_proc_status_memory_fields() {
        let status = "Name:\tapi-server\nVmSize:\t 123456 kB\nVmRSS:\t 7890 kB\nVmData:\t 3456 kB\nThreads:\t37\n";

        let kb_fields = [("VmRSS:", 7890), ("VmSize:", 123456), ("VmData:", 3456)];
        for (key, expected) in kb_fields {
            assert_eq!(parse_status_kb(status, key), Some(expected));
        }
        assert_eq!(parse_status_u64(status, "Threads:"), Some(37));
    }

    /// statm fields are addressed by zero-based position; a missing position yields `None`.
    #[cfg(target_os = "linux")]
    #[test]
    fn parses_linux_statm_pages() {
        let full_line = "100 20 0 0 0 0 0";
        assert_eq!(parse_statm_pages(full_line, 0), Some(100));
        assert_eq!(parse_statm_pages(full_line, 1), Some(20));
        assert_eq!(parse_statm_pages("100 20", 7), None);
    }
}

View File

@@ -1529,15 +1529,19 @@ pub async fn claim_puzzle_work_point_incentive(
pub async fn list_puzzle_gallery(
State(state): State<AppState>,
Extension(request_context): Extension<RequestContext>,
) -> Result<Json<Value>, Response> {
) -> Result<Response, Response> {
if let Some(response) = state.puzzle_gallery_cache().read_fresh_response().await {
crate::telemetry::record_puzzle_gallery_cache_hit();
return Ok(puzzle_gallery_cached_json(&request_context, response));
}
crate::telemetry::record_puzzle_gallery_cache_miss();
let _rebuild_guard = state.puzzle_gallery_cache().acquire_rebuild_guard().await;
if let Some(response) = state.puzzle_gallery_cache().read_fresh_response().await {
crate::telemetry::record_puzzle_gallery_cache_hit();
return Ok(puzzle_gallery_cached_json(&request_context, response));
}
let rebuild_started_at = std::time::Instant::now();
let items = state
.spacetime_client()
.list_puzzle_gallery()
@@ -1556,12 +1560,26 @@ pub async fn list_puzzle_gallery(
.map(|item| map_puzzle_gallery_card_response(&state, item))
.collect(),
);
state
let cached_response = state
.puzzle_gallery_cache()
.store_response(response.clone())
.await;
.store_response(response)
.await
.map_err(|error| {
puzzle_error_response(
&request_context,
PUZZLE_GALLERY_PROVIDER,
AppError::from_status(StatusCode::INTERNAL_SERVER_ERROR).with_details(json!({
"provider": PUZZLE_GALLERY_PROVIDER,
"message": format!("拼图广场缓存序列化失败:{error}"),
})),
)
})?;
crate::telemetry::record_puzzle_gallery_cache_rebuild(
rebuild_started_at.elapsed(),
cached_response.data_json_len(),
);
Ok(json_success_body(Some(&request_context), response))
Ok(puzzle_gallery_cached_json(&request_context, cached_response))
}
pub async fn get_puzzle_gallery_detail(

View File

@@ -3,8 +3,8 @@ use std::{
time::{Duration, Instant},
};
use axum::Json;
use serde_json::Value;
use axum::response::Response;
use bytes::Bytes;
use shared_contracts::{
puzzle_gallery::{PuzzleGalleryResponse, PuzzleGalleryWorkRefResponse},
puzzle_works::PuzzleWorkSummaryResponse,
@@ -14,7 +14,7 @@ use tokio::{
time,
};
use crate::{api_response::json_success_body, request_context::RequestContext};
use crate::{api_response::json_success_data_bytes_response, request_context::RequestContext};
const PUZZLE_GALLERY_PRIMARY_ITEM_COUNT: usize = 10;
const PUZZLE_GALLERY_PREVIEW_REF_COUNT: usize = 10;
@@ -30,10 +30,21 @@ pub struct PuzzleGalleryCache {
#[derive(Clone, Debug)]
struct PuzzleGalleryCacheEntry {
response: PuzzleGalleryResponse,
data_json: Bytes,
built_at: Instant,
}
/// Pre-serialized puzzle gallery payload handed out by the cache.
#[derive(Clone, Debug)]
pub struct PuzzleGalleryCachedResponse {
// JSON bytes of the gallery response data, served as the response's `data` payload.
data_json: Bytes,
}
impl PuzzleGalleryCachedResponse {
/// Length in bytes of the serialized payload.
pub fn data_json_len(&self) -> usize {
self.data_json.len()
}
}
impl PuzzleGalleryCache {
pub fn new() -> Self {
Self {
@@ -46,22 +57,31 @@ impl PuzzleGalleryCache {
self.rebuild_lock.lock().await
}
pub async fn read_fresh_response(&self) -> Option<PuzzleGalleryResponse> {
pub async fn read_fresh_response(&self) -> Option<PuzzleGalleryCachedResponse> {
let guard = self.inner.read().await;
let entry = guard.as_ref()?;
let now = Instant::now();
if now.duration_since(entry.built_at) > PUZZLE_GALLERY_CACHE_TTL {
return None;
}
Some(entry.response.clone())
Some(PuzzleGalleryCachedResponse {
data_json: entry.data_json.clone(),
})
}
pub async fn store_response(&self, response: PuzzleGalleryResponse) {
pub async fn store_response(
&self,
response: PuzzleGalleryResponse,
) -> Result<PuzzleGalleryCachedResponse, serde_json::Error> {
let now = Instant::now();
let cached = PuzzleGalleryCachedResponse {
data_json: Bytes::from(serde_json::to_vec(&response)?),
};
*self.inner.write().await = Some(PuzzleGalleryCacheEntry {
response,
data_json: cached.data_json.clone(),
built_at: now,
});
Ok(cached)
}
pub fn spawn_cleanup_task(&self) {
@@ -118,9 +138,9 @@ pub fn build_puzzle_gallery_window_response(
pub fn puzzle_gallery_cached_json(
request_context: &RequestContext,
response: PuzzleGalleryResponse,
) -> Json<Value> {
json_success_body(Some(request_context), response)
response: PuzzleGalleryCachedResponse,
) -> Response {
json_success_data_bytes_response(Some(request_context), response.data_json)
}
#[cfg(test)]

View File

@@ -4,11 +4,19 @@ use axum::{
http::{HeaderMap, Request, Response},
middleware::Next,
};
use http_body_util::BodyExt;
use opentelemetry::{KeyValue, global, metrics::Counter};
use std::sync::{
Arc, OnceLock,
atomic::{AtomicI64, Ordering},
};
use tracing::{info, warn};
use crate::{request_context::resolve_request_id, state::AppState};
static HTTP_RESPONSE_BODY_IN_FLIGHT: AtomicI64 = AtomicI64::new(0);
static HTTP_REQUEST_PERMITS_AVAILABLE: OnceLock<Arc<AtomicI64>> = OnceLock::new();
// 集中维护 api-server HTTP 观测,避免在 handler 中散落高基数字段或重复创建 instrument。
pub async fn record_http_observability(
State(state): State<AppState>,
@@ -67,7 +75,46 @@ pub async fn record_http_observability(
);
}
response
track_response_body_in_flight(response)
}
/// Publishes the number of currently available HTTP request permits.
///
/// The first call creates the shared cell and registers the observable instrument that reads it.
/// Every call then stores `available`, clamped to the `i64` range.
pub(crate) fn update_http_request_permits_available(available: usize) {
    let cell = HTTP_REQUEST_PERMITS_AVAILABLE.get_or_init(|| {
        let cell = Arc::new(AtomicI64::new(0));
        register_http_request_permits_available_metric(Arc::clone(&cell));
        cell
    });
    let clamped = available.min(i64::MAX as usize) as i64;
    cell.store(clamped, Ordering::Relaxed);
}
/// Adds one to the puzzle gallery cache hit counter.
pub(crate) fn record_puzzle_gallery_cache_hit() {
    let metrics = puzzle_gallery_cache_metrics();
    metrics.hits.add(1, &[]);
}
/// Adds one to the puzzle gallery cache miss counter.
pub(crate) fn record_puzzle_gallery_cache_miss() {
    let metrics = puzzle_gallery_cache_metrics();
    metrics.misses.add(1, &[]);
}
pub(crate) fn record_puzzle_gallery_cache_rebuild(duration: std::time::Duration, data_bytes: usize) {
let metrics = puzzle_gallery_cache_metrics();
metrics.rebuilds.add(1, &[]);
metrics
.rebuild_duration
.record(duration.as_secs_f64(), &[]);
metrics
.data_json_bytes
.record(data_bytes.min(u64::MAX as usize) as u64, &[]);
}
fn track_response_body_in_flight(response: Response<Body>) -> Response<Body> {
response.map(|body| {
HTTP_RESPONSE_BODY_IN_FLIGHT.fetch_add(1, Ordering::Relaxed);
let guard = ResponseBodyInFlightGuard;
Body::new(body.map_frame(move |frame| {
let _guard = &guard;
frame
}))
})
}
struct HttpMetrics {
@@ -76,6 +123,22 @@ struct HttpMetrics {
duration: opentelemetry::metrics::Histogram<f64>,
}
/// Instruments describing the puzzle gallery response cache.
struct PuzzleGalleryCacheMetrics {
// Number of cache hits.
hits: Counter<u64>,
// Number of cache misses.
misses: Counter<u64>,
// Number of cache rebuilds.
rebuilds: Counter<u64>,
// Rebuild duration; recorded in seconds.
rebuild_duration: opentelemetry::metrics::Histogram<f64>,
// Size of the serialized gallery data JSON; recorded in bytes.
data_json_bytes: opentelemetry::metrics::Histogram<u64>,
}
/// Marker whose drop decrements the in-flight response body counter.
///
/// Creating the guard does not increment the counter; `track_response_body_in_flight` does that
/// just before constructing it.
struct ResponseBodyInFlightGuard;
impl Drop for ResponseBodyInFlightGuard {
// Undo the increment made when the body started being tracked.
fn drop(&mut self) {
HTTP_RESPONSE_BODY_IN_FLIGHT.fetch_sub(1, Ordering::Relaxed);
}
}
fn http_metrics() -> &'static HttpMetrics {
static METRICS: std::sync::OnceLock<HttpMetrics> = std::sync::OnceLock::new();
METRICS.get_or_init(|| {
@@ -99,6 +162,64 @@ fn http_metrics() -> &'static HttpMetrics {
})
}
/// Returns the process-wide puzzle gallery cache instruments, creating them on first use.
fn puzzle_gallery_cache_metrics() -> &'static PuzzleGalleryCacheMetrics {
    /// Creates the instruments on the `genarrative-api` meter.
    fn build_instruments() -> PuzzleGalleryCacheMetrics {
        let meter = global::meter("genarrative-api");

        let hits = meter
            .u64_counter("genarrative.puzzle_gallery.cache.hits")
            .with_description("Puzzle gallery response cache hits")
            .build();
        let misses = meter
            .u64_counter("genarrative.puzzle_gallery.cache.misses")
            .with_description("Puzzle gallery response cache misses")
            .build();
        let rebuilds = meter
            .u64_counter("genarrative.puzzle_gallery.cache.rebuilds")
            .with_description("Puzzle gallery response cache rebuild count")
            .build();
        let rebuild_duration = meter
            .f64_histogram("genarrative.puzzle_gallery.cache.rebuild.duration")
            .with_unit("s")
            .with_description("Puzzle gallery response cache rebuild duration")
            .build();
        let data_json_bytes = meter
            .u64_histogram("genarrative.puzzle_gallery.cache.data_json_bytes")
            .with_unit("By")
            .with_description("Serialized puzzle gallery data JSON size")
            .build();

        PuzzleGalleryCacheMetrics {
            hits,
            misses,
            rebuilds,
            rebuild_duration,
            data_json_bytes,
        }
    }

    static INSTRUMENTS: std::sync::OnceLock<PuzzleGalleryCacheMetrics> =
        std::sync::OnceLock::new();
    INSTRUMENTS.get_or_init(build_instruments)
}
/// Registers the observable instrument that reports the value currently stored in `gauge`.
fn register_http_request_permits_available_metric(gauge: Arc<AtomicI64>) {
    let meter = global::meter("genarrative-api");
    let builder = meter
        .i64_observable_up_down_counter("genarrative.http.server.request_permits.available")
        .with_unit("{permit}")
        .with_description("Available api-server HTTP backpressure permits");
    builder
        .with_callback(move |observer| {
            let available = gauge.load(Ordering::Relaxed);
            observer.observe(available, &[]);
        })
        .build();
}
/// Registers the HTTP runtime instruments; once registration has run, later calls do nothing.
///
/// Currently this is a single observable instrument reporting the in-flight response body
/// counter.
pub(crate) fn register_http_runtime_metrics() {
    /// Creates the in-flight response body instrument on the `genarrative-api` meter.
    fn install() {
        let meter = global::meter("genarrative-api");
        let builder = meter
            .i64_observable_up_down_counter("genarrative.http.server.response_bodies.in_flight")
            .with_unit("{response}")
            .with_description("HTTP response bodies still owned by Axum/Hyper");
        builder
            .with_callback(|observer| {
                let in_flight = HTTP_RESPONSE_BODY_IN_FLIGHT.load(Ordering::Relaxed);
                observer.observe(in_flight, &[]);
            })
            .build();
    }

    static INSTALLED: OnceLock<()> = OnceLock::new();
    INSTALLED.get_or_init(install);
}
fn http_base_labels(method: String, route: String) -> Vec<KeyValue> {
vec![
KeyValue::new("http.request.method", method),

View File

@@ -74,7 +74,7 @@ impl SpacetimeClient {
pub async fn list_big_fish_gallery(
&self,
) -> Result<Vec<BigFishWorkSummaryRecord>, SpacetimeClientError> {
self.read_after_connect(move |connection| {
self.read_after_connect("list_big_fish_gallery", move |connection| {
let recent_play_counts = public_work_recent_play_counts(connection, "big-fish");
let mut items = connection
.db()

View File

@@ -199,7 +199,7 @@ impl SpacetimeClient {
async fn read_custom_world_gallery_entries_from_cache(
&self,
) -> Result<Vec<CustomWorldGalleryEntryRecord>, SpacetimeClientError> {
self.read_after_connect(move |connection| {
self.read_after_connect("list_custom_world_gallery", move |connection| {
let recent_play_counts = public_work_recent_play_counts(connection, "custom-world");
let mut entries = connection
.db()

View File

@@ -407,12 +407,21 @@ impl SpacetimeClient {
async fn read_after_connect<T>(
&self,
read_name: &'static str,
read: impl FnOnce(&DbConnection) -> Result<T, SpacetimeClientError> + Send + 'static,
) -> Result<T, SpacetimeClientError>
where
T: Send + 'static,
{
let lease = self.acquire_connection().await?;
let metrics_guard = telemetry::begin_read(read_name);
let lease = match self.acquire_connection().await {
Ok(lease) => lease,
Err(error) => {
let final_result = Err(error);
metrics_guard.finish(&final_result);
return final_result;
}
};
let final_result = if let Some(connection) = lease.connection.as_ref() {
read(&connection.connection)
} else {
@@ -422,6 +431,7 @@ impl SpacetimeClient {
};
self.release_connection(lease).await;
metrics_guard.finish(&final_result);
final_result
}

View File

@@ -225,7 +225,7 @@ impl SpacetimeClient {
pub async fn list_match3d_gallery(
&self,
) -> Result<Vec<Match3DWorkProfileRecord>, SpacetimeClientError> {
self.read_after_connect(move |connection| {
self.read_after_connect("list_match3d_gallery", move |connection| {
let mut items = connection
.db()
.match_3_d_gallery_view()

View File

@@ -403,7 +403,7 @@ impl SpacetimeClient {
pub async fn list_puzzle_gallery(
&self,
) -> Result<Vec<PuzzleGalleryCardRecord>, SpacetimeClientError> {
self.read_after_connect(move |connection| {
self.read_after_connect("list_puzzle_gallery", move |connection| {
let mut items = connection
.db()
.puzzle_gallery_card_view()

View File

@@ -5,7 +5,7 @@ impl SpacetimeClient {
&self,
) -> Result<CreationEntryConfigRecord, SpacetimeClientError> {
match self
.read_after_connect(move |connection| {
.read_after_connect("get_creation_entry_config", move |connection| {
let config_id = module_runtime::CREATION_ENTRY_CONFIG_GLOBAL_ID.to_string();
let header = connection
.db()

View File

@@ -228,7 +228,7 @@ impl SpacetimeClient {
pub async fn list_square_hole_gallery(
&self,
) -> Result<Vec<SquareHoleWorkProfileRecord>, SpacetimeClientError> {
self.read_after_connect(move |connection| {
self.read_after_connect("list_square_hole_gallery", move |connection| {
let mut items = connection
.db()
.square_hole_gallery_view()

View File

@@ -10,6 +10,11 @@ pub(crate) struct ProcedureMetricsGuard {
started_at: std::time::Instant,
}
/// Timing handle for one named read; created by `begin_read` and completed with `finish`.
pub(crate) struct ReadMetricsGuard {
// Static name of the read, used as the `read` metric label.
read: &'static str,
// Moment the read started; `finish` measures the elapsed time from here.
started_at: std::time::Instant,
}
pub(crate) fn begin_procedure(procedure: &'static str) -> ProcedureMetricsGuard {
ProcedureMetricsGuard {
procedure,
@@ -17,6 +22,13 @@ pub(crate) fn begin_procedure(procedure: &'static str) -> ProcedureMetricsGuard
}
}
/// Starts timing a read identified by the static name `read`.
pub(crate) fn begin_read(read: &'static str) -> ReadMetricsGuard {
    let started_at = std::time::Instant::now();
    ReadMetricsGuard { read, started_at }
}
impl ProcedureMetricsGuard {
pub(crate) fn finish<T>(&self, result: &Result<T, SpacetimeClientError>) {
let duration = self.started_at.elapsed();
@@ -24,10 +36,20 @@ impl ProcedureMetricsGuard {
}
}
impl ReadMetricsGuard {
    /// Records the read's elapsed time and whether `result` is an error.
    pub(crate) fn finish<T>(&self, result: &Result<T, SpacetimeClientError>) {
        let elapsed = self.started_at.elapsed();
        let failed = result.is_err();
        record_read(self.read, elapsed, failed);
    }
}
/// Instruments for SpacetimeDB access: one set for procedures and one set for reads.
struct SpacetimeMetrics {
// Procedure call count. NOTE(review): inferred from the field name and the neighbouring procedure instruments — confirm.
calls: Counter<u64>,
// Procedure error count.
errors: Counter<u64>,
// Procedure duration in milliseconds.
duration_ms: opentelemetry::metrics::Histogram<f64>,
// Local subscription cache read count.
read_calls: Counter<u64>,
// Local subscription cache read error count.
read_errors: Counter<u64>,
// Local subscription cache read duration in milliseconds.
read_duration_ms: opentelemetry::metrics::Histogram<f64>,
}
fn spacetime_metrics() -> &'static SpacetimeMetrics {
@@ -48,6 +70,19 @@ fn spacetime_metrics() -> &'static SpacetimeMetrics {
.with_unit("ms")
.with_description("SpacetimeDB procedure duration in milliseconds")
.build(),
read_calls: meter
.u64_counter("genarrative.spacetime.read.calls")
.with_description("SpacetimeDB local subscription cache read count")
.build(),
read_errors: meter
.u64_counter("genarrative.spacetime.read.errors")
.with_description("SpacetimeDB local subscription cache read error count")
.build(),
read_duration_ms: meter
.f64_histogram("genarrative.spacetime.read.duration_ms")
.with_unit("ms")
.with_description("SpacetimeDB local subscription cache read duration in milliseconds")
.build(),
}
})
}
@@ -66,3 +101,18 @@ fn record_procedure(procedure: &'static str, duration: Duration, failed: bool) {
metrics.errors.add(1, &labels);
}
}
/// Records one read: always counts the call and its duration in milliseconds, and additionally
/// counts an error when `failed` is set.
///
/// Every measurement carries the `read` name and a `status_class` of `"error"` or `"ok"`.
fn record_read(read: &'static str, duration: Duration, failed: bool) {
    let status_class = if failed { "error" } else { "ok" };
    let labels = [
        KeyValue::new("read", read),
        KeyValue::new("status_class", status_class),
    ];
    let elapsed_ms = duration.as_secs_f64() * 1000.0;

    let metrics = spacetime_metrics();
    metrics.read_calls.add(1, &labels);
    metrics.read_duration_ms.record(elapsed_ms, &labels);
    if failed {
        metrics.read_errors.add(1, &labels);
    }
}

View File

@@ -239,7 +239,7 @@ impl SpacetimeClient {
pub async fn list_visual_novel_gallery(
&self,
) -> Result<Vec<VisualNovelWorkProfileRecord>, SpacetimeClientError> {
self.read_after_connect(move |connection| {
self.read_after_connect("list_visual_novel_gallery", move |connection| {
let mut items = connection
.db()
.visual_novel_gallery_view()