Implement per-service disk usage monitoring

Replaced system-wide disk usage with accurate per-service tracking by scanning
service-specific directories. Services like sshd now correctly show minimal
disk usage instead of misleading system totals.

- Rename storage widget and add drive capacity/usage columns
- Move host display to main dashboard title for cleaner layout
- Replace separate alert displays with color-coded row highlighting
- Add per-service disk usage collection using du command
- Update services widget formatting to handle small disk values
- Restructure into workspace with dedicated agent and dashboard packages
Commit 2581435b10 (parent 82afe3d4f1)
Date: 2025-10-11 22:59:16 +02:00
30 changed files with 4801 additions and 446 deletions

View File

@@ -6,7 +6,10 @@ mod ui;
use std::fs;
use std::io::{self, Stdout};
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use std::sync::{
atomic::{AtomicBool, Ordering},
Arc, OnceLock,
};
use std::time::Duration;
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics};
@@ -100,8 +103,14 @@ async fn main() -> Result<()> {
let mut app = App::new(options)?;
let (event_tx, mut event_rx) = unbounded_channel();
let shutdown_flag = Arc::new(AtomicBool::new(false));
let zmq_task = if let Some(context) = app.zmq_context() {
Some(spawn_metrics_task(context, event_tx.clone()))
Some(spawn_metrics_task(
context,
event_tx.clone(),
shutdown_flag.clone(),
))
} else {
None
};
@@ -109,9 +118,12 @@ async fn main() -> Result<()> {
let mut terminal = setup_terminal()?;
let result = run_app(&mut terminal, &mut app, &mut event_rx);
teardown_terminal(terminal)?;
shutdown_flag.store(true, Ordering::Relaxed);
let _ = event_tx.send(AppEvent::Shutdown);
if let Some(handle) = zmq_task {
handle.abort();
if let Err(join_error) = handle.await {
warn!(%join_error, "ZMQ metrics task ended unexpectedly");
}
}
result
}
@@ -206,9 +218,13 @@ fn prepare_log_writer() -> Result<tracing_appender::non_blocking::NonBlocking> {
Ok(non_blocking)
}
fn spawn_metrics_task(context: ZmqContext, sender: UnboundedSender<AppEvent>) -> JoinHandle<()> {
fn spawn_metrics_task(
context: ZmqContext,
sender: UnboundedSender<AppEvent>,
shutdown: Arc<AtomicBool>,
) -> JoinHandle<()> {
tokio::spawn(async move {
match spawn_blocking(move || metrics_blocking_loop(context, sender)).await {
match spawn_blocking(move || metrics_blocking_loop(context, sender, shutdown)).await {
Ok(Ok(())) => {}
Ok(Err(error)) => warn!(%error, "ZMQ metrics worker exited with error"),
Err(join_error) => warn!(%join_error, "ZMQ metrics worker panicked"),
@@ -216,12 +232,23 @@ fn spawn_metrics_task(context: ZmqContext, sender: UnboundedSender<AppEvent>) ->
})
}
fn metrics_blocking_loop(context: ZmqContext, sender: UnboundedSender<AppEvent>) -> Result<()> {
fn metrics_blocking_loop(
context: ZmqContext,
sender: UnboundedSender<AppEvent>,
shutdown: Arc<AtomicBool>,
) -> Result<()> {
let zmq_context = NativeZmqContext::new();
let socket = zmq_context
.socket(zmq::SUB)
.context("failed to create ZMQ SUB socket")?;
socket
.set_linger(0)
.context("failed to configure ZMQ linger")?;
socket
.set_rcvtimeo(1_000)
.context("failed to configure ZMQ receive timeout")?;
for endpoint in context.endpoints() {
debug!(%endpoint, "connecting to ZMQ endpoint");
socket
@@ -239,7 +266,7 @@ fn metrics_blocking_loop(context: ZmqContext, sender: UnboundedSender<AppEvent>)
.context("failed to subscribe to all ZMQ topics")?;
}
loop {
while !shutdown.load(Ordering::Relaxed) {
match socket.recv_msg(0) {
Ok(message) => {
if let Err(error) = handle_zmq_message(&message, &sender) {
@@ -247,11 +274,18 @@ fn metrics_blocking_loop(context: ZmqContext, sender: UnboundedSender<AppEvent>)
}
}
Err(error) => {
if error == zmq::Error::EAGAIN {
continue;
}
warn!(%error, "ZMQ receive error");
std::thread::sleep(Duration::from_secs(1));
std::thread::sleep(Duration::from_millis(250));
}
}
}
debug!("ZMQ metrics worker shutting down");
Ok(())
}
fn handle_zmq_message(
@@ -442,7 +476,7 @@ tick_rate_ms = 250
history_duration_minutes = 60
[[dashboard.widgets]]
id = "nvme"
id = "storage"
enabled = true
[[dashboard.widgets]]