diff --git a/CLAUDE.md b/CLAUDE.md index 737ed06..0b20d97 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -14,10 +14,11 @@ A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure. ### Key Features - **NVMe health monitoring** with wear prediction -- **RAM optimization tracking** (tmpfs, zram, kernel metrics) -- **Service resource monitoring** with sandboxed limits +- **CPU / memory / GPU telemetry** with automatic thresholding +- **Service resource monitoring** with per-service CPU and RAM usage +- **Disk usage overview** for root filesystems - **Backup status** with detailed metrics and history -- **Email notification integration** +- **Unified alert pipeline** summarising host health - **Historical data tracking** and trend analysis ## Technical Architecture @@ -93,8 +94,10 @@ cm-dashboard/ 2. **Service Metrics API** (port 6128) - Service status and resource usage - - Memory consumption vs limits - - Disk usage per service + - Service memory consumption vs limits + - Host CPU load / frequency / temperature + - Root disk utilisation snapshot + - GPU utilisation and temperature (if available) 3. 
**Backup Metrics API** (port 6129) - Backup status and history @@ -119,6 +122,26 @@ pub struct ServiceMetrics { pub timestamp: u64, } +#[derive(Deserialize, Debug)] +pub struct ServiceSummary { + pub healthy: usize, + pub degraded: usize, + pub failed: usize, + pub memory_used_mb: f32, + pub memory_quota_mb: f32, + pub system_memory_used_mb: f32, + pub system_memory_total_mb: f32, + pub disk_used_gb: f32, + pub disk_total_gb: f32, + pub cpu_load_1: f32, + pub cpu_load_5: f32, + pub cpu_load_15: f32, + pub cpu_freq_mhz: Option, + pub cpu_temp_c: Option, + pub gpu_load_percent: Option, + pub gpu_temp_c: Option, +} + #[derive(Deserialize, Debug)] pub struct BackupMetrics { pub overall_status: String, @@ -617,4 +640,4 @@ smartmontools-rs = "0.1" # Or direct smartctl bindings **Performance Targets**: - **Agent footprint**: < 2MB RAM, < 1% CPU - **Metric latency**: < 100ms propagation across network -- **Network efficiency**: < 1KB/s per host steady state \ No newline at end of file +- **Network efficiency**: < 1KB/s per host steady state diff --git a/Cargo.lock b/Cargo.lock index 9578eb2..cebc85a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -98,6 +98,17 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "autocfg" version = "1.5.0" @@ -240,6 +251,7 @@ dependencies = [ "clap", "cm-dashboard-shared", "crossterm", + "gethostname", "ratatui", "serde", "serde_json", @@ -256,13 +268,18 @@ name = "cm-dashboard-agent" version = "0.1.0" dependencies = [ "anyhow", + "async-trait", "chrono", "clap", "cm-dashboard-shared", + "futures", + "gethostname", "rand", "serde", "serde_json", + "thiserror", 
"tokio", + "toml", "tracing", "tracing-appender", "tracing-subscriber", @@ -415,6 +432,105 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "gethostname" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0176e0459c2e4a1fe232f984bca6890e681076abb9934f6cea7c326f3fc47818" +dependencies = [ + "libc", + "windows-targets 0.48.5", +] + [[package]] name = "getrandom" version = "0.2.16" @@ -738,6 +854,12 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" version = "0.3.32" diff --git a/README.md b/README.md index b44fcf7..1c1d37b 100644 --- a/README.md +++ b/README.md @@ -6,20 +6,22 @@ CM Dashboard is a Rust-powered terminal UI for real-time monitoring of CMTEC inf ┌──────────────────────────────────────────────────────────────────────────────┐ │ CM Dashboard │ ├────────────────────────────┬────────────────────────────┬────────────────────┤ -│ NVMe Health │ Services │ Memory Optimization │ +│ NVMe Health │ Services │ CPU / Memory │ │ Host: srv01 │ Host: srv01 │ Host: srv01 │ -│ Status: Healthy │ Services healthy: 5 │ Memory used: 2048 / │ -│ Drives 
healthy/warn/crit: │ Degraded: 1 Failed: 0 │ 4096 MiB (50.0%) │ -│ 4/0/0 │ CPU top service: 71.3% │ Last update: 12:34: │ -│ Capacity used: 512.0 / │ Total memory: 1536 / 2048 │ 56 │ -│ 2048.0 GiB │ MiB │ │ +│ Status: Healthy │ Service memory: 1.2G/4.0G │ RAM: 6.9 / 7.8 GiB │ +│ Healthy/Warning/Critical: │ Disk usage: 45 / 500 GiB │ CPU load (1/5/15): │ +│ 4 / 0 / 0 │ Services tracked: 8 │ 1.2 0.9 0.7 │ +│ Capacity used: 512 / 2048G │ │ CPU temp: 68°C │ +│ Issue: — │ nginx running 320M │ GPU temp: — │ +│ │ immich running 1.2G │ Status • ok │ +│ │ backup-api running 40M │ │ ├────────────────────────────┴────────────┬───────────────┴────────────────────┤ │ Backups │ Alerts │ -│ Host: srv01 │ srv01: OK │ -│ Status: Healthy │ labbox: smart warning │ -│ Last success: 2024-02-01 03:12:45 │ │ -│ Snapshots: 17 • Size: 512.0 GiB │ │ -│ Pending jobs: 0 (enabled: true) │ │ +│ Host: srv01 │ srv01: ok │ +│ Overall: Healthy │ labbox: warning: RAM 82% │ +│ Last success: 2024-02-01 03:12:45 │ cmbox: critical: CPU temp 92°C │ +│ Snapshots: 17 • Size: 512.0 GiB │ Update: 2024-02-01 10:15:32 │ +│ Pending jobs: 0 (enabled: true) │ │ └──────────────────────────────┬───────────────────────────────────────────────┘ │ Status │ │ │ Active host: srv01 (1/3) │ History retention ≈ 3600s │ @@ -99,7 +101,8 @@ Adjust the host list and `data_source.zmq.endpoints` to match your CMTEC gossip ## Features - Rotating host selection with left/right arrows (`←`, `→`, `h`, `l`, `Tab`) -- Live NVMe, service, memory, backup, and alert summaries per active host +- Live NVMe, service, CPU/memory, backup, and alert panels per host +- Health scoring that rolls CPU/RAM/GPU pressure into alerts automatically - Structured logging with `tracing` (`-v`/`-vv` to increase verbosity) - Help overlay (`?`) outlining keyboard shortcuts - Config-driven host discovery via `config/dashboard.toml` diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 9c9c6ac..8424e6e 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ 
-6,13 +6,18 @@ edition = "2021" [dependencies] cm-dashboard-shared = { path = "../shared" } anyhow = "1.0" +async-trait = "0.1" clap = { version = "4.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" chrono = { version = "0.4", features = ["serde"] } +thiserror = "1.0" +toml = "0.8" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] } tracing-appender = "0.2" zmq = "0.10" -tokio = { version = "1.0", features = ["full"] } +tokio = { version = "1.0", features = ["full", "process"] } +futures = "0.3" rand = "0.8" +gethostname = "0.4" diff --git a/agent/src/agent.rs b/agent/src/agent.rs new file mode 100644 index 0000000..76f65cb --- /dev/null +++ b/agent/src/agent.rs @@ -0,0 +1,263 @@ +use cm_dashboard_shared::envelope::MessageEnvelope; +use std::sync::Arc; +use tokio::sync::mpsc; +use tokio::time::{interval, Duration}; +use tracing::{debug, error, info, warn}; +use zmq::{Context, SocketType}; + +use crate::collectors::{ + backup::BackupCollector, service::ServiceCollector, smart::SmartCollector, AgentType, + CollectorOutput, +}; +use crate::config::AgentConfig; +use crate::scheduler::{CollectorScheduler, HealthChecker, HealthStatus}; + +pub struct MetricsAgent { + config: AgentConfig, + scheduler: CollectorScheduler, + health_checker: Option, +} + +impl MetricsAgent { + pub fn from_config(config: AgentConfig) -> Result> { + let mut agent = Self::new(config)?; + agent.initialize_collectors()?; + Ok(agent) + } + + pub fn new(config: AgentConfig) -> Result> { + Ok(Self { + config, + scheduler: CollectorScheduler::new(), + health_checker: None, + }) + } + + pub fn initialize_collectors(&mut self) -> Result<(), Box> { + info!("Initializing collectors..."); + + // Create SMART collector + if self.config.collectors.smart.enabled { + let smart_collector = SmartCollector::new( + self.config.collectors.smart.enabled, + self.config.collectors.smart.interval_ms, + 
self.config.collectors.smart.devices.clone(), + ); + self.scheduler.add_collector(Arc::new(smart_collector)); + info!("SMART collector initialized"); + } + + // Create Service collector + if self.config.collectors.service.enabled { + let service_collector = ServiceCollector::new( + self.config.collectors.service.enabled, + self.config.collectors.service.interval_ms, + self.config.collectors.service.services.clone(), + ); + self.scheduler.add_collector(Arc::new(service_collector)); + info!("Service collector initialized"); + } + + // Create Backup collector + if self.config.collectors.backup.enabled { + let backup_collector = BackupCollector::new( + self.config.collectors.backup.enabled, + self.config.collectors.backup.interval_ms, + self.config.collectors.backup.restic_repo.clone(), + self.config.collectors.backup.backup_service.clone(), + ); + self.scheduler.add_collector(Arc::new(backup_collector)); + info!("Backup collector initialized"); + } + + let enabled_count = self.config.get_enabled_collector_count(); + if enabled_count == 0 { + return Err("No collectors are enabled".into()); + } + + info!("Initialized {} collectors", enabled_count); + Ok(()) + } + + pub async fn run(&mut self) -> Result<(), Box> { + info!( + "Starting metrics agent for host '{}'", + self.config.agent.hostname + ); + + // Initialize health checker + let stats = self.scheduler.get_stats_handle(); + self.health_checker = Some(HealthChecker::new(stats)); + + // Forward successful collection results to the publisher + let (metrics_tx, metrics_rx) = mpsc::unbounded_channel(); + self.scheduler.set_metrics_sender(metrics_tx); + let publisher_task = self.start_publisher_task(metrics_rx)?; + + // Start health monitoring task + let health_task = self.start_health_monitoring_task().await?; + + // Start the collector scheduler (this will block) + let scheduler_result = self.scheduler.start().await; + + // Drop the metrics sender so the publisher can exit cleanly + 
self.scheduler.clear_metrics_sender(); + + // Wait for background tasks to complete + if let Err(join_error) = health_task.await { + warn!("Health monitoring task ended unexpectedly: {}", join_error); + } + + if let Err(join_error) = publisher_task.await { + warn!("Publisher task ended unexpectedly: {}", join_error); + } + + match scheduler_result { + Ok(_) => { + info!("Agent shutdown completed successfully"); + Ok(()) + } + Err(e) => { + error!("Agent encountered an error: {}", e); + Err(e.into()) + } + } + } + + fn start_publisher_task( + &self, + mut metrics_rx: mpsc::UnboundedReceiver, + ) -> Result, Box> { + let bind_address = format!( + "tcp://{}:{}", + self.config.zmq.bind_address, self.config.zmq.port + ); + let send_timeout = self.config.zmq.send_timeout_ms as i32; + let hostname = self.config.agent.hostname.clone(); + + let handle = tokio::spawn(async move { + let context = Context::new(); + + let socket = match context.socket(SocketType::PUB) { + Ok(socket) => socket, + Err(error) => { + error!("Failed to create ZMQ PUB socket: {}", error); + return; + } + }; + + if let Err(error) = socket.set_sndtimeo(send_timeout) { + warn!("Failed to apply ZMQ send timeout: {}", error); + } + + if let Err(error) = socket.bind(&bind_address) { + error!( + "Failed to bind ZMQ publisher to {}: {}", + bind_address, error + ); + return; + } + + info!("ZMQ publisher bound to {}", bind_address); + + while let Some(output) = metrics_rx.recv().await { + let CollectorOutput { + agent_type, + data, + timestamp, + } = output; + + let envelope_agent_type = match agent_type { + AgentType::Smart => cm_dashboard_shared::envelope::AgentType::Smart, + AgentType::Service => cm_dashboard_shared::envelope::AgentType::Service, + AgentType::Backup => cm_dashboard_shared::envelope::AgentType::Backup, + }; + + let epoch = timestamp.timestamp(); + let epoch_u64 = if epoch < 0 { 0 } else { epoch as u64 }; + + let envelope = MessageEnvelope { + hostname: hostname.clone(), + agent_type: 
envelope_agent_type.clone(), + timestamp: epoch_u64, + metrics: data, + }; + + match serde_json::to_vec(&envelope) { + Ok(serialized) => { + if let Err(error) = socket.send(serialized, 0) { + warn!( + "Failed to publish {:?} metrics: {}", + envelope.agent_type, error + ); + } else { + debug!( + "Published {:?} metrics for host {}", + envelope.agent_type, envelope.hostname + ); + } + } + Err(error) => { + warn!("Failed to serialize metrics envelope: {}", error); + } + } + } + + info!("Metrics publisher task shutting down"); + }); + + Ok(handle) + } + + async fn start_health_monitoring_task( + &self, + ) -> Result, Box> { + let health_checker = self.health_checker.as_ref().unwrap().clone(); + + let task = tokio::spawn(async move { + info!("Starting health monitoring task"); + let mut health_interval = interval(Duration::from_secs(60)); // Check every minute + + loop { + health_interval.tick().await; + + match health_checker.check_health().await { + HealthStatus::Healthy => { + debug!("All collectors are healthy"); + } + HealthStatus::Degraded { + degraded_collectors, + } => { + warn!("Degraded collectors: {:?}", degraded_collectors); + } + HealthStatus::Unhealthy { + unhealthy_collectors, + degraded_collectors, + } => { + error!( + "Unhealthy collectors: {:?}, Degraded: {:?}", + unhealthy_collectors, degraded_collectors + ); + } + } + } + }); + + Ok(task) + } + + pub async fn shutdown(&self) { + info!("Initiating graceful shutdown..."); + self.scheduler.shutdown().await; + + // ZMQ socket will be dropped automatically + + info!("Agent shutdown completed"); + } +} + +impl Drop for MetricsAgent { + fn drop(&mut self) { + // ZMQ socket will be dropped automatically + } +} diff --git a/agent/src/collectors/backup.rs b/agent/src/collectors/backup.rs new file mode 100644 index 0000000..f149af5 --- /dev/null +++ b/agent/src/collectors/backup.rs @@ -0,0 +1,388 @@ +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use 
serde_json::json; +use std::process::Stdio; +use std::time::Duration; +use tokio::process::Command; +use tokio::time::timeout; + +use super::{AgentType, Collector, CollectorError, CollectorOutput}; + +#[derive(Debug, Clone)] +pub struct BackupCollector { + pub enabled: bool, + pub interval: Duration, + pub restic_repo: Option, + pub backup_service: String, + pub timeout_ms: u64, +} + +impl BackupCollector { + pub fn new( + enabled: bool, + interval_ms: u64, + restic_repo: Option, + backup_service: String, + ) -> Self { + Self { + enabled, + interval: Duration::from_millis(interval_ms), + restic_repo, + backup_service, + timeout_ms: 30000, // 30 second timeout for backup operations + } + } + + async fn get_restic_snapshots(&self) -> Result { + let repo = self + .restic_repo + .as_ref() + .ok_or_else(|| CollectorError::ConfigError { + message: "No restic repository configured".to_string(), + })?; + + let timeout_duration = Duration::from_millis(self.timeout_ms); + + // Get restic snapshots + let output = timeout( + timeout_duration, + Command::new("restic") + .args(["-r", repo, "snapshots", "--json"]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output(), + ) + .await + .map_err(|_| CollectorError::Timeout { + duration_ms: self.timeout_ms, + })? 
+ .map_err(|e| CollectorError::CommandFailed { + command: format!("restic -r {} snapshots --json", repo), + message: e.to_string(), + })?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(CollectorError::CommandFailed { + command: format!("restic -r {} snapshots --json", repo), + message: stderr.to_string(), + }); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + let snapshots: Vec = + serde_json::from_str(&stdout).map_err(|e| CollectorError::ParseError { + message: format!("Failed to parse restic snapshots: {}", e), + })?; + + // Get repository stats + let stats_output = timeout( + timeout_duration, + Command::new("restic") + .args(["-r", repo, "stats", "--json"]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output(), + ) + .await + .map_err(|_| CollectorError::Timeout { + duration_ms: self.timeout_ms, + })? + .map_err(|e| CollectorError::CommandFailed { + command: format!("restic -r {} stats --json", repo), + message: e.to_string(), + })?; + + let repo_size_gb = if stats_output.status.success() { + let stats_stdout = String::from_utf8_lossy(&stats_output.stdout); + let stats: Result = serde_json::from_str(&stats_stdout); + stats + .ok() + .map(|s| s.total_size as f32 / (1024.0 * 1024.0 * 1024.0)) + .unwrap_or(0.0) + } else { + 0.0 + }; + + // Find most recent snapshot + let last_success = snapshots.iter().map(|s| s.time).max(); + + Ok(ResticStats { + total_size: (repo_size_gb * 1024.0 * 1024.0 * 1024.0) as u64, + snapshot_count: snapshots.len() as u32, + last_success, + }) + } + + async fn get_backup_service_status(&self) -> Result { + let timeout_duration = Duration::from_millis(self.timeout_ms); + + // Get systemctl status for backup service + let status_output = timeout( + timeout_duration, + Command::new("systemctl") + .args([ + "show", + &self.backup_service, + "--property=ActiveState,SubState,MainPID", + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output(), + ) + 
.await + .map_err(|_| CollectorError::Timeout { + duration_ms: self.timeout_ms, + })? + .map_err(|e| CollectorError::CommandFailed { + command: format!("systemctl show {}", self.backup_service), + message: e.to_string(), + })?; + + let enabled = if status_output.status.success() { + let status_stdout = String::from_utf8_lossy(&status_output.stdout); + status_stdout.contains("ActiveState=active") + || status_stdout.contains("SubState=running") + } else { + false + }; + + // Check for backup timer or service logs for last message + let last_message = self.get_last_backup_log_message().await.ok(); + + // Check for pending backup jobs (simplified - could check systemd timers) + let pending_jobs = 0; // TODO: Implement proper pending job detection + + Ok(BackupServiceData { + enabled, + pending_jobs, + last_message, + }) + } + + async fn get_last_backup_log_message(&self) -> Result { + let output = Command::new("journalctl") + .args([ + "-u", + &self.backup_service, + "--lines=1", + "--no-pager", + "--output=cat", + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .await + .map_err(|e| CollectorError::CommandFailed { + command: format!("journalctl -u {} --lines=1", self.backup_service), + message: e.to_string(), + })?; + + if output.status.success() { + let stdout = String::from_utf8_lossy(&output.stdout); + let message = stdout.trim().to_string(); + if !message.is_empty() { + return Ok(message); + } + } + + Err(CollectorError::ParseError { + message: "No log messages found".to_string(), + }) + } + + async fn get_backup_logs_for_failures(&self) -> Result>, CollectorError> { + let output = Command::new("journalctl") + .args([ + "-u", + &self.backup_service, + "--since", + "1 week ago", + "--grep=failed\\|error\\|ERROR", + "--output=json", + "--lines=1", + ]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .await + .map_err(|e| CollectorError::CommandFailed { + command: format!( + "journalctl -u {} --since='1 week ago' --grep=failed", 
+ self.backup_service + ), + message: e.to_string(), + })?; + + if output.status.success() { + let stdout = String::from_utf8_lossy(&output.stdout); + if let Ok(log_entry) = serde_json::from_str::(&stdout) { + if let Ok(timestamp) = log_entry.realtime_timestamp.parse::() { + let dt = + DateTime::from_timestamp_micros(timestamp).unwrap_or_else(|| Utc::now()); + return Ok(Some(dt)); + } + } + } + + Ok(None) + } + + fn determine_backup_status( + &self, + restic_stats: &Result, + service_data: &BackupServiceData, + last_failure: Option>, + ) -> BackupStatus { + match restic_stats { + Ok(stats) => { + if let Some(last_success) = stats.last_success { + let hours_since_backup = + Utc::now().signed_duration_since(last_success).num_hours(); + + if hours_since_backup > 48 { + BackupStatus::Warning // More than 2 days since last backup + } else if let Some(failure) = last_failure { + if failure > last_success { + BackupStatus::Failed // Failure after last success + } else { + BackupStatus::Healthy + } + } else { + BackupStatus::Healthy + } + } else { + BackupStatus::Warning // No successful backups found + } + } + Err(_) => { + if service_data.enabled { + BackupStatus::Failed // Service enabled but can't access repo + } else { + BackupStatus::Unknown // Service disabled + } + } + } + } +} + +#[async_trait] +impl Collector for BackupCollector { + fn name(&self) -> &str { + "backup" + } + + fn agent_type(&self) -> AgentType { + AgentType::Backup + } + + fn collect_interval(&self) -> Duration { + self.interval + } + + fn is_enabled(&self) -> bool { + self.enabled + } + + fn requires_root(&self) -> bool { + false // Depends on restic repo permissions + } + + async fn collect(&self) -> Result { + // Get restic repository stats + let restic_stats = self.get_restic_snapshots().await; + + // Get backup service status + let service_data = self + .get_backup_service_status() + .await + .unwrap_or(BackupServiceData { + enabled: false, + pending_jobs: 0, + last_message: None, + }); + + 
// Check for recent failures + let last_failure = self.get_backup_logs_for_failures().await.unwrap_or(None); + + // Determine overall backup status + let overall_status = + self.determine_backup_status(&restic_stats, &service_data, last_failure); + + let (backup_info, _size_gb) = match &restic_stats { + Ok(stats) => ( + BackupInfo { + last_success: stats.last_success, + last_failure, + size_gb: stats.total_size as f32 / (1024.0 * 1024.0 * 1024.0), + snapshot_count: stats.snapshot_count, + }, + stats.total_size as f32 / (1024.0 * 1024.0 * 1024.0), + ), + Err(_) => ( + BackupInfo { + last_success: None, + last_failure, + size_gb: 0.0, + snapshot_count: 0, + }, + 0.0, + ), + }; + + let backup_metrics = json!({ + "overall_status": overall_status, + "backup": backup_info, + "service": service_data, + "timestamp": Utc::now() + }); + + Ok(CollectorOutput { + agent_type: AgentType::Backup, + data: backup_metrics, + timestamp: Utc::now(), + }) + } +} + +#[derive(Debug, Deserialize)] +struct ResticSnapshot { + time: DateTime, +} + +#[derive(Debug, Deserialize)] +struct ResticStats { + total_size: u64, + snapshot_count: u32, + last_success: Option>, +} + +#[derive(Debug, Serialize)] +struct BackupServiceData { + enabled: bool, + pending_jobs: u32, + last_message: Option, +} + +#[derive(Debug, Serialize)] +struct BackupInfo { + last_success: Option>, + last_failure: Option>, + size_gb: f32, + snapshot_count: u32, +} + +#[derive(Debug, Serialize)] +enum BackupStatus { + Healthy, + Warning, + Failed, + Unknown, +} + +#[derive(Debug, Deserialize)] +struct JournalEntry { + #[serde(rename = "__REALTIME_TIMESTAMP")] + realtime_timestamp: String, +} diff --git a/agent/src/collectors/error.rs b/agent/src/collectors/error.rs new file mode 100644 index 0000000..988ebff --- /dev/null +++ b/agent/src/collectors/error.rs @@ -0,0 +1,53 @@ +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum CollectorError { + #[error("Command execution failed: {command} - {message}")] + CommandFailed 
{ command: String, message: String }, + + #[error("Permission denied: {message}")] + PermissionDenied { message: String }, + + #[error("Data parsing error: {message}")] + ParseError { message: String }, + + #[error("Timeout after {duration_ms}ms")] + Timeout { duration_ms: u64 }, + + #[error("IO error: {message}")] + IoError { message: String }, + + #[error("Configuration error: {message}")] + ConfigError { message: String }, + + #[error("Service not found: {service}")] + ServiceNotFound { service: String }, + + #[error("Device not found: {device}")] + DeviceNotFound { device: String }, + + #[error("External dependency error: {dependency} - {message}")] + ExternalDependency { dependency: String, message: String }, +} + +impl From for CollectorError { + fn from(err: std::io::Error) -> Self { + CollectorError::IoError { + message: err.to_string(), + } + } +} + +impl From for CollectorError { + fn from(err: serde_json::Error) -> Self { + CollectorError::ParseError { + message: err.to_string(), + } + } +} + +impl From for CollectorError { + fn from(_: tokio::time::error::Elapsed) -> Self { + CollectorError::Timeout { duration_ms: 0 } + } +} diff --git a/agent/src/collectors/mod.rs b/agent/src/collectors/mod.rs new file mode 100644 index 0000000..b81d03c --- /dev/null +++ b/agent/src/collectors/mod.rs @@ -0,0 +1,49 @@ +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use serde_json::Value; +use std::time::Duration; + +pub mod backup; +pub mod error; +pub mod service; +pub mod smart; + +pub use error::CollectorError; + +#[derive(Debug, Clone)] +pub enum AgentType { + Smart, + Service, + Backup, +} + +impl AgentType { + pub fn as_str(&self) -> &'static str { + match self { + AgentType::Smart => "smart", + AgentType::Service => "service", + AgentType::Backup => "backup", + } + } +} + +#[derive(Debug, Clone)] +pub struct CollectorOutput { + pub agent_type: AgentType, + pub data: Value, + pub timestamp: DateTime, +} + +#[async_trait] +pub trait Collector: Send + 
Sync { + fn name(&self) -> &str; + fn agent_type(&self) -> AgentType; + fn collect_interval(&self) -> Duration; + async fn collect(&self) -> Result; + fn is_enabled(&self) -> bool { + true + } + fn requires_root(&self) -> bool { + false + } +} diff --git a/agent/src/collectors/service.rs b/agent/src/collectors/service.rs new file mode 100644 index 0000000..2cbdca5 --- /dev/null +++ b/agent/src/collectors/service.rs @@ -0,0 +1,603 @@ +use async_trait::async_trait; +use chrono::Utc; +use serde::Serialize; +use serde_json::json; +use std::collections::HashMap; +use std::process::Stdio; +use std::time::Duration; +use tokio::fs; +use tokio::process::Command; +use tokio::time::timeout; + +use super::{AgentType, Collector, CollectorError, CollectorOutput}; + +#[derive(Debug, Clone)] +pub struct ServiceCollector { + pub enabled: bool, + pub interval: Duration, + pub services: Vec, + pub timeout_ms: u64, +} + +impl ServiceCollector { + pub fn new(enabled: bool, interval_ms: u64, services: Vec) -> Self { + Self { + enabled, + interval: Duration::from_millis(interval_ms), + services, + timeout_ms: 10000, // 10 second timeout for service checks + } + } + + async fn get_service_status(&self, service: &str) -> Result { + let timeout_duration = Duration::from_millis(self.timeout_ms); + + // Get systemctl status + let status_output = timeout( + timeout_duration, + Command::new("systemctl") + .args(["show", service, "--property=ActiveState,SubState,MainPID"]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output(), + ) + .await + .map_err(|_| CollectorError::Timeout { + duration_ms: self.timeout_ms, + })? 
+ .map_err(|e| CollectorError::CommandFailed { + command: format!("systemctl show {}", service), + message: e.to_string(), + })?; + + if !status_output.status.success() { + return Err(CollectorError::ServiceNotFound { + service: service.to_string(), + }); + } + + let status_stdout = String::from_utf8_lossy(&status_output.stdout); + let mut active_state = None; + let mut sub_state = None; + let mut main_pid = None; + + for line in status_stdout.lines() { + if let Some(value) = line.strip_prefix("ActiveState=") { + active_state = Some(value.to_string()); + } else if let Some(value) = line.strip_prefix("SubState=") { + sub_state = Some(value.to_string()); + } else if let Some(value) = line.strip_prefix("MainPID=") { + main_pid = value.parse::().ok(); + } + } + + let status = self.determine_service_status(&active_state, &sub_state); + + // Get resource usage if service is running + let (memory_used_mb, cpu_percent) = if let Some(pid) = main_pid { + self.get_process_resources(pid).await.unwrap_or((0.0, 0.0)) + } else { + (0.0, 0.0) + }; + + // Get memory quota from systemd if available + let memory_quota_mb = self.get_service_memory_limit(service).await.unwrap_or(0.0); + + // Get disk usage for this service + let disk_used_gb = self.get_service_disk_usage(service).await.unwrap_or(0.0); + + Ok(ServiceData { + name: service.to_string(), + status, + memory_used_mb, + memory_quota_mb, + cpu_percent, + sandbox_limit: None, // TODO: Implement sandbox limit detection + disk_used_gb, + }) + } + + fn determine_service_status( + &self, + active_state: &Option, + sub_state: &Option, + ) -> ServiceStatus { + match (active_state.as_deref(), sub_state.as_deref()) { + (Some("active"), Some("running")) => ServiceStatus::Running, + (Some("active"), Some("exited")) => ServiceStatus::Running, // One-shot services + (Some("reloading"), _) | (Some("activating"), _) => ServiceStatus::Restarting, + (Some("failed"), _) | (Some("inactive"), Some("failed")) => ServiceStatus::Stopped, + 
(Some("inactive"), _) => ServiceStatus::Stopped, + _ => ServiceStatus::Degraded, + } + } + + async fn get_process_resources(&self, pid: u32) -> Result<(f32, f32), CollectorError> { + // Read /proc/{pid}/stat for CPU and memory info + let stat_path = format!("/proc/{}/stat", pid); + let stat_content = + fs::read_to_string(&stat_path) + .await + .map_err(|e| CollectorError::IoError { + message: e.to_string(), + })?; + + let stat_fields: Vec<&str> = stat_content.split_whitespace().collect(); + if stat_fields.len() < 24 { + return Err(CollectorError::ParseError { + message: format!("Invalid /proc/{}/stat format", pid), + }); + } + + // Field 23 is RSS (Resident Set Size) in pages + let rss_pages: u64 = stat_fields[23] + .parse() + .map_err(|e| CollectorError::ParseError { + message: format!("Failed to parse RSS from /proc/{}/stat: {}", pid, e), + })?; + + // Convert pages to MB (assuming 4KB pages) + let memory_mb = (rss_pages * 4) as f32 / 1024.0; + + // For CPU, we'd need to track over time - simplified to 0 for now + // TODO: Implement proper CPU percentage calculation + let cpu_percent = 0.0; + + Ok((memory_mb, cpu_percent)) + } + + async fn get_service_disk_usage(&self, service: &str) -> Result { + // For systemd services, check if they have private /var directories or specific data paths + // This is a simplified implementation - could be enhanced to check actual service-specific paths + + // Common service data directories to check + let potential_paths = vec![ + format!("/var/lib/{}", service), + format!("/var/cache/{}", service), + format!("/var/log/{}", service), + format!("/opt/{}", service), + format!("/srv/{}", service), + ]; + + let mut total_usage = 0.0; + + for path in potential_paths { + if let Ok(usage) = self.get_directory_size(&path).await { + total_usage += usage; + } + } + + Ok(total_usage) + } + + async fn get_directory_size(&self, path: &str) -> Result { + let output = Command::new("du") + .args(["-s", "-k", path]) // Use kilobytes instead of 
forcing GB + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .await + .map_err(|e| CollectorError::CommandFailed { + command: format!("du -s -k {}", path), + message: e.to_string(), + })?; + + if !output.status.success() { + // Directory doesn't exist or permission denied - return 0 + return Ok(0.0); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + if let Some(line) = stdout.lines().next() { + if let Some(size_str) = line.split_whitespace().next() { + let size_kb = size_str.parse::().unwrap_or(0.0); + let size_gb = size_kb / (1024.0 * 1024.0); // Convert KB to GB + return Ok(size_gb); + } + } + + Ok(0.0) + } + + async fn get_service_memory_limit(&self, service: &str) -> Result { + let output = Command::new("systemctl") + .args(["show", service, "--property=MemoryMax"]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .await + .map_err(|e| CollectorError::CommandFailed { + command: format!("systemctl show {} --property=MemoryMax", service), + message: e.to_string(), + })?; + + let stdout = String::from_utf8_lossy(&output.stdout); + for line in stdout.lines() { + if let Some(value) = line.strip_prefix("MemoryMax=") { + if value == "infinity" { + return Ok(0.0); // No limit + } + if let Ok(bytes) = value.parse::() { + return Ok(bytes as f32 / (1024.0 * 1024.0)); // Convert to MB + } + } + } + + Ok(0.0) // No limit or couldn't parse + } + + async fn get_system_memory_info(&self) -> Result { + let meminfo = + fs::read_to_string("/proc/meminfo") + .await + .map_err(|e| CollectorError::IoError { + message: e.to_string(), + })?; + + let mut memory_info = HashMap::new(); + for line in meminfo.lines() { + if let Some((key, value)) = line.split_once(':') { + let value = value.trim().trim_end_matches(" kB"); + if let Ok(kb) = value.parse::() { + memory_info.insert(key.to_string(), kb); + } + } + } + + let total_kb = memory_info.get("MemTotal").copied().unwrap_or(0); + let available_kb = 
memory_info.get("MemAvailable").copied().unwrap_or(0); + let used_kb = total_kb.saturating_sub(available_kb); + + Ok(SystemMemoryInfo { + total_mb: total_kb as f32 / 1024.0, + used_mb: used_kb as f32 / 1024.0, + }) + } + + async fn get_disk_usage(&self) -> Result { + let output = Command::new("df") + .args(["-BG", "--output=size,used,avail", "/"]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .await + .map_err(|e| CollectorError::CommandFailed { + command: "df -BG --output=size,used,avail /".to_string(), + message: e.to_string(), + })?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(CollectorError::CommandFailed { + command: "df -BG --output=size,used,avail /".to_string(), + message: stderr.to_string(), + }); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + let lines: Vec<&str> = stdout.lines().collect(); + + if lines.len() < 2 { + return Err(CollectorError::ParseError { + message: "Unexpected df output format".to_string(), + }); + } + + let data_line = lines[1].trim(); + let parts: Vec<&str> = data_line.split_whitespace().collect(); + if parts.len() < 3 { + return Err(CollectorError::ParseError { + message: format!("Unexpected df data format: {}", data_line), + }); + } + + let parse_size = |s: &str| -> Result { + s.trim_end_matches('G') + .parse::() + .map_err(|e| CollectorError::ParseError { + message: format!("Failed to parse disk size '{}': {}", s, e), + }) + }; + + Ok(DiskUsage { + total_gb: parse_size(parts[0])?, + used_gb: parse_size(parts[1])?, + }) + } + + async fn get_cpu_load(&self) -> Result<(f32, f32, f32), CollectorError> { + let loadavg = + fs::read_to_string("/proc/loadavg") + .await + .map_err(|e| CollectorError::IoError { + message: e.to_string(), + })?; + + let parts: Vec<&str> = loadavg.split_whitespace().collect(); + if parts.len() < 3 { + return Err(CollectorError::ParseError { + message: "Unexpected /proc/loadavg format".to_string(), + }); + } + + let 
parse = |s: &str| -> Result { + s.parse::().map_err(|e| CollectorError::ParseError { + message: format!("Failed to parse load average '{}': {}", s, e), + }) + }; + + Ok((parse(parts[0])?, parse(parts[1])?, parse(parts[2])?)) + } + + async fn get_cpu_frequency_mhz(&self) -> Option { + let candidates = [ + "/sys/devices/system/cpu/cpufreq/policy0/scaling_cur_freq", + "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", + ]; + + for path in candidates { + if let Ok(content) = fs::read_to_string(path).await { + if let Ok(khz) = content.trim().parse::() { + if khz > 0.0 { + return Some(khz / 1000.0); + } + } + } + } + + if let Ok(content) = fs::read_to_string("/proc/cpuinfo").await { + for line in content.lines() { + if let Some(rest) = line.strip_prefix("cpu MHz") { + if let Some(value) = rest.split(':').nth(1) { + if let Ok(mhz) = value.trim().parse::() { + if mhz > 0.0 { + return Some(mhz); + } + } + } + } + } + } + + None + } + + async fn get_cpu_temperature_c(&self) -> Option { + let mut entries = fs::read_dir("/sys/class/thermal").await.ok()?; + let mut fallback: Option = None; + + while let Ok(Some(entry)) = entries.next_entry().await { + let path = entry.path(); + let type_path = path.join("type"); + let temp_path = path.join("temp"); + + let label = fs::read_to_string(&type_path).await.ok()?.to_lowercase(); + let raw = match fs::read_to_string(&temp_path).await { + Ok(value) => value, + Err(_) => continue, + }; + + let milli: f32 = match raw.trim().parse() { + Ok(value) => value, + Err(_) => continue, + }; + + let temp_c = milli / 1000.0; + if label.contains("cpu") || label.contains("pkg") { + if temp_c > 0.0 { + return Some(temp_c); + } + } + + if fallback.is_none() && temp_c > 0.0 { + fallback = Some(temp_c); + } + } + + fallback + } + + async fn get_gpu_metrics(&self) -> (Option, Option) { + let output = Command::new("nvidia-smi") + .args([ + "--query-gpu=utilization.gpu,temperature.gpu", + "--format=csv,noheader,nounits", + ]) + .stdout(Stdio::piped()) 
+ .stderr(Stdio::piped()) + .output() + .await; + + match output { + Ok(result) if result.status.success() => { + let stdout = String::from_utf8_lossy(&result.stdout); + if let Some(line) = stdout.lines().next() { + let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect(); + if parts.len() >= 2 { + let load = parts[0].parse::().ok(); + let temp = parts[1].parse::().ok(); + return (load, temp); + } + } + (None, None) + } + Ok(_) | Err(_) => { + let util_output = Command::new("/opt/vc/bin/vcgencmd") + .arg("measure_temp") + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .await; + + if let Ok(result) = util_output { + if result.status.success() { + let stdout = String::from_utf8_lossy(&result.stdout); + if let Some(value) = stdout + .trim() + .strip_prefix("temp=") + .and_then(|s| s.strip_suffix("'C")) + { + if let Ok(temp_c) = value.parse::() { + return (None, Some(temp_c)); + } + } + } + } + + (None, None) + } + } + } +} + +#[async_trait] +impl Collector for ServiceCollector { + fn name(&self) -> &str { + "service" + } + + fn agent_type(&self) -> AgentType { + AgentType::Service + } + + fn collect_interval(&self) -> Duration { + self.interval + } + + fn is_enabled(&self) -> bool { + self.enabled + } + + fn requires_root(&self) -> bool { + false // Most systemctl commands work without root + } + + async fn collect(&self) -> Result { + let mut services = Vec::new(); + let mut healthy = 0; + let mut degraded = 0; + let mut failed = 0; + let mut total_memory_used = 0.0; + let mut total_memory_quota = 0.0; + let mut total_disk_used = 0.0; + + // Collect data from all configured services + for service in &self.services { + match self.get_service_status(service).await { + Ok(service_data) => { + match service_data.status { + ServiceStatus::Running => healthy += 1, + ServiceStatus::Degraded | ServiceStatus::Restarting => degraded += 1, + ServiceStatus::Stopped => failed += 1, + } + + total_memory_used += service_data.memory_used_mb; + if 
service_data.memory_quota_mb > 0.0 { + total_memory_quota += service_data.memory_quota_mb; + } + total_disk_used += service_data.disk_used_gb; + + services.push(service_data); + } + Err(e) => { + failed += 1; + // Add a placeholder service entry for failed collection + services.push(ServiceData { + name: service.clone(), + status: ServiceStatus::Stopped, + memory_used_mb: 0.0, + memory_quota_mb: 0.0, + cpu_percent: 0.0, + sandbox_limit: None, + disk_used_gb: 0.0, + }); + tracing::warn!("Failed to collect metrics for service {}: {}", service, e); + } + } + } + + // Get system memory info for quota calculation + let system_memory = self + .get_system_memory_info() + .await + .unwrap_or(SystemMemoryInfo { + total_mb: 0.0, + used_mb: 0.0, + }); + + let _disk_usage = self.get_disk_usage().await.unwrap_or(DiskUsage { + total_gb: 0.0, + used_gb: 0.0, + }); + + let (cpu_load_1, cpu_load_5, cpu_load_15) = + self.get_cpu_load().await.unwrap_or((0.0, 0.0, 0.0)); + let cpu_freq_mhz = self.get_cpu_frequency_mhz().await; + let cpu_temp_c = self.get_cpu_temperature_c().await; + let (gpu_load_percent, gpu_temp_c) = self.get_gpu_metrics().await; + + // If no specific quotas are set, use system memory as reference + if total_memory_quota == 0.0 { + total_memory_quota = system_memory.total_mb; + } + + let service_metrics = json!({ + "summary": { + "healthy": healthy, + "degraded": degraded, + "failed": failed, + "memory_used_mb": total_memory_used, + "memory_quota_mb": total_memory_quota, + "system_memory_used_mb": system_memory.used_mb, + "system_memory_total_mb": system_memory.total_mb, + "disk_used_gb": total_disk_used, + "disk_total_gb": total_disk_used, // For services, total = used (no quota concept) + "cpu_load_1": cpu_load_1, + "cpu_load_5": cpu_load_5, + "cpu_load_15": cpu_load_15, + "cpu_freq_mhz": cpu_freq_mhz, + "cpu_temp_c": cpu_temp_c, + "gpu_load_percent": gpu_load_percent, + "gpu_temp_c": gpu_temp_c, + }, + "services": services, + "timestamp": Utc::now() + }); + + 
Ok(CollectorOutput { + agent_type: AgentType::Service, + data: service_metrics, + timestamp: Utc::now(), + }) + } +} + +#[derive(Debug, Clone, Serialize)] +struct ServiceData { + name: String, + status: ServiceStatus, + memory_used_mb: f32, + memory_quota_mb: f32, + cpu_percent: f32, + sandbox_limit: Option, + disk_used_gb: f32, +} + +#[derive(Debug, Clone, Serialize)] +enum ServiceStatus { + Running, + Degraded, + Restarting, + Stopped, +} + +struct SystemMemoryInfo { + total_mb: f32, + used_mb: f32, +} + +#[allow(dead_code)] +struct DiskUsage { + total_gb: f32, + used_gb: f32, +} diff --git a/agent/src/collectors/smart.rs b/agent/src/collectors/smart.rs new file mode 100644 index 0000000..8864864 --- /dev/null +++ b/agent/src/collectors/smart.rs @@ -0,0 +1,447 @@ +use async_trait::async_trait; +use chrono::Utc; +use serde::{Deserialize, Serialize}; +use serde_json::json; +use std::io::ErrorKind; +use std::process::Stdio; +use std::time::Duration; +use tokio::process::Command; +use tokio::time::timeout; + +use super::{AgentType, Collector, CollectorError, CollectorOutput}; + +#[derive(Debug, Clone)] +pub struct SmartCollector { + pub enabled: bool, + pub interval: Duration, + pub devices: Vec, + pub timeout_ms: u64, +} + +impl SmartCollector { + pub fn new(enabled: bool, interval_ms: u64, devices: Vec) -> Self { + Self { + enabled, + interval: Duration::from_millis(interval_ms), + devices, + timeout_ms: 30000, // 30 second timeout for smartctl + } + } + + async fn get_smart_data(&self, device: &str) -> Result { + let timeout_duration = Duration::from_millis(self.timeout_ms); + + let command_result = timeout( + timeout_duration, + Command::new("smartctl") + .args(["-a", "-j", &format!("/dev/{}", device)]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output(), + ) + .await + .map_err(|_| CollectorError::Timeout { + duration_ms: self.timeout_ms, + })?; + + let output = command_result.map_err(|e| match e.kind() { + ErrorKind::NotFound => 
CollectorError::ExternalDependency { + dependency: "smartctl".to_string(), + message: e.to_string(), + }, + ErrorKind::PermissionDenied => CollectorError::PermissionDenied { + message: e.to_string(), + }, + _ => CollectorError::CommandFailed { + command: format!("smartctl -a -j /dev/{}", device), + message: e.to_string(), + }, + })?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + let stderr_lower = stderr.to_lowercase(); + + if stderr_lower.contains("permission denied") { + return Err(CollectorError::PermissionDenied { + message: stderr.to_string(), + }); + } + + if stderr_lower.contains("no such device") || stderr_lower.contains("cannot open") { + return Err(CollectorError::DeviceNotFound { + device: device.to_string(), + }); + } + + return Err(CollectorError::CommandFailed { + command: format!("smartctl -a -j /dev/{}", device), + message: stderr.to_string(), + }); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + let smart_output: SmartCtlOutput = + serde_json::from_str(&stdout).map_err(|e| CollectorError::ParseError { + message: format!("Failed to parse smartctl output for {}: {}", device, e), + })?; + + Ok(SmartDeviceData::from_smartctl_output(device, smart_output)) + } + + async fn get_drive_usage(&self, device: &str) -> Result<(Option, Option), CollectorError> { + // Get capacity first + let capacity = match self.get_drive_capacity(device).await { + Ok(cap) => Some(cap), + Err(_) => None, + }; + + // Try to get usage information + // For simplicity, we'll use the root filesystem usage for now + // In the future, this could be enhanced to map drives to specific mount points + let usage = if device.contains("nvme0n1") || device.contains("sda") { + // This is likely the main system drive, use root filesystem usage + match self.get_disk_usage().await { + Ok(disk_usage) => Some(disk_usage.used_gb), + Err(_) => None, + } + } else { + // For other drives, we don't have usage info yet + None + }; + + 
Ok((capacity, usage)) + } + + async fn get_drive_capacity(&self, device: &str) -> Result { + let output = Command::new("lsblk") + .args(["-J", "-o", "NAME,SIZE", &format!("/dev/{}", device)]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .await + .map_err(|e| CollectorError::CommandFailed { + command: format!("lsblk -J -o NAME,SIZE /dev/{}", device), + message: e.to_string(), + })?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(CollectorError::CommandFailed { + command: format!("lsblk -J -o NAME,SIZE /dev/{}", device), + message: stderr.to_string(), + }); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + let lsblk_output: serde_json::Value = serde_json::from_str(&stdout) + .map_err(|e| CollectorError::ParseError { + message: format!("Failed to parse lsblk JSON: {}", e), + })?; + + // Extract size from the first blockdevice + if let Some(blockdevices) = lsblk_output["blockdevices"].as_array() { + if let Some(device_info) = blockdevices.first() { + if let Some(size_str) = device_info["size"].as_str() { + return self.parse_lsblk_size(size_str); + } + } + } + + Err(CollectorError::ParseError { + message: format!("No size information found for device {}", device), + }) + } + + fn parse_lsblk_size(&self, size_str: &str) -> Result { + // Parse sizes like "953,9G", "1T", "512M" + let size_str = size_str.replace(',', "."); // Handle European decimal separator + + if let Some(pos) = size_str.find(|c: char| c.is_alphabetic()) { + let (number_part, unit_part) = size_str.split_at(pos); + let number: f32 = number_part.parse() + .map_err(|e| CollectorError::ParseError { + message: format!("Failed to parse size number '{}': {}", number_part, e), + })?; + + let multiplier = match unit_part.to_uppercase().as_str() { + "T" | "TB" => 1024.0, + "G" | "GB" => 1.0, + "M" | "MB" => 1.0 / 1024.0, + "K" | "KB" => 1.0 / (1024.0 * 1024.0), + _ => return Err(CollectorError::ParseError { + message: 
format!("Unknown size unit: {}", unit_part), + }), + }; + + Ok(number * multiplier) + } else { + Err(CollectorError::ParseError { + message: format!("Invalid size format: {}", size_str), + }) + } + } + + async fn get_disk_usage(&self) -> Result { + let output = Command::new("df") + .args(["-BG", "--output=size,used,avail", "/"]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .await + .map_err(|e| CollectorError::CommandFailed { + command: "df -BG --output=size,used,avail /".to_string(), + message: e.to_string(), + })?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(CollectorError::CommandFailed { + command: "df -BG --output=size,used,avail /".to_string(), + message: stderr.to_string(), + }); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + let lines: Vec<&str> = stdout.lines().collect(); + + if lines.len() < 2 { + return Err(CollectorError::ParseError { + message: "Unexpected df output format".to_string(), + }); + } + + // Skip header line, parse data line + let data_line = lines[1].trim(); + let parts: Vec<&str> = data_line.split_whitespace().collect(); + + if parts.len() < 3 { + return Err(CollectorError::ParseError { + message: format!("Unexpected df data format: {}", data_line), + }); + } + + let parse_size = |s: &str| -> Result { + s.trim_end_matches('G') + .parse::() + .map_err(|e| CollectorError::ParseError { + message: format!("Failed to parse disk size '{}': {}", s, e), + }) + }; + + Ok(DiskUsage { + total_gb: parse_size(parts[0])?, + used_gb: parse_size(parts[1])?, + available_gb: parse_size(parts[2])?, + }) + } +} + +#[async_trait] +impl Collector for SmartCollector { + fn name(&self) -> &str { + "smart" + } + + fn agent_type(&self) -> AgentType { + AgentType::Smart + } + + fn collect_interval(&self) -> Duration { + self.interval + } + + fn is_enabled(&self) -> bool { + self.enabled + } + + fn requires_root(&self) -> bool { + true // smartctl typically requires root 
access + } + + async fn collect(&self) -> Result { + let mut drives = Vec::new(); + let mut issues = Vec::new(); + let mut healthy = 0; + let mut warning = 0; + let mut critical = 0; + + // Collect data from all configured devices + for device in &self.devices { + match self.get_smart_data(device).await { + Ok(mut drive_data) => { + // Try to get capacity and usage for this drive + if let Ok((capacity, usage)) = self.get_drive_usage(device).await { + drive_data.capacity_gb = capacity; + drive_data.used_gb = usage; + } + match drive_data.health_status.as_str() { + "PASSED" => healthy += 1, + "FAILED" => { + critical += 1; + issues.push(format!("{}: SMART status FAILED", device)); + } + _ => { + warning += 1; + issues.push(format!("{}: Unknown SMART status", device)); + } + } + drives.push(drive_data); + } + Err(e) => { + warning += 1; + issues.push(format!("{}: {}", device, e)); + } + } + } + + // Get disk usage information + let disk_usage = self.get_disk_usage().await?; + + let status = if critical > 0 { + "CRITICAL" + } else if warning > 0 { + "WARNING" + } else { + "HEALTHY" + }; + + let smart_metrics = json!({ + "status": status, + "drives": drives, + "summary": { + "healthy": healthy, + "warning": warning, + "critical": critical, + "capacity_total_gb": disk_usage.total_gb, + "capacity_used_gb": disk_usage.used_gb, + "capacity_available_gb": disk_usage.available_gb + }, + "issues": issues, + "timestamp": Utc::now() + }); + + Ok(CollectorOutput { + agent_type: AgentType::Smart, + data: smart_metrics, + timestamp: Utc::now(), + }) + } +} + +#[derive(Debug, Clone, Serialize)] +struct SmartDeviceData { + name: String, + temperature_c: f32, + wear_level: f32, + power_on_hours: u64, + available_spare: f32, + health_status: String, + capacity_gb: Option, + used_gb: Option, +} + +impl SmartDeviceData { + fn from_smartctl_output(device: &str, output: SmartCtlOutput) -> Self { + let temperature_c = output.temperature.and_then(|t| t.current).unwrap_or(0.0); + + let 
wear_level = output + .nvme_smart_health_information_log + .as_ref() + .and_then(|nvme| nvme.percentage_used) + .unwrap_or(0.0); + + let power_on_hours = output.power_on_time.and_then(|p| p.hours).unwrap_or(0); + + let available_spare = output + .nvme_smart_health_information_log + .as_ref() + .and_then(|nvme| nvme.available_spare) + .unwrap_or(100.0); + + let health_status = output + .smart_status + .and_then(|s| s.passed) + .map(|passed| { + if passed { + "PASSED".to_string() + } else { + "FAILED".to_string() + } + }) + .unwrap_or_else(|| "UNKNOWN".to_string()); + + Self { + name: device.to_string(), + temperature_c, + wear_level, + power_on_hours, + available_spare, + health_status, + capacity_gb: None, // Will be set later by the collector + used_gb: None, // Will be set later by the collector + } + } +} + +#[derive(Debug, Clone)] +struct DiskUsage { + total_gb: f32, + used_gb: f32, + available_gb: f32, +} + +// Minimal smartctl JSON output structure - only the fields we need +#[derive(Debug, Deserialize)] +struct SmartCtlOutput { + temperature: Option, + power_on_time: Option, + smart_status: Option, + nvme_smart_health_information_log: Option, +} + +#[derive(Debug, Deserialize)] +struct Temperature { + current: Option, +} + +#[derive(Debug, Deserialize)] +struct PowerOnTime { + hours: Option, +} + +#[derive(Debug, Deserialize)] +struct SmartStatus { + passed: Option, +} + +#[derive(Debug, Deserialize)] +struct NvmeSmartLog { + percentage_used: Option, + available_spare: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_lsblk_size() { + let collector = SmartCollector::new(true, 5000, vec![]); + + // Test gigabyte sizes + assert!((collector.parse_lsblk_size("953,9G").unwrap() - 953.9).abs() < 0.1); + assert!((collector.parse_lsblk_size("1G").unwrap() - 1.0).abs() < 0.1); + + // Test terabyte sizes + assert!((collector.parse_lsblk_size("1T").unwrap() - 1024.0).abs() < 0.1); + assert!((collector.parse_lsblk_size("2,5T").unwrap() 
- 2560.0).abs() < 0.1); + + // Test megabyte sizes + assert!((collector.parse_lsblk_size("512M").unwrap() - 0.5).abs() < 0.1); + + // Test error cases + assert!(collector.parse_lsblk_size("invalid").is_err()); + assert!(collector.parse_lsblk_size("1X").is_err()); + } +} diff --git a/agent/src/config.rs b/agent/src/config.rs new file mode 100644 index 0000000..ec2209a --- /dev/null +++ b/agent/src/config.rs @@ -0,0 +1,315 @@ +use serde::{Deserialize, Serialize}; +use std::path::Path; +use tokio::fs; +use tracing::info; + +use crate::collectors::CollectorError; +use crate::discovery::AutoDiscovery; + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct AgentConfig { + pub agent: AgentSettings, + pub zmq: ZmqSettings, + pub collectors: CollectorsConfig, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct AgentSettings { + pub hostname: String, + pub log_level: String, + pub metrics_buffer_size: usize, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct ZmqSettings { + pub port: u16, + pub bind_address: String, + pub send_timeout_ms: u64, + pub receive_timeout_ms: u64, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct CollectorsConfig { + pub smart: SmartCollectorConfig, + pub service: ServiceCollectorConfig, + pub backup: BackupCollectorConfig, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct SmartCollectorConfig { + pub enabled: bool, + pub interval_ms: u64, + pub devices: Vec, + pub timeout_ms: u64, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct ServiceCollectorConfig { + pub enabled: bool, + pub interval_ms: u64, + pub services: Vec, + pub timeout_ms: u64, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct BackupCollectorConfig { + pub enabled: bool, + pub interval_ms: u64, + pub restic_repo: Option, + pub backup_service: String, + pub timeout_ms: u64, +} + +impl Default for AgentConfig { + fn default() -> Self { + Self { + agent: AgentSettings { + hostname: 
gethostname::gethostname().to_string_lossy().to_string(), + log_level: "info".to_string(), + metrics_buffer_size: 1000, + }, + zmq: ZmqSettings { + port: 6130, + bind_address: "0.0.0.0".to_string(), + send_timeout_ms: 5000, + receive_timeout_ms: 5000, + }, + collectors: CollectorsConfig { + smart: SmartCollectorConfig { + enabled: true, + interval_ms: 5000, + devices: vec!["nvme0n1".to_string()], + timeout_ms: 30000, + }, + service: ServiceCollectorConfig { + enabled: true, + interval_ms: 2000, + services: vec![ + "gitea".to_string(), + "immich".to_string(), + "vaultwarden".to_string(), + "unifi".to_string(), + ], + timeout_ms: 10000, + }, + backup: BackupCollectorConfig { + enabled: true, + interval_ms: 30000, + restic_repo: None, + backup_service: "restic-backup".to_string(), + timeout_ms: 30000, + }, + }, + } + } +} + +impl AgentConfig { + pub async fn load_from_file>(path: P) -> Result { + let content = fs::read_to_string(path) + .await + .map_err(|e| CollectorError::ConfigError { + message: format!("Failed to read config file: {}", e), + })?; + + let config: AgentConfig = + toml::from_str(&content).map_err(|e| CollectorError::ConfigError { + message: format!("Failed to parse config file: {}", e), + })?; + + config.validate()?; + Ok(config) + } + + pub async fn save_to_file>(&self, path: P) -> Result<(), CollectorError> { + let content = toml::to_string_pretty(self).map_err(|e| CollectorError::ConfigError { + message: format!("Failed to serialize config: {}", e), + })?; + + fs::write(path, content) + .await + .map_err(|e| CollectorError::ConfigError { + message: format!("Failed to write config file: {}", e), + })?; + + Ok(()) + } + + pub fn validate(&self) -> Result<(), CollectorError> { + // Validate ZMQ settings + if self.zmq.port == 0 { + return Err(CollectorError::ConfigError { + message: "ZMQ port cannot be 0".to_string(), + }); + } + + // Validate collector intervals + if self.collectors.smart.enabled && self.collectors.smart.interval_ms < 1000 { + return 
Err(CollectorError::ConfigError { + message: "SMART collector interval must be at least 1000ms".to_string(), + }); + } + + if self.collectors.service.enabled && self.collectors.service.interval_ms < 500 { + return Err(CollectorError::ConfigError { + message: "Service collector interval must be at least 500ms".to_string(), + }); + } + + if self.collectors.backup.enabled && self.collectors.backup.interval_ms < 5000 { + return Err(CollectorError::ConfigError { + message: "Backup collector interval must be at least 5000ms".to_string(), + }); + } + + // Validate smart devices + if self.collectors.smart.enabled && self.collectors.smart.devices.is_empty() { + return Err(CollectorError::ConfigError { + message: "SMART collector requires at least one device".to_string(), + }); + } + + // Validate services + if self.collectors.service.enabled && self.collectors.service.services.is_empty() { + return Err(CollectorError::ConfigError { + message: "Service collector requires at least one service".to_string(), + }); + } + + // Validate backup configuration + if self.collectors.backup.enabled { + if self.collectors.backup.restic_repo.is_none() { + tracing::warn!("Backup collector enabled but no restic repository configured"); + } + if self.collectors.backup.backup_service.is_empty() { + return Err(CollectorError::ConfigError { + message: "Backup collector requires a backup service name".to_string(), + }); + } + } + + Ok(()) + } + + pub fn get_enabled_collector_count(&self) -> usize { + let mut count = 0; + if self.collectors.smart.enabled { + count += 1; + } + if self.collectors.service.enabled { + count += 1; + } + if self.collectors.backup.enabled { + count += 1; + } + count + } + + pub async fn auto_configure(&mut self) -> Result<(), CollectorError> { + let hostname = &self.agent.hostname.clone(); + info!("Auto-configuring agent for host: {}", hostname); + + // Auto-detect storage devices + let devices = AutoDiscovery::discover_storage_devices().await; + let valid_devices = 
AutoDiscovery::validate_devices(&devices).await; + + if !valid_devices.is_empty() { + self.collectors.smart.devices = valid_devices; + info!( + "Auto-detected storage devices: {:?}", + self.collectors.smart.devices + ); + } else { + info!("No accessible storage devices found, disabling SMART collector"); + self.collectors.smart.enabled = false; + } + + // Auto-detect services + let services = AutoDiscovery::discover_services().await; + if !services.is_empty() { + self.collectors.service.services = services; + info!( + "Auto-detected services: {:?}", + self.collectors.service.services + ); + } else { + info!("No monitorable services found, using minimal service list"); + self.collectors.service.services = vec!["ssh".to_string()]; + } + + // Auto-detect backup configuration + let (backup_enabled, restic_repo, backup_service) = + AutoDiscovery::discover_backup_config(hostname).await; + + self.collectors.backup.enabled = backup_enabled; + self.collectors.backup.restic_repo = restic_repo; + self.collectors.backup.backup_service = backup_service; + + if backup_enabled { + info!( + "Auto-configured backup monitoring: repo={:?}, service={}", + self.collectors.backup.restic_repo, self.collectors.backup.backup_service + ); + } else { + info!("Backup monitoring disabled for this host"); + } + + // Apply host-specific timing optimizations + self.apply_host_timing_overrides(hostname); + + Ok(()) + } + + fn apply_host_timing_overrides(&mut self, hostname: &str) { + match hostname { + "srv01" => { + // Server host - more frequent monitoring + self.collectors.service.interval_ms = 1000; + self.collectors.smart.interval_ms = 5000; + } + "cmbox" | "labbox" | "simonbox" | "steambox" => { + // Workstation hosts - less frequent monitoring + self.collectors.smart.interval_ms = 10000; + self.collectors.service.interval_ms = 5000; + } + _ => { + // Unknown host - conservative defaults + self.collectors.smart.interval_ms = 10000; + self.collectors.service.interval_ms = 5000; + } + } + + 
info!( + "Applied timing overrides for {}: smart={}ms, service={}ms", + hostname, self.collectors.smart.interval_ms, self.collectors.service.interval_ms + ); + } + + pub fn summary(&self) -> String { + let mut parts = Vec::new(); + + if self.collectors.smart.enabled { + parts.push(format!( + "SMART({} devices)", + self.collectors.smart.devices.len() + )); + } + + if self.collectors.service.enabled { + parts.push(format!( + "Services({} monitored)", + self.collectors.service.services.len() + )); + } + + if self.collectors.backup.enabled { + parts.push("Backup".to_string()); + } + + if parts.is_empty() { + "No collectors enabled".to_string() + } else { + parts.join(", ") + } + } +} diff --git a/agent/src/discovery.rs b/agent/src/discovery.rs new file mode 100644 index 0000000..62a777a --- /dev/null +++ b/agent/src/discovery.rs @@ -0,0 +1,449 @@ +use std::collections::HashSet; +use std::process::Stdio; +use tokio::fs; +use tokio::process::Command; +use tracing::{debug, warn}; + +use crate::collectors::CollectorError; + +pub struct AutoDiscovery; + +impl AutoDiscovery { + /// Auto-detect storage devices suitable for SMART monitoring + pub async fn discover_storage_devices() -> Vec { + let mut devices = Vec::new(); + + // Method 1: Try lsblk to find block devices + if let Ok(lsblk_devices) = Self::discover_via_lsblk().await { + devices.extend(lsblk_devices); + } + + // Method 2: Scan /dev for common device patterns + if devices.is_empty() { + if let Ok(dev_devices) = Self::discover_via_dev_scan().await { + devices.extend(dev_devices); + } + } + + // Method 3: Fallback to common device names + if devices.is_empty() { + devices = Self::fallback_device_names(); + } + + // Remove duplicates and sort + let mut unique_devices: Vec = devices + .into_iter() + .collect::>() + .into_iter() + .collect(); + unique_devices.sort(); + + debug!("Auto-detected storage devices: {:?}", unique_devices); + unique_devices + } + + async fn discover_via_lsblk() -> Result, CollectorError> { + 
/// Returns `true` when `device_name` (a bare name such as `sda`, without the
/// `/dev/` prefix) looks like a whole storage device rather than a
/// partition, controller node or unrelated entry.
///
/// Accepted shapes:
/// * NVMe namespaces: `nvme<digits>n<digits>` (e.g. `nvme0n1`). Controller
///   nodes (`nvme0`), partitions (`nvme0n1p1`) and `nvme-fabrics` are
///   rejected.
/// * Three-character `sd?`, `hd?` and `vd?` names (e.g. `sda`, `vdb`), which
///   excludes partitions such as `sda1`.
fn is_suitable_device(device_name: &str) -> bool {
    // `all()` is vacuously true on an empty string, so require at least one char.
    let is_number = |text: &str| !text.is_empty() && text.chars().all(|c| c.is_ascii_digit());

    if let Some(rest) = device_name.strip_prefix("nvme") {
        // Split "<controller>n<namespace>"; anything after the namespace
        // number (such as "p1") makes the second half non-numeric.
        return match rest.split_once('n') {
            Some((controller, namespace)) => is_number(controller) && is_number(namespace),
            None => false,
        };
    }

    // sda, sdb, ... / hda, hdb, ... / vda, vdb, ... (virtual machines)
    device_name.len() == 3
        && ["sd", "hd", "vd"]
            .iter()
            .any(|prefix| device_name.starts_with(prefix))
}
/// Decides whether a systemd service name is one the agent wants to monitor.
///
/// A name qualifies when it contains one of the watch-list entries or when
/// a watch-list entry contains the name (so `ssh` matches via `sshd`).
fn is_monitorable_service(service_name: &str) -> bool {
    const WATCHLIST: &[&str] = &[
        // Web applications
        "gitea",
        "immich",
        "vaultwarden",
        "unifi",
        "wordpress",
        "nginx",
        "apache2",
        "httpd",
        "caddy",
        // Databases
        "postgresql",
        "mysql",
        "mariadb",
        "redis",
        "mongodb",
        // Monitoring and infrastructure
        "smart-metrics-api",
        "service-metrics-api",
        "backup-metrics-api",
        "prometheus",
        "grafana",
        "influxdb",
        // Backup and storage
        "restic",
        "borg",
        "rclone",
        "syncthing",
        // Container runtimes
        "docker",
        "podman",
        "containerd",
        // Network services
        "sshd",
        "dnsmasq",
        "bind9",
        "pihole",
        // Media services
        "plex",
        "jellyfin",
        "emby",
        "sonarr",
        "radarr",
    ];

    // Substring match in either direction against every watch-list entry.
    for &candidate in WATCHLIST {
        if service_name.contains(candidate) || candidate.contains(service_name) {
            return true;
        }
    }

    false
}
trimmed.to_lowercase(); + let aliases = [ + ("ssh", "sshd"), + ("sshd", "sshd"), + ("docker.service", "docker"), + ]; + + for (alias, target) in aliases { + if lower == alias { + return Some(target.to_string()); + } + } + + Some(trimmed.to_string()) + } + + async fn filter_existing_services(services: &[String]) -> Vec { + let mut existing = Vec::new(); + + for service in services { + if Self::service_exists(service).await { + existing.push(service.clone()); + } + } + + existing + } + + async fn service_exists(service: &str) -> bool { + let unit = if service.ends_with(".service") { + service.to_string() + } else { + format!("{}.service", service) + }; + + match Command::new("systemctl") + .args(["status", &unit]) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .output() + .await + { + Ok(output) => output.status.success(), + Err(error) => { + warn!("Failed to check service {}: {}", unit, error); + false + } + } + } + + /// Auto-detect backup configuration + pub async fn discover_backup_config(hostname: &str) -> (bool, Option, String) { + // Check if this host should have backup monitoring + let backup_enabled = hostname == "srv01" || Self::has_backup_service().await; + + // Try to find restic repository + let restic_repo = if backup_enabled { + Self::discover_restic_repo().await + } else { + None + }; + + // Determine backup service name + let backup_service = Self::discover_backup_service() + .await + .unwrap_or_else(|| "restic-backup".to_string()); + + (backup_enabled, restic_repo, backup_service) + } + + async fn has_backup_service() -> bool { + // Check for common backup services + let backup_services = ["restic", "borg", "duplicati", "rclone"]; + + for service in backup_services { + if let Ok(output) = Command::new("systemctl") + .args(["is-enabled", service]) + .output() + .await + { + if output.status.success() { + return true; + } + } + } + + false + } + + async fn discover_restic_repo() -> Option { + // Common restic repository locations + let 
common_paths = [ + "/srv/backups/restic", + "/var/backups/restic", + "/home/restic", + "/backup/restic", + "/mnt/backup/restic", + ]; + + for path in common_paths { + if fs::metadata(path).await.is_ok() { + debug!("Found restic repository at: {}", path); + return Some(path.to_string()); + } + } + + // Try to find via environment variables or config files + if let Ok(content) = fs::read_to_string("/etc/restic/repository").await { + let repo_path = content.trim(); + if !repo_path.is_empty() { + return Some(repo_path.to_string()); + } + } + + None + } + + async fn discover_backup_service() -> Option { + let backup_services = ["restic-backup", "restic", "borg-backup", "borg", "backup"]; + + for service in backup_services { + if let Ok(output) = Command::new("systemctl") + .args(["is-enabled", &format!("{}.service", service)]) + .output() + .await + { + if output.status.success() { + return Some(service.to_string()); + } + } + } + + None + } + + /// Validate auto-detected configuration + pub async fn validate_devices(devices: &[String]) -> Vec { + let mut valid_devices = Vec::new(); + + for device in devices { + if Self::can_access_device(device).await { + valid_devices.push(device.clone()); + } else { + warn!("Cannot access device {}, skipping", device); + } + } + + valid_devices + } + + async fn can_access_device(device: &str) -> bool { + let device_path = format!("/dev/{}", device); + + // Try to run smartctl to see if device is accessible + if let Ok(output) = Command::new("smartctl") + .args(["-i", &device_path]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .await + { + // smartctl returns 0 for success, but may return other codes for warnings + // that are still acceptable (like device supports SMART but has some issues) + output.status.code().map_or(false, |code| code <= 4) + } else { + false + } + } +} diff --git a/agent/src/main.rs b/agent/src/main.rs index 7679b79..f719bad 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -1,161 
+1,182 @@ -use std::thread; -use std::time::Duration; - -use anyhow::{anyhow, Context, Result}; -use chrono::Utc; +use anyhow::{anyhow, Result}; use clap::{ArgAction, Parser}; -use cm_dashboard_shared::envelope::{AgentType, MetricsEnvelope}; -use rand::Rng; -use serde_json::json; -use tracing::info; +use std::path::PathBuf; +use tokio::signal; +use tracing::{error, info}; use tracing_subscriber::EnvFilter; -use zmq::{Context as ZmqContext, SocketType}; + +mod agent; +mod collectors; +mod config; +mod discovery; +mod scheduler; + +use agent::MetricsAgent; +use config::AgentConfig; #[derive(Parser, Debug)] #[command( name = "cm-dashboard-agent", version, - about = "CM Dashboard metrics agent" + about = "CM Dashboard ZMQ metrics agent with auto-detection" )] struct Cli { - /// Hostname to advertise in metric envelopes - #[arg(long, value_name = "HOSTNAME")] - hostname: String, + /// ZMQ port to bind to (default: 6130) + #[arg(long, value_name = "PORT")] + port: Option, - /// Bind endpoint for PUB socket (default tcp://*:6130) - #[arg(long, default_value = "tcp://*:6130", value_name = "ENDPOINT")] - bind: String, + /// Path to load configuration from + #[arg(long, value_name = "FILE")] + config: Option, - /// Publish interval in milliseconds - #[arg(long, default_value_t = 5000)] - interval_ms: u64, + /// Optional path to persist the resolved configuration + #[arg(long, value_name = "FILE")] + write_config: Option, - /// Disable smart metrics publisher + /// Disable SMART metrics collector #[arg(long, action = ArgAction::SetTrue)] disable_smart: bool, - /// Disable service metrics publisher + /// Disable service metrics collector #[arg(long, action = ArgAction::SetTrue)] disable_service: bool, - /// Disable backup metrics publisher + /// Disable backup metrics collector #[arg(long, action = ArgAction::SetTrue)] disable_backup: bool, + /// Skip auto-detection and use minimal defaults + #[arg(long, action = ArgAction::SetTrue)] + no_auto_detect: bool, + + /// Show 
detected configuration and exit + #[arg(long, action = ArgAction::SetTrue)] + show_config: bool, + /// Increase logging verbosity (-v, -vv) #[arg(short, long, action = ArgAction::Count)] verbose: u8, } -fn main() -> Result<()> { +#[tokio::main] +async fn main() -> Result<()> { let cli = Cli::parse(); init_tracing(cli.verbose)?; - let context = ZmqContext::new(); - let socket = context - .socket(SocketType::PUB) - .context("failed to create ZMQ PUB socket")?; - socket - .bind(&cli.bind) - .with_context(|| format!("failed to bind to {}", cli.bind))?; - info!(endpoint = %cli.bind, host = %cli.hostname, "agent started"); + // Start with file-based configuration if requested, otherwise defaults + let mut config = if let Some(path) = cli.config.as_ref() { + AgentConfig::load_from_file(path) + .await + .map_err(|e| anyhow!("Failed to load config from {}: {}", path.display(), e))? + } else { + AgentConfig::default() + }; - let interval = Duration::from_millis(cli.interval_ms.max(100)); - let mut rng = rand::thread_rng(); + // Hostname is auto-detected in AgentConfig::default() - loop { - let now = Utc::now(); - let timestamp = now.timestamp() as u64; - let timestamp_rfc3339 = now.to_rfc3339(); - - if !cli.disable_smart { - let envelope = MetricsEnvelope { - hostname: cli.hostname.clone(), - agent_type: AgentType::Smart, - timestamp, - metrics: json!({ - "status": "Healthy", - "drives": [{ - "name": "nvme0n1", - "temperature_c": rng.gen_range(30.0..60.0), - "wear_level": rng.gen_range(1.0..10.0), - "power_on_hours": rng.gen_range(1000..20000), - "available_spare": rng.gen_range(90.0..100.0) - }], - "summary": { - "healthy": 1, - "warning": 0, - "critical": 0, - "capacity_total_gb": 1024, - "capacity_used_gb": rng.gen_range(100.0..800.0) - }, - "issues": [], - "timestamp": timestamp_rfc3339 - }), - }; - publish(&socket, &envelope)?; - } - - if !cli.disable_service { - let envelope = MetricsEnvelope { - hostname: cli.hostname.clone(), - agent_type: AgentType::Service, - 
timestamp, - metrics: json!({ - "summary": { - "healthy": 5, - "degraded": 0, - "failed": 0, - "memory_used_mb": rng.gen_range(512.0..2048.0), - "memory_quota_mb": 4096.0 - }, - "services": [ - { - "name": "example", - "status": "Running", - "memory_used_mb": rng.gen_range(128.0..512.0), - "memory_quota_mb": 1024.0, - "cpu_percent": rng.gen_range(0.0..75.0), - "sandbox_limit": null - } - ], - "timestamp": timestamp_rfc3339 - }), - }; - publish(&socket, &envelope)?; - } - - if !cli.disable_backup { - let envelope = MetricsEnvelope { - hostname: cli.hostname.clone(), - agent_type: AgentType::Backup, - timestamp, - metrics: json!({ - "overall_status": "Healthy", - "backup": { - "last_success": timestamp_rfc3339, - "last_failure": null, - "size_gb": rng.gen_range(100.0..500.0), - "snapshot_count": rng.gen_range(10..40) - }, - "service": { - "enabled": true, - "pending_jobs": 0, - "last_message": "Backups up-to-date" - }, - "timestamp": timestamp_rfc3339 - }), - }; - publish(&socket, &envelope)?; - } - - thread::sleep(interval); + // Apply CLI port override + if let Some(port) = cli.port { + config.zmq.port = port; + } + + // Run auto-detection unless disabled + if !cli.no_auto_detect { + info!("Auto-detecting system configuration..."); + config + .auto_configure() + .await + .map_err(|e| anyhow!("Auto-detection failed: {}", e))?; + } else { + info!("Skipping auto-detection, using minimal defaults"); + } + + // Apply CLI collector overrides after auto-detection + if cli.disable_smart { + config.collectors.smart.enabled = false; + } + if cli.disable_service { + config.collectors.service.enabled = false; + } + if cli.disable_backup { + config.collectors.backup.enabled = false; + } + + if let Some(path) = cli.write_config.as_ref() { + config + .save_to_file(path) + .await + .map_err(|e| anyhow!("Failed to write config to {}: {}", path.display(), e))?; + info!("Persisted configuration to {}", path.display()); + } + + // Show configuration and exit if requested + if 
cli.show_config { + println!("Agent Configuration:"); + println!(" Hostname: {}", config.agent.hostname); + println!(" ZMQ Port: {}", config.zmq.port); + println!(" Collectors: {}", config.summary()); + + if config.collectors.smart.enabled { + println!(" SMART Devices: {:?}", config.collectors.smart.devices); + } + + if config.collectors.service.enabled { + println!(" Services: {:?}", config.collectors.service.services); + } + + if config.collectors.backup.enabled { + println!(" Backup Repo: {:?}", config.collectors.backup.restic_repo); + println!( + " Backup Service: {}", + config.collectors.backup.backup_service + ); + } + + return Ok(()); + } + + info!( + "Starting agent for host '{}' on port {} with: {}", + config.agent.hostname, + config.zmq.port, + config.summary() + ); + + // Build and start the agent + let mut agent = + MetricsAgent::from_config(config).map_err(|e| anyhow!("Failed to create agent: {}", e))?; + + // Set up graceful shutdown handling + let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel(); + + tokio::spawn(async move { + let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate()) + .expect("Failed to install SIGTERM handler"); + let mut sigint = signal::unix::signal(signal::unix::SignalKind::interrupt()) + .expect("Failed to install SIGINT handler"); + + tokio::select! { + _ = sigterm.recv() => info!("Received SIGTERM"), + _ = sigint.recv() => info!("Received SIGINT"), + } + + let _ = shutdown_tx.send(()); + }); + + // Run the agent until shutdown + tokio::select! 
{ + result = agent.run() => { + match result { + Ok(_) => info!("Agent completed successfully"), + Err(e) => error!("Agent error: {}", e), + } + } + _ = shutdown_rx => { + info!("Shutdown signal received"); + agent.shutdown().await; + } } -} -fn publish(socket: &zmq::Socket, envelope: &MetricsEnvelope) -> Result<()> { - let serialized = serde_json::to_vec(envelope)?; - socket.send(serialized, 0)?; Ok(()) } diff --git a/agent/src/scheduler.rs b/agent/src/scheduler.rs new file mode 100644 index 0000000..1164024 --- /dev/null +++ b/agent/src/scheduler.rs @@ -0,0 +1,393 @@ +use futures::stream::{FuturesUnordered, StreamExt}; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; +use tokio::sync::{mpsc, RwLock}; +use tokio::time::{interval, Instant}; +use tracing::{debug, error, info, warn}; + +use crate::collectors::{Collector, CollectorError, CollectorOutput}; + +pub struct CollectorScheduler { + collectors: Vec>, + sender: mpsc::UnboundedSender, + receiver: mpsc::UnboundedReceiver, + stats: Arc>, + metrics_sender: Option>, +} + +#[derive(Debug)] +pub enum SchedulerEvent { + CollectionResult { + collector_name: String, + result: Result, + duration: Duration, + }, + Shutdown, +} + +#[derive(Debug, Default, Clone)] +pub struct SchedulerStats { + pub total_collections: u64, + pub successful_collections: u64, + pub failed_collections: u64, + pub collector_stats: HashMap, +} + +#[derive(Debug, Default, Clone)] +pub struct CollectorStats { + pub total_collections: u64, + pub successful_collections: u64, + pub failed_collections: u64, + pub last_success: Option, + pub last_failure: Option, + pub average_duration_ms: f64, + pub consecutive_failures: u32, +} + +impl CollectorScheduler { + pub fn new() -> Self { + let (sender, receiver) = mpsc::unbounded_channel(); + + Self { + collectors: Vec::new(), + sender, + receiver, + stats: Arc::new(RwLock::new(SchedulerStats::default())), + metrics_sender: None, + } + } + + pub fn set_metrics_sender(&mut self, 
sender: mpsc::UnboundedSender) { + self.metrics_sender = Some(sender); + } + + pub fn clear_metrics_sender(&mut self) { + self.metrics_sender = None; + } + + pub fn add_collector(&mut self, collector: Arc) { + if collector.is_enabled() { + info!( + "Adding collector '{}' [{}] with interval {:?}", + collector.name(), + collector.agent_type().as_str(), + collector.collect_interval() + ); + + if collector.requires_root() { + debug!("Collector '{}' is flagged as root-only", collector.name()); + } + self.collectors.push(collector); + } else { + info!("Skipping disabled collector '{}'", collector.name()); + } + } + + pub async fn start(&mut self) -> Result<(), CollectorError> { + if self.collectors.is_empty() { + return Err(CollectorError::ConfigError { + message: "No enabled collectors configured".to_string(), + }); + } + + info!( + "Starting scheduler with {} collectors", + self.collectors.len() + ); + + // Start collection tasks for each collector + let mut collection_tasks = FuturesUnordered::new(); + + for collector in self.collectors.clone() { + let sender = self.sender.clone(); + let stats = self.stats.clone(); + + let task = + tokio::spawn(async move { Self::run_collector(collector, sender, stats).await }); + + collection_tasks.push(task); + } + + // Main event loop + loop { + tokio::select! 
{ + // Handle collection results + Some(event) = self.receiver.recv() => { + match event { + SchedulerEvent::CollectionResult { collector_name, result, duration } => { + self.handle_collection_result(&collector_name, result, duration).await; + } + SchedulerEvent::Shutdown => { + info!("Scheduler shutdown requested"); + break; + } + } + } + + // Handle task completion (shouldn't happen in normal operation) + Some(result) = collection_tasks.next() => { + match result { + Ok(_) => warn!("Collection task completed unexpectedly"), + Err(e) => error!("Collection task failed: {}", e), + } + } + + // If all tasks are done and no more events, break + else => { + warn!("All collection tasks completed, shutting down scheduler"); + break; + } + } + } + + Ok(()) + } + + async fn run_collector( + collector: Arc, + sender: mpsc::UnboundedSender, + _stats: Arc>, + ) { + let collector_name = collector.name().to_string(); + let mut interval_timer = interval(collector.collect_interval()); + interval_timer.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + + info!("Starting collection loop for '{}'", collector_name); + + loop { + interval_timer.tick().await; + + debug!("Running collection for '{}'", collector_name); + let start_time = Instant::now(); + + match collector.collect().await { + Ok(output) => { + let duration = start_time.elapsed(); + debug!( + "Collection '{}' completed in {:?}", + collector_name, duration + ); + + if let Err(e) = sender.send(SchedulerEvent::CollectionResult { + collector_name: collector_name.clone(), + result: Ok(output), + duration, + }) { + error!( + "Failed to send collection result for '{}': {}", + collector_name, e + ); + break; + } + } + Err(error) => { + let duration = start_time.elapsed(); + warn!( + "Collection '{}' failed after {:?}: {}", + collector_name, duration, error + ); + + if let Err(e) = sender.send(SchedulerEvent::CollectionResult { + collector_name: collector_name.clone(), + result: Err(error), + duration, + }) { + 
error!( + "Failed to send collection error for '{}': {}", + collector_name, e + ); + break; + } + } + } + } + + warn!("Collection loop for '{}' ended", collector_name); + } + + async fn handle_collection_result( + &self, + collector_name: &str, + result: Result, + duration: Duration, + ) { + let publish_output = match &result { + Ok(output) => Some(output.clone()), + Err(_) => None, + }; + + { + let mut stats = self.stats.write().await; + stats.total_collections += 1; + + match &result { + Ok(_) => { + stats.successful_collections += 1; + } + Err(_) => { + stats.failed_collections += 1; + } + } + } + + // Handle collector-specific stats + { + let mut stats = self.stats.write().await; + let duration_ms = duration.as_millis() as f64; + + let collector_stats = stats + .collector_stats + .entry(collector_name.to_string()) + .or_default(); + + collector_stats.total_collections += 1; + + if collector_stats.average_duration_ms == 0.0 { + collector_stats.average_duration_ms = duration_ms; + } else { + // Simple moving average + collector_stats.average_duration_ms = + (collector_stats.average_duration_ms * 0.9) + (duration_ms * 0.1); + } + + match &result { + Ok(output) => { + collector_stats.successful_collections += 1; + collector_stats.last_success = Some(Instant::now()); + collector_stats.consecutive_failures = 0; + + let metrics_count = match &output.data { + serde_json::Value::Object(map) => map.len(), + serde_json::Value::Array(values) => values.len(), + _ => 1, + }; + + debug!( + "Collector '{}' [{}] successful at {} ({} metrics)", + collector_name, + output.agent_type.as_str(), + output.timestamp, + metrics_count + ); + } + Err(error) => { + collector_stats.failed_collections += 1; + collector_stats.last_failure = Some(Instant::now()); + collector_stats.consecutive_failures += 1; + + warn!("Collection '{}' failed: {}", collector_name, error); + + // Log warning for consecutive failures + if collector_stats.consecutive_failures >= 5 { + error!( + "Collector '{}' has 
{} consecutive failures", + collector_name, collector_stats.consecutive_failures + ); + } + } + } + } + + if let (Some(sender), Some(output)) = (&self.metrics_sender, publish_output) { + if let Err(error) = sender.send(output) { + warn!("Metrics channel send error: {}", error); + } + } + } + + pub fn get_stats_handle(&self) -> Arc> { + self.stats.clone() + } + + pub async fn shutdown(&self) { + info!("Requesting scheduler shutdown"); + if let Err(e) = self.sender.send(SchedulerEvent::Shutdown) { + error!("Failed to send shutdown event: {}", e); + } + } +} + +impl Default for CollectorScheduler { + fn default() -> Self { + Self::new() + } +} + +#[derive(Debug, Clone)] +pub struct HealthChecker { + stats: Arc>, + max_consecutive_failures: u32, + max_failure_rate: f64, +} + +impl HealthChecker { + pub fn new(stats: Arc>) -> Self { + Self { + stats, + max_consecutive_failures: 10, + max_failure_rate: 0.5, // 50% failure rate threshold + } + } + + pub async fn check_health(&self) -> HealthStatus { + let stats = self.stats.read().await; + + let mut unhealthy_collectors = Vec::new(); + let mut degraded_collectors = Vec::new(); + + for (name, collector_stats) in &stats.collector_stats { + // Check consecutive failures + if collector_stats.consecutive_failures >= self.max_consecutive_failures { + unhealthy_collectors.push(name.clone()); + continue; + } + + // Check failure rate + if collector_stats.total_collections > 10 { + let failure_rate = collector_stats.failed_collections as f64 + / collector_stats.total_collections as f64; + + if failure_rate >= self.max_failure_rate { + degraded_collectors.push(name.clone()); + } + } + + // Check if collector hasn't succeeded recently + if let Some(last_success) = collector_stats.last_success { + if last_success.elapsed() > Duration::from_secs(300) { + // 5 minutes + degraded_collectors.push(name.clone()); + } + } else if collector_stats.total_collections > 5 { + // No successful collections after several attempts + 
unhealthy_collectors.push(name.clone()); + } + } + + if !unhealthy_collectors.is_empty() { + HealthStatus::Unhealthy { + unhealthy_collectors, + degraded_collectors, + } + } else if !degraded_collectors.is_empty() { + HealthStatus::Degraded { + degraded_collectors, + } + } else { + HealthStatus::Healthy + } + } +} + +#[derive(Debug, Clone)] +pub enum HealthStatus { + Healthy, + Degraded { + degraded_collectors: Vec, + }, + Unhealthy { + unhealthy_collectors: Vec, + degraded_collectors: Vec, + }, +} diff --git a/config/agent.example.toml b/config/agent.example.toml new file mode 100644 index 0000000..2ed28bf --- /dev/null +++ b/config/agent.example.toml @@ -0,0 +1,73 @@ +# CM Dashboard Agent Configuration +# Example configuration file for the ZMQ metrics agent + +[agent] +# Hostname to advertise in metrics (auto-detected if not specified) +hostname = "srv01" + +# Log level: trace, debug, info, warn, error +log_level = "info" + +# Maximum number of metrics to buffer before dropping +metrics_buffer_size = 1000 + +[zmq] +# ZMQ publisher port +port = 6130 + +# Bind address (0.0.0.0 for all interfaces, 127.0.0.1 for localhost only) +bind_address = "0.0.0.0" + +# ZMQ socket timeouts in milliseconds +send_timeout_ms = 5000 +receive_timeout_ms = 5000 + +[collectors.smart] +# Enable SMART metrics collection (disk health, temperature, wear) +enabled = true + +# Collection interval in milliseconds (minimum 1000ms) +interval_ms = 5000 + +# List of storage devices to monitor (without /dev/ prefix) +devices = ["nvme0n1", "sda", "sdb"] + +# Timeout for smartctl commands in milliseconds +timeout_ms = 30000 + +[collectors.service] +# Enable service metrics collection (systemd services) +enabled = true + +# Collection interval in milliseconds (minimum 500ms) +interval_ms = 2000 + +# List of systemd services to monitor +services = [ + "gitea", + "immich", + "vaultwarden", + "unifi", + "smart-metrics-api", + "service-metrics-api", + "backup-metrics-api" +] + +# Timeout for systemctl 
commands in milliseconds +timeout_ms = 10000 + +[collectors.backup] +# Enable backup metrics collection (restic integration) +enabled = true + +# Collection interval in milliseconds (minimum 5000ms) +interval_ms = 30000 + +# Restic repository path (leave empty to disable restic integration) +restic_repo = "/srv/backups/restic" + +# Systemd service name for backup monitoring +backup_service = "restic-backup" + +# Timeout for restic and backup commands in milliseconds +timeout_ms = 30000 \ No newline at end of file diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index 4eb451d..50955ac 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -18,3 +18,4 @@ tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] } tracing-appender = "0.2" zmq = "0.10" +gethostname = "0.4" diff --git a/dashboard/config/dashboard.toml b/dashboard/config/dashboard.toml new file mode 100644 index 0000000..16fd41d --- /dev/null +++ b/dashboard/config/dashboard.toml @@ -0,0 +1,37 @@ +# CM Dashboard configuration + +[hosts] +# default_host = "srv01" + +[[hosts.hosts]] +name = "srv01" +enabled = true +# metadata = { rack = "R1" } + +[[hosts.hosts]] +name = "labbox" +enabled = true + +[dashboard] +tick_rate_ms = 250 +history_duration_minutes = 60 + +[[dashboard.widgets]] +id = "nvme" +enabled = true + +[[dashboard.widgets]] +id = "services" +enabled = true + +[[dashboard.widgets]] +id = "backup" +enabled = true + +[[dashboard.widgets]] +id = "alerts" +enabled = true + +[filesystem] +# cache_dir = "/var/lib/cm-dashboard/cache" +# history_dir = "/var/lib/cm-dashboard/history" diff --git a/dashboard/config/hosts.toml b/dashboard/config/hosts.toml new file mode 100644 index 0000000..089c02b --- /dev/null +++ b/dashboard/config/hosts.toml @@ -0,0 +1,12 @@ +# Optional separate hosts configuration + +[hosts] +# default_host = "srv01" + +[[hosts.hosts]] +name = "srv01" +enabled = true + +[[hosts.hosts]] +name = "labbox" +enabled = true diff --git 
a/dashboard/src/app.rs b/dashboard/src/app.rs index 247ce64..78bf2b0 100644 --- a/dashboard/src/app.rs +++ b/dashboard/src/app.rs @@ -5,6 +5,7 @@ use std::time::{Duration, Instant}; use anyhow::Result; use chrono::{DateTime, Utc}; use crossterm::event::{KeyCode, KeyEvent, KeyEventKind}; +use gethostname::gethostname; use crate::config; use crate::data::config::{AppConfig, DataSourceKind, HostTarget, ZmqConfig}; @@ -100,8 +101,8 @@ impl App { let host_count = self.hosts.len(); let retention = self.history.retention(); self.status = format!( - "Monitoring • hosts: {} • ticks: {} • refresh: {:?} • retention: {:?}", - host_count, self.tick_count, self.options.tick_rate, retention + "Monitoring • hosts: {} • refresh: {:?} • retention: {:?}", + host_count, self.options.tick_rate, retention ); } @@ -321,49 +322,94 @@ impl App { } fn build_initial_status(host: Option<&String>, config_path: Option<&PathBuf>) -> String { - match (host, config_path) { - (Some(host), Some(path)) => { + let detected = Self::local_hostname(); + match (host, config_path, detected.as_ref()) { + (Some(host), Some(path), _) => { format!("Ready • host: {} • config: {}", host, path.display()) } - (Some(host), None) => format!("Ready • host: {}", host), - (None, Some(path)) => format!("Ready • config: {}", path.display()), - (None, None) => "Ready • no host selected".to_string(), + (Some(host), None, _) => format!("Ready • host: {}", host), + (None, Some(path), Some(local)) => format!( + "Ready • host: {} (auto) • config: {}", + local, + path.display() + ), + (None, Some(path), None) => format!("Ready • config: {}", path.display()), + (None, None, Some(local)) => format!("Ready • host: {} (auto)", local), + (None, None, None) => "Ready • no host selected".to_string(), } } fn select_hosts(host: Option<&String>, config: Option<&AppConfig>) -> Vec { let mut targets = Vec::new(); - let Some(config) = config else { - return targets; - }; + if let Some(filter) = host { + let normalized = 
filter.to_lowercase(); - let host_filter = host.map(|value| value.to_lowercase()); - - for entry in &config.hosts.hosts { - if !entry.enabled { - continue; - } - - if let Some(filter) = &host_filter { - if entry.name.to_lowercase() != *filter { - continue; + if let Some(config) = config { + if let Some(entry) = config.hosts.hosts.iter().find(|candidate| { + candidate.enabled && candidate.name.to_lowercase() == normalized + }) { + return vec![entry.clone()]; } } - targets.push(entry.clone()); + return vec![HostTarget::from_name(filter.clone())]; } - if targets.is_empty() { - if let Some(default_host) = &config.hosts.default_host { - if host_filter.is_none() { - if let Some(entry) = config.hosts.hosts.iter().find(|candidate| { - candidate.enabled && candidate.name.eq_ignore_ascii_case(default_host) - }) { - targets.push(entry.clone()); + let local_host = Self::local_hostname(); + + if let Some(config) = config { + if let Some(local) = local_host.as_ref() { + if let Some(entry) = config.hosts.hosts.iter().find(|candidate| { + candidate.enabled && candidate.name.eq_ignore_ascii_case(local) + }) { + targets.push(entry.clone()); + } else { + targets.push(HostTarget::from_name(local.clone())); + } + } + + for entry in &config.hosts.hosts { + if !entry.enabled { + continue; + } + + if targets + .iter() + .any(|existing| existing.name.eq_ignore_ascii_case(&entry.name)) + { + continue; + } + + targets.push(entry.clone()); + } + + if targets.len() <= 1 { + if let Some(default_host) = &config.hosts.default_host { + if !targets + .iter() + .any(|existing| existing.name.eq_ignore_ascii_case(default_host)) + { + if let Some(entry) = config.hosts.hosts.iter().find(|candidate| { + candidate.enabled && candidate.name.eq_ignore_ascii_case(default_host) + }) { + targets.push(entry.clone()); + } } } } + + if targets.is_empty() { + if let Some(local) = local_host { + targets.push(HostTarget::from_name(local)); + } + } + } else if let Some(local) = local_host { + 
targets.push(HostTarget::from_name(local)); + } + + if targets.is_empty() { + targets.push(HostTarget::from_name("localhost".to_string())); } targets @@ -437,6 +483,18 @@ impl App { } } +impl App { + fn local_hostname() -> Option { + let raw = gethostname(); + let value = raw.to_string_lossy().trim().to_string(); + if value.is_empty() { + None + } else { + Some(value) + } + } +} + #[derive(Debug, Clone)] pub struct HostDisplayData { pub name: String, diff --git a/dashboard/src/data/metrics.rs b/dashboard/src/data/metrics.rs index 165ab45..09bd25b 100644 --- a/dashboard/src/data/metrics.rs +++ b/dashboard/src/data/metrics.rs @@ -19,6 +19,8 @@ pub struct DriveInfo { pub wear_level: f32, pub power_on_hours: u64, pub available_spare: f32, + pub capacity_gb: Option, + pub used_gb: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -44,6 +46,28 @@ pub struct ServiceSummary { pub failed: usize, pub memory_used_mb: f32, pub memory_quota_mb: f32, + #[serde(default)] + pub system_memory_used_mb: f32, + #[serde(default)] + pub system_memory_total_mb: f32, + #[serde(default)] + pub disk_used_gb: f32, + #[serde(default)] + pub disk_total_gb: f32, + #[serde(default)] + pub cpu_load_1: f32, + #[serde(default)] + pub cpu_load_5: f32, + #[serde(default)] + pub cpu_load_15: f32, + #[serde(default)] + pub cpu_freq_mhz: Option, + #[serde(default)] + pub cpu_temp_c: Option, + #[serde(default)] + pub gpu_load_percent: Option, + #[serde(default)] + pub gpu_temp_c: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -54,6 +78,8 @@ pub struct ServiceInfo { pub memory_quota_mb: f32, pub cpu_percent: f32, pub sandbox_limit: Option, + #[serde(default)] + pub disk_used_gb: f32, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/dashboard/src/main.rs b/dashboard/src/main.rs index d32b6e4..ccb8e2b 100644 --- a/dashboard/src/main.rs +++ b/dashboard/src/main.rs @@ -6,7 +6,10 @@ mod ui; use std::fs; use std::io::{self, Stdout}; use std::path::{Path, PathBuf}; -use 
std::sync::OnceLock; +use std::sync::{ + atomic::{AtomicBool, Ordering}, + Arc, OnceLock, +}; use std::time::Duration; use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics}; @@ -100,8 +103,14 @@ async fn main() -> Result<()> { let mut app = App::new(options)?; let (event_tx, mut event_rx) = unbounded_channel(); + let shutdown_flag = Arc::new(AtomicBool::new(false)); + let zmq_task = if let Some(context) = app.zmq_context() { - Some(spawn_metrics_task(context, event_tx.clone())) + Some(spawn_metrics_task( + context, + event_tx.clone(), + shutdown_flag.clone(), + )) } else { None }; @@ -109,9 +118,12 @@ async fn main() -> Result<()> { let mut terminal = setup_terminal()?; let result = run_app(&mut terminal, &mut app, &mut event_rx); teardown_terminal(terminal)?; + shutdown_flag.store(true, Ordering::Relaxed); let _ = event_tx.send(AppEvent::Shutdown); if let Some(handle) = zmq_task { - handle.abort(); + if let Err(join_error) = handle.await { + warn!(%join_error, "ZMQ metrics task ended unexpectedly"); + } } result } @@ -206,9 +218,13 @@ fn prepare_log_writer() -> Result { Ok(non_blocking) } -fn spawn_metrics_task(context: ZmqContext, sender: UnboundedSender) -> JoinHandle<()> { +fn spawn_metrics_task( + context: ZmqContext, + sender: UnboundedSender, + shutdown: Arc, +) -> JoinHandle<()> { tokio::spawn(async move { - match spawn_blocking(move || metrics_blocking_loop(context, sender)).await { + match spawn_blocking(move || metrics_blocking_loop(context, sender, shutdown)).await { Ok(Ok(())) => {} Ok(Err(error)) => warn!(%error, "ZMQ metrics worker exited with error"), Err(join_error) => warn!(%join_error, "ZMQ metrics worker panicked"), @@ -216,12 +232,23 @@ fn spawn_metrics_task(context: ZmqContext, sender: UnboundedSender) -> }) } -fn metrics_blocking_loop(context: ZmqContext, sender: UnboundedSender) -> Result<()> { +fn metrics_blocking_loop( + context: ZmqContext, + sender: UnboundedSender, + shutdown: Arc, +) -> Result<()> { let zmq_context = 
NativeZmqContext::new(); let socket = zmq_context .socket(zmq::SUB) .context("failed to create ZMQ SUB socket")?; + socket + .set_linger(0) + .context("failed to configure ZMQ linger")?; + socket + .set_rcvtimeo(1_000) + .context("failed to configure ZMQ receive timeout")?; + for endpoint in context.endpoints() { debug!(%endpoint, "connecting to ZMQ endpoint"); socket @@ -239,7 +266,7 @@ fn metrics_blocking_loop(context: ZmqContext, sender: UnboundedSender) .context("failed to subscribe to all ZMQ topics")?; } - loop { + while !shutdown.load(Ordering::Relaxed) { match socket.recv_msg(0) { Ok(message) => { if let Err(error) = handle_zmq_message(&message, &sender) { @@ -247,11 +274,18 @@ fn metrics_blocking_loop(context: ZmqContext, sender: UnboundedSender) } } Err(error) => { + if error == zmq::Error::EAGAIN { + continue; + } warn!(%error, "ZMQ receive error"); - std::thread::sleep(Duration::from_secs(1)); + std::thread::sleep(Duration::from_millis(250)); } } } + + debug!("ZMQ metrics worker shutting down"); + + Ok(()) } fn handle_zmq_message( @@ -442,7 +476,7 @@ tick_rate_ms = 250 history_duration_minutes = 60 [[dashboard.widgets]] -id = "nvme" +id = "storage" enabled = true [[dashboard.widgets]] diff --git a/dashboard/src/ui/alerts.rs b/dashboard/src/ui/alerts.rs index 1a00f6c..f1603b4 100644 --- a/dashboard/src/ui/alerts.rs +++ b/dashboard/src/ui/alerts.rs @@ -1,51 +1,299 @@ -use ratatui::layout::Rect; +use chrono::{DateTime, Utc}; +use ratatui::layout::{Constraint, Rect}; use ratatui::style::{Color, Modifier, Style}; -use ratatui::text::{Line, Span}; -use ratatui::widgets::{Block, Borders, Paragraph, Wrap}; +use ratatui::text::Span; +use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, Wrap}; use ratatui::Frame; use crate::app::HostDisplayData; +use crate::ui::memory::{evaluate_performance, PerfSeverity}; pub fn render(frame: &mut Frame, hosts: &[HostDisplayData], area: Rect) { - let block = Block::default() - .title("Alerts") - 
.borders(Borders::ALL) - .style(Style::default().fg(Color::LightRed)); + let (severity, ok_count, warn_count, fail_count) = classify_hosts(hosts); + let color = match severity { + AlertSeverity::Critical => Color::Red, + AlertSeverity::Warning => Color::Yellow, + AlertSeverity::Healthy => Color::Green, + AlertSeverity::Unknown => Color::LightCyan, + }; - let mut lines = Vec::new(); + let title = format!( + "Alerts • ok:{} warn:{} fail:{}", + ok_count, warn_count, fail_count + ); + + let block = Block::default() + .title(Span::styled( + title, + Style::default().fg(color).add_modifier(Modifier::BOLD), + )) + .borders(Borders::ALL) + .border_style(Style::default().fg(color)) + .style(Style::default().fg(Color::White)); + + let inner = block.inner(area); + frame.render_widget(block, area); if hosts.is_empty() { - lines.push(Line::from("No hosts configured")); - } else { - for host in hosts { - if let Some(error) = &host.last_error { - lines.push(Line::from(vec![ - Span::styled(&host.name, Style::default().add_modifier(Modifier::BOLD)), - Span::raw(": "), - Span::styled(error, Style::default().fg(Color::Red)), - ])); - continue; - } + frame.render_widget( + Paragraph::new("No hosts configured") + .wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + inner, + ); + return; + } - if let Some(smart) = host.smart.as_ref() { - if let Some(issue) = smart.issues.first() { - lines.push(Line::from(vec![ - Span::styled(&host.name, Style::default().add_modifier(Modifier::BOLD)), - Span::raw(": "), - Span::styled(issue, Style::default().fg(Color::Yellow)), - ])); - continue; - } - } + let header = Row::new(vec![ + Cell::from("Host"), + Cell::from("Status"), + Cell::from("Timestamp"), + ]) + .style( + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + ); - lines.push(Line::from(vec![ - Span::styled(&host.name, Style::default().add_modifier(Modifier::BOLD)), - Span::raw(": OK"), - ])); + let rows = hosts.iter().map(|host| { + let (status, 
severity, emphasize) = host_status(host); + let row_style = severity_style(severity); + let update = latest_timestamp(host) + .map(|ts| ts.format("%Y-%m-%d %H:%M:%S").to_string()) + .unwrap_or_else(|| "—".to_string()); + + let status_cell = if emphasize { + Cell::from(Span::styled( + status.clone(), + Style::default().add_modifier(Modifier::BOLD), + )) + } else { + Cell::from(status.clone()) + }; + + Row::new(vec![ + Cell::from(host.name.clone()), + status_cell, + Cell::from(update), + ]) + .style(row_style) + }); + + let table = Table::new(rows) + .header(header) + .style(Style::default().fg(Color::White)) + .widths(&[ + Constraint::Percentage(20), + Constraint::Length(20), + Constraint::Min(24), + ]) + .column_spacing(2); + + frame.render_widget(table, inner); +} + +#[derive(Copy, Clone, Eq, PartialEq)] +enum AlertSeverity { + Healthy, + Warning, + Critical, + Unknown, +} + +fn classify_hosts(hosts: &[HostDisplayData]) -> (AlertSeverity, usize, usize, usize) { + let mut ok = 0; + let mut warn = 0; + let mut fail = 0; + + for host in hosts { + let severity = host_severity(host); + match severity { + AlertSeverity::Healthy => ok += 1, + AlertSeverity::Warning => warn += 1, + AlertSeverity::Critical => fail += 1, + AlertSeverity::Unknown => warn += 1, } } - let paragraph = Paragraph::new(lines).wrap(Wrap { trim: true }).block(block); + let highest = if fail > 0 { + AlertSeverity::Critical + } else if warn > 0 { + AlertSeverity::Warning + } else if ok > 0 { + AlertSeverity::Healthy + } else { + AlertSeverity::Unknown + }; - frame.render_widget(paragraph, area); + (highest, ok, warn, fail) +} + +fn host_severity(host: &HostDisplayData) -> AlertSeverity { + if host.last_error.is_some() { + return AlertSeverity::Critical; + } + + if let Some(smart) = host.smart.as_ref() { + if smart.summary.critical > 0 { + return AlertSeverity::Critical; + } + if smart.summary.warning > 0 || !smart.issues.is_empty() { + return AlertSeverity::Warning; + } + } + + if let Some(services) = 
host.services.as_ref() { + if services.summary.failed > 0 { + return AlertSeverity::Critical; + } + if services.summary.degraded > 0 { + return AlertSeverity::Warning; + } + + let (perf_severity, _) = evaluate_performance(&services.summary); + match perf_severity { + PerfSeverity::Critical => return AlertSeverity::Critical, + PerfSeverity::Warning => return AlertSeverity::Warning, + PerfSeverity::Ok => {} + } + } + + if let Some(backup) = host.backup.as_ref() { + match backup.overall_status { + crate::data::metrics::BackupStatus::Failed => return AlertSeverity::Critical, + crate::data::metrics::BackupStatus::Warning => return AlertSeverity::Warning, + _ => {} + } + } + + if host.smart.is_none() && host.services.is_none() && host.backup.is_none() { + AlertSeverity::Unknown + } else { + AlertSeverity::Healthy + } +} + +fn host_status(host: &HostDisplayData) -> (String, AlertSeverity, bool) { + if let Some(error) = &host.last_error { + return (format!("error: {}", error), AlertSeverity::Critical, true); + } + + if let Some(smart) = host.smart.as_ref() { + if smart.summary.critical > 0 { + return ( + "critical: SMART critical".to_string(), + AlertSeverity::Critical, + true, + ); + } + if let Some(issue) = smart.issues.first() { + return (format!("warning: {}", issue), AlertSeverity::Warning, true); + } + } + + if let Some(services) = host.services.as_ref() { + if services.summary.failed > 0 { + return ( + format!("critical: {} failed svc", services.summary.failed), + AlertSeverity::Critical, + true, + ); + } + if services.summary.degraded > 0 { + return ( + format!("warning: {} degraded svc", services.summary.degraded), + AlertSeverity::Warning, + true, + ); + } + + let (perf_severity, reason) = evaluate_performance(&services.summary); + if let Some(reason_text) = reason { + match perf_severity { + PerfSeverity::Critical => { + return ( + format!("critical: {}", reason_text), + AlertSeverity::Critical, + true, + ); + } + PerfSeverity::Warning => { + return ( + 
format!("warning: {}", reason_text), + AlertSeverity::Warning, + true, + ); + } + PerfSeverity::Ok => {} + } + } + } + + if let Some(backup) = host.backup.as_ref() { + match backup.overall_status { + crate::data::metrics::BackupStatus::Failed => { + return ( + "critical: backup failed".to_string(), + AlertSeverity::Critical, + true, + ); + } + crate::data::metrics::BackupStatus::Warning => { + return ( + "warning: backup warning".to_string(), + AlertSeverity::Warning, + true, + ); + } + _ => {} + } + } + + if host.smart.is_none() && host.services.is_none() && host.backup.is_none() { + let status = if host.last_success.is_none() { + "pending: awaiting metrics" + } else { + "pending: no recent data" + }; + + return (status.to_string(), AlertSeverity::Warning, false); + } + + ("ok".to_string(), AlertSeverity::Healthy, false) +} + +fn severity_style(severity: AlertSeverity) -> Style { + match severity { + AlertSeverity::Critical => Style::default().fg(Color::Red), + AlertSeverity::Warning => Style::default().fg(Color::Yellow), + AlertSeverity::Healthy => Style::default().fg(Color::White), + AlertSeverity::Unknown => Style::default().fg(Color::LightCyan), + } +} + +fn latest_timestamp(host: &HostDisplayData) -> Option> { + let mut latest = host.last_success; + + if let Some(smart) = host.smart.as_ref() { + latest = Some(match latest { + Some(current) => current.max(smart.timestamp), + None => smart.timestamp, + }); + } + + if let Some(services) = host.services.as_ref() { + latest = Some(match latest { + Some(current) => current.max(services.timestamp), + None => services.timestamp, + }); + } + + if let Some(backup) = host.backup.as_ref() { + latest = Some(match latest { + Some(current) => current.max(backup.timestamp), + None => backup.timestamp, + }); + } + + latest } diff --git a/dashboard/src/ui/backup.rs b/dashboard/src/ui/backup.rs index 365e3e3..593f563 100644 --- a/dashboard/src/ui/backup.rs +++ b/dashboard/src/ui/backup.rs @@ -1,62 +1,166 @@ -use 
ratatui::layout::Rect; +use ratatui::layout::{Constraint, Direction, Layout, Rect}; use ratatui::style::{Color, Modifier, Style}; use ratatui::text::{Line, Span}; -use ratatui::widgets::{Block, Borders, Paragraph, Wrap}; +use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, Wrap}; use ratatui::Frame; use crate::app::HostDisplayData; +use crate::data::metrics::{BackupMetrics, BackupStatus}; pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) { - let block = Block::default() - .title("Backups") - .borders(Borders::ALL) - .style(Style::default().fg(Color::LightGreen)); - - let mut lines = Vec::new(); - match host { Some(data) => { if let Some(metrics) = data.backup.as_ref() { - lines.push(Line::from(vec![ - Span::styled("Host: ", Style::default().add_modifier(Modifier::BOLD)), - Span::raw(data.name.clone()), - ])); - lines.push(Line::from(format!("Status: {:?}", metrics.overall_status))); - - if let Some(last_success) = metrics.backup.last_success.as_ref() { - lines.push(Line::from(format!( - "Last success: {}", - last_success.format("%Y-%m-%d %H:%M:%S") - ))); - } - - if let Some(last_failure) = metrics.backup.last_failure.as_ref() { - lines.push(Line::from(vec![ - Span::styled("Last failure: ", Style::default().fg(Color::Red)), - Span::raw(last_failure.format("%Y-%m-%d %H:%M:%S").to_string()), - ])); - } - - lines.push(Line::from(format!( - "Snapshots: {} • Size: {:.1} GiB", - metrics.backup.snapshot_count, metrics.backup.size_gb - ))); - - lines.push(Line::from(format!( - "Pending jobs: {} (enabled: {})", - metrics.service.pending_jobs, metrics.service.enabled - ))); + render_metrics(frame, data, metrics, area); } else { - lines.push(Line::from(format!( - "Host {} awaiting backup metrics", - data.name - ))); + render_placeholder( + frame, + area, + &format!("Host {} awaiting backup metrics", data.name), + ); } } - None => lines.push(Line::from("No hosts configured")), + None => render_placeholder(frame, area, "No hosts 
configured"), + } +} + +fn render_metrics(frame: &mut Frame, _host: &HostDisplayData, metrics: &BackupMetrics, area: Rect) { + let color = backup_status_color(&metrics.overall_status); + let title = format!("Backups • status: {:?}", metrics.overall_status); + + let block = Block::default() + .title(Span::styled( + title, + Style::default().fg(color).add_modifier(Modifier::BOLD), + )) + .borders(Borders::ALL) + .border_style(Style::default().fg(color)) + .style(Style::default().fg(Color::White)); + + let inner = block.inner(area); + frame.render_widget(block, area); + + let chunks = Layout::default() + .direction(Direction::Vertical) + .constraints([Constraint::Length(2), Constraint::Min(1)]) + .split(inner); + + let summary_line = Line::from(vec![ + Span::styled("Snapshots: ", Style::default().add_modifier(Modifier::BOLD)), + Span::raw(metrics.backup.snapshot_count.to_string()), + Span::raw(" • Size: "), + Span::raw(format!("{:.1} GiB", metrics.backup.size_gb)), + Span::raw(" • Last success: "), + Span::raw(format_timestamp(metrics.backup.last_success.as_ref())), + ]); + + frame.render_widget( + Paragraph::new(summary_line) + .wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + chunks[0], + ); + + let header = Row::new(vec![Cell::from("Aspect"), Cell::from("Details")]).style( + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + ); + + let mut rows = Vec::new(); + rows.push( + Row::new(vec![ + Cell::from("Repo"), + Cell::from(format!( + "Snapshots: {} • Size: {:.1} GiB", + metrics.backup.snapshot_count, metrics.backup.size_gb + )), + ]) + .style(Style::default().fg(Color::White)), + ); + + rows.push( + Row::new(vec![ + Cell::from("Service"), + Cell::from(format!( + "Enabled: {} • Pending jobs: {}", + metrics.service.enabled, metrics.service.pending_jobs + )), + ]) + .style(backup_severity_style(&metrics.overall_status)), + ); + + if let Some(last_failure) = metrics.backup.last_failure.as_ref() { + rows.push( + Row::new(vec![ 
+ Cell::from("Last failure"), + Cell::from(last_failure.format("%Y-%m-%d %H:%M:%S").to_string()), + ]) + .style(Style::default().fg(Color::Red)), + ); } - let paragraph = Paragraph::new(lines).wrap(Wrap { trim: true }).block(block); + if let Some(message) = metrics.service.last_message.as_ref() { + let message_style = match metrics.overall_status { + BackupStatus::Failed => Style::default().fg(Color::Red), + BackupStatus::Warning => Style::default().fg(Color::Yellow), + _ => Style::default().fg(Color::White), + }; - frame.render_widget(paragraph, area); + rows.push( + Row::new(vec![ + Cell::from("Last message"), + Cell::from(message.clone()), + ]) + .style(message_style), + ); + } + + let table = Table::new(rows) + .header(header) + .style(Style::default().fg(Color::White)) + .widths(&[Constraint::Length(13), Constraint::Min(10)]) + .column_spacing(2); + + frame.render_widget(table, chunks[1]); +} + +fn backup_status_color(status: &BackupStatus) -> Color { + match status { + BackupStatus::Failed => Color::Red, + BackupStatus::Warning => Color::Yellow, + BackupStatus::Unknown => Color::LightYellow, + BackupStatus::Healthy => Color::Green, + } +} + +fn format_timestamp(timestamp: Option<&chrono::DateTime>) -> String { + timestamp + .map(|ts| ts.format("%Y-%m-%d %H:%M:%S").to_string()) + .unwrap_or_else(|| "—".to_string()) +} + +fn render_placeholder(frame: &mut Frame, area: Rect, message: &str) { + let block = Block::default() + .title("Backups") + .borders(Borders::ALL) + .border_style(Style::default().fg(Color::LightGreen)) + .style(Style::default().fg(Color::White)); + let inner = block.inner(area); + frame.render_widget(block, area); + frame.render_widget( + Paragraph::new(Line::from(message)) + .wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + inner, + ); +} + +fn backup_severity_style(status: &BackupStatus) -> Style { + match status { + BackupStatus::Failed => Style::default().fg(Color::Red), + BackupStatus::Warning => 
Style::default().fg(Color::Yellow), + BackupStatus::Unknown => Style::default().fg(Color::LightCyan), + BackupStatus::Healthy => Style::default().fg(Color::White), + } } diff --git a/dashboard/src/ui/dashboard.rs b/dashboard/src/ui/dashboard.rs index e0bdd38..6ee4775 100644 --- a/dashboard/src/ui/dashboard.rs +++ b/dashboard/src/ui/dashboard.rs @@ -1,19 +1,25 @@ use ratatui::layout::{Constraint, Direction, Layout, Rect}; use ratatui::style::{Color, Modifier, Style}; use ratatui::text::Span; -use ratatui::widgets::Block; +use ratatui::widgets::{Block, Cell, Row, Table}; use ratatui::Frame; use crate::app::App; -use super::{alerts, backup, memory, nvme, services}; +use super::{alerts, backup, memory, storage, services}; pub fn render(frame: &mut Frame, app: &App) { let host_summaries = app.host_display_data(); let primary_host = app.active_host_display(); + let title = if let Some(host) = primary_host.as_ref() { + format!("CM Dashboard • {}", host.name) + } else { + "CM Dashboard".to_string() + }; + let root_block = Block::default().title(Span::styled( - "CM Dashboard", + title, Style::default() .fg(Color::Cyan) .add_modifier(Modifier::BOLD), @@ -48,7 +54,7 @@ pub fn render(frame: &mut Frame, app: &App) { .constraints([Constraint::Percentage(50), Constraint::Percentage(50)]) .split(vertical_chunks[2]); - nvme::render(frame, primary_host.as_ref(), top[0]); + storage::render(frame, primary_host.as_ref(), top[0]); services::render(frame, primary_host.as_ref(), top[1]); memory::render(frame, primary_host.as_ref(), middle[0]); backup::render(frame, primary_host.as_ref(), middle[1]); @@ -61,72 +67,125 @@ pub fn render(frame: &mut Frame, app: &App) { } fn render_status(frame: &mut Frame, app: &App, area: Rect) { - use ratatui::text::Line; - use ratatui::widgets::{Paragraph, Wrap}; - - let mut lines = Vec::new(); - lines.push(Line::from(app.status_text().to_string())); - - if app.zmq_connected() { - lines.push(Line::from(vec![ - Span::styled( - "Data source: ", - 
Style::default().add_modifier(Modifier::BOLD), - ), - Span::styled("ZMQ", Style::default().fg(Color::Green)), - ])); + let connected = app.zmq_connected(); + let title_color = if connected { Color::Green } else { Color::Red }; + let title_suffix = if connected { + "connected" } else { - lines.push(Line::from(vec![ - Span::styled( - "Data source: ", - Style::default().add_modifier(Modifier::BOLD), - ), - Span::styled("ZMQ (disconnected)", Style::default().fg(Color::Red)), - ])); - } + "disconnected" + }; + + let block = Block::default() + .title(Span::styled( + format!("Status • ZMQ {title_suffix}"), + Style::default() + .fg(title_color) + .add_modifier(Modifier::BOLD), + )) + .borders(ratatui::widgets::Borders::ALL) + .border_style(Style::default().fg(title_color)) + .style(Style::default().fg(Color::White)); + + let inner = block.inner(area); + frame.render_widget(block, area); + + let mut rows: Vec = Vec::new(); + + let status_style = if connected { + Style::default().fg(Color::White) + } else { + Style::default().fg(Color::Red) + }; + + let default_style = Style::default().fg(Color::White); + + rows.push( + Row::new(vec![ + Cell::from("Status"), + Cell::from(app.status_text().to_string()), + ]) + .style(status_style), + ); + + rows.push( + Row::new(vec![ + Cell::from("Data source"), + Cell::from(if connected { + "ZMQ – connected" + } else { + "ZMQ – disconnected" + }), + ]) + .style(status_style), + ); if let Some((index, host)) = app.active_host_info() { - lines.push(Line::from(format!( - "Active host: {} ({}/{})", - host.name, - index + 1, - app.hosts().len() - ))); + let mut detail = format!("{} ({}/{})", host.name, index + 1, app.hosts().len()); + if let Some(state) = app + .host_display_data() + .into_iter() + .find(|entry| entry.name == host.name) + { + if let Some(last_success) = state.last_success { + detail = format!( + "{} • last success {}", + detail, + last_success.format("%H:%M:%S") + ); + } + } + rows.push( + Row::new(vec![Cell::from("Active 
host"), Cell::from(detail)]).style(default_style), + ); } else { - lines.push(Line::from("Active host: —")); + rows.push(Row::new(vec![Cell::from("Active host"), Cell::from("—")]).style(default_style)); } if let Some(path) = app.active_config_path() { - lines.push(Line::from(vec![ - Span::styled("Config: ", Style::default().add_modifier(Modifier::BOLD)), - Span::raw(path.display().to_string()), - ])); + rows.push( + Row::new(vec![ + Cell::from("Config"), + Cell::from(path.display().to_string()), + ]) + .style(default_style), + ); } let retention = app.history().retention(); - lines.push(Line::from(format!( - "History retention ≈ {}s", - retention.as_secs() - ))); + rows.push( + Row::new(vec![ + Cell::from("History"), + Cell::from(format!("{} seconds", retention.as_secs())), + ]) + .style(default_style), + ); if let Some(config) = app.config() { if let Some(default_host) = &config.hosts.default_host { - lines.push(Line::from(format!("Default host: {}", default_host))); + rows.push( + Row::new(vec![ + Cell::from("Default host"), + Cell::from(default_host.clone()), + ]) + .style(default_style), + ); } } - let paragraph = Paragraph::new(lines).wrap(Wrap { trim: true }).block( - Block::default() - .title(Span::styled( - "Status", - Style::default() - .fg(Color::Green) - .add_modifier(Modifier::BOLD), - )) - .borders(ratatui::widgets::Borders::ALL), + rows.push( + Row::new(vec![ + Cell::from("Monitored hosts"), + Cell::from(app.hosts().len().to_string()), + ]) + .style(default_style), ); - frame.render_widget(paragraph, area); + let table = Table::new(rows) + .widths(&[Constraint::Length(18), Constraint::Min(24)]) + .column_spacing(2) + .style(default_style); + + frame.render_widget(table, inner); } fn inner_rect(area: Rect) -> Rect { diff --git a/dashboard/src/ui/memory.rs b/dashboard/src/ui/memory.rs index a5e3772..b53f0e1 100644 --- a/dashboard/src/ui/memory.rs +++ b/dashboard/src/ui/memory.rs @@ -5,52 +5,277 @@ use ratatui::widgets::{Block, Borders, Paragraph, Wrap}; 
use ratatui::Frame; use crate::app::HostDisplayData; +use crate::data::metrics::{ServiceMetrics, ServiceSummary}; pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) { - let block = Block::default() - .title("Memory Optimization") - .borders(Borders::ALL) - .style(Style::default().fg(Color::LightMagenta)); - - let mut lines = Vec::new(); - match host { Some(data) => { if let Some(metrics) = data.services.as_ref() { - let summary = &metrics.summary; - let usage_ratio = if summary.memory_quota_mb > 0.0 { - (summary.memory_used_mb / summary.memory_quota_mb) * 100.0 - } else { - 0.0 - }; - - lines.push(Line::from(vec![ - Span::styled("Host: ", Style::default().add_modifier(Modifier::BOLD)), - Span::raw(data.name.clone()), - ])); - - lines.push(Line::from(format!( - "Memory used: {:.1} / {:.1} MiB ({:.1}%)", - summary.memory_used_mb, summary.memory_quota_mb, usage_ratio - ))); - - if let Some(last_success) = data.last_success.as_ref() { - lines.push(Line::from(format!( - "Last update: {}", - last_success.format("%H:%M:%S") - ))); - } + render_metrics(frame, data, metrics, area); } else { - lines.push(Line::from(format!( - "Host {} awaiting service metrics", - data.name - ))); + render_placeholder( + frame, + area, + &format!("Host {} awaiting service metrics", data.name), + ); } } - None => lines.push(Line::from("No hosts configured")), + None => render_placeholder(frame, area, "No hosts configured"), + } +} + +fn render_metrics(frame: &mut Frame, _host: &HostDisplayData, metrics: &ServiceMetrics, area: Rect) { + let summary = &metrics.summary; + let system_total = if summary.system_memory_total_mb > 0.0 { + summary.system_memory_total_mb + } else { + summary.memory_quota_mb + }; + + let system_used = if summary.system_memory_used_mb > 0.0 { + summary.system_memory_used_mb + } else { + summary.memory_used_mb + }; + + let usage_ratio = if system_total > 0.0 { + (system_used / system_total) * 100.0 + } else { + 0.0 + }; + + let (perf_severity, 
_reason) = evaluate_performance(summary); + let (color, severity_label) = match perf_severity { + PerfSeverity::Critical => (Color::Red, "crit"), + PerfSeverity::Warning => (Color::Yellow, "warn"), + PerfSeverity::Ok => (Color::Green, "ok"), + }; + + let title = format!("CPU / Memory • {}", severity_label); + + let block = Block::default() + .title(Span::styled( + title, + Style::default().fg(color).add_modifier(Modifier::BOLD), + )) + .borders(Borders::ALL) + .border_style(Style::default().fg(color)) + .style(Style::default().fg(Color::White)); + + let inner = block.inner(area); + frame.render_widget(block, area); + + let mut lines = Vec::new(); + + // Check if memory should be highlighted due to alert + let memory_color = if usage_ratio >= 95.0 { + Color::Red // Critical + } else if usage_ratio >= 80.0 { + Color::Yellow // Warning + } else { + Color::White // Normal + }; + + lines.push(Line::from(vec![ + Span::styled( + format!("System memory: {:.1} / {:.1} MiB ({:.1}%)", + system_used, system_total, usage_ratio), + Style::default().fg(memory_color) + ) + ])); + + // Check if CPU load should be highlighted due to alert + let cpu_load_color = if summary.cpu_load_5 >= 4.0 { + Color::Red // Critical + } else if summary.cpu_load_5 >= 2.0 { + Color::Yellow // Warning + } else { + Color::White // Normal + }; + + lines.push(Line::from(vec![ + Span::styled( + format!("CPU load (1/5/15): {:.2} {:.2} {:.2}", + summary.cpu_load_1, summary.cpu_load_5, summary.cpu_load_15), + Style::default().fg(cpu_load_color) + ) + ])); + + lines.push(Line::from(vec![ + Span::raw("CPU freq: "), + Span::raw(format_optional_metric(summary.cpu_freq_mhz, " MHz")), + ])); + + // Check if CPU temp should be highlighted due to alert + let cpu_temp_color = if let Some(temp) = summary.cpu_temp_c { + if temp >= 90.0 { + Color::Red // Critical + } else if temp >= 80.0 { + Color::Yellow // Warning + } else { + Color::White // Normal + } + } else { + Color::White // Normal + }; + + 
lines.push(Line::from(vec![ + Span::raw("CPU temp: "), + Span::styled( + format_optional_metric(summary.cpu_temp_c, "°C"), + Style::default().fg(cpu_temp_color) + ), + ])); + + if summary.gpu_load_percent.is_some() || summary.gpu_temp_c.is_some() { + // Check if GPU load should be highlighted due to alert + let gpu_load_color = if let Some(load) = summary.gpu_load_percent { + if load >= 95.0 { + Color::Red // Critical + } else if load >= 85.0 { + Color::Yellow // Warning + } else { + Color::White // Normal + } + } else { + Color::White // Normal + }; + + lines.push(Line::from(vec![ + Span::styled("GPU load: ", Style::default().add_modifier(Modifier::BOLD)), + Span::styled( + format_optional_percent(summary.gpu_load_percent), + Style::default().fg(gpu_load_color) + ), + ])); + + // Check if GPU temp should be highlighted due to alert + let gpu_temp_color = if let Some(temp) = summary.gpu_temp_c { + if temp >= 85.0 { + Color::Red // Critical + } else if temp >= 75.0 { + Color::Yellow // Warning + } else { + Color::White // Normal + } + } else { + Color::White // Normal + }; + + lines.push(Line::from(vec![ + Span::styled("GPU temp: ", Style::default().add_modifier(Modifier::BOLD)), + Span::styled( + format_optional_metric(summary.gpu_temp_c, "°C"), + Style::default().fg(gpu_temp_color) + ), + ])); } - let paragraph = Paragraph::new(lines).wrap(Wrap { trim: true }).block(block); - frame.render_widget(paragraph, area); + frame.render_widget( + Paragraph::new(lines) + .wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + inner, + ); +} + +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) enum PerfSeverity { + Ok, + Warning, + Critical, +} + +fn format_optional_metric(value: Option, unit: &str) -> String { + match value { + Some(number) => format!("{:.1}{}", number, unit), + None => "—".to_string(), + } +} + +fn format_optional_percent(value: Option) -> String { + match value { + Some(number) => format!("{:.0}%", number), + None => 
"—".to_string(), + } +} + +fn render_placeholder(frame: &mut Frame, area: Rect, message: &str) { + let block = Block::default() + .title("CPU / Memory") + .borders(Borders::ALL) + .border_style(Style::default().fg(Color::LightMagenta)) + .style(Style::default().fg(Color::White)); + let inner = block.inner(area); + frame.render_widget(block, area); + frame.render_widget( + Paragraph::new(Line::from(message)) + .wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + inner, + ); +} + +pub(crate) fn evaluate_performance(summary: &ServiceSummary) -> (PerfSeverity, Option) { + let mem_percent = if summary.system_memory_total_mb > 0.0 { + (summary.system_memory_used_mb / summary.system_memory_total_mb) * 100.0 + } else if summary.memory_quota_mb > 0.0 { + (summary.memory_used_mb / summary.memory_quota_mb) * 100.0 + } else { + 0.0 + }; + + let mut severity = PerfSeverity::Ok; + let mut reason: Option = None; + + let mut consider = |level: PerfSeverity, message: String| { + if level > severity { + severity = level; + reason = Some(message); + } + }; + + if mem_percent >= 95.0 { + consider(PerfSeverity::Critical, format!("RAM {:.0}%", mem_percent)); + } else if mem_percent >= 80.0 { + consider(PerfSeverity::Warning, format!("RAM {:.0}%", mem_percent)); + } + + let load = summary.cpu_load_5; + if load >= 4.0 { + consider(PerfSeverity::Critical, format!("CPU load {:.2}", load)); + } else if load >= 2.0 { + consider(PerfSeverity::Warning, format!("CPU load {:.2}", load)); + } + + if let Some(temp) = summary.cpu_temp_c { + if temp >= 90.0 { + consider(PerfSeverity::Critical, format!("CPU temp {:.0}°C", temp)); + } else if temp >= 80.0 { + consider(PerfSeverity::Warning, format!("CPU temp {:.0}°C", temp)); + } + } + + if let Some(load) = summary.gpu_load_percent { + if load >= 95.0 { + consider(PerfSeverity::Critical, format!("GPU load {:.0}%", load)); + } else if load >= 85.0 { + consider(PerfSeverity::Warning, format!("GPU load {:.0}%", load)); + } + } + + if 
let Some(temp) = summary.gpu_temp_c { + if temp >= 85.0 { + consider(PerfSeverity::Critical, format!("GPU temp {:.0}°C", temp)); + } else if temp >= 75.0 { + consider(PerfSeverity::Warning, format!("GPU temp {:.0}°C", temp)); + } + } + + if severity == PerfSeverity::Ok { + (PerfSeverity::Ok, None) + } else { + (severity, reason) + } } diff --git a/dashboard/src/ui/mod.rs b/dashboard/src/ui/mod.rs index 16be3a9..23427bc 100644 --- a/dashboard/src/ui/mod.rs +++ b/dashboard/src/ui/mod.rs @@ -2,7 +2,7 @@ pub mod alerts; pub mod backup; pub mod dashboard; pub mod memory; -pub mod nvme; +pub mod storage; pub mod services; pub use dashboard::render; diff --git a/dashboard/src/ui/nvme.rs b/dashboard/src/ui/nvme.rs deleted file mode 100644 index 30aa268..0000000 --- a/dashboard/src/ui/nvme.rs +++ /dev/null @@ -1,58 +0,0 @@ -use ratatui::layout::Rect; -use ratatui::style::{Color, Modifier, Style}; -use ratatui::text::{Line, Span}; -use ratatui::widgets::{Block, Borders, Paragraph, Wrap}; -use ratatui::Frame; - -use crate::app::HostDisplayData; - -pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) { - let block = Block::default() - .title("NVMe Health") - .borders(Borders::ALL) - .style(Style::default().fg(Color::LightCyan)); - - let mut lines = Vec::new(); - - match host { - Some(data) => { - if let Some(metrics) = data.smart.as_ref() { - lines.push(Line::from(vec![ - Span::styled("Host: ", Style::default().add_modifier(Modifier::BOLD)), - Span::raw(data.name.clone()), - ])); - lines.push(Line::from(vec![ - Span::styled("Status: ", Style::default().add_modifier(Modifier::BOLD)), - Span::raw(metrics.status.clone()), - ])); - lines.push(Line::from(format!( - "Drives healthy/warn/crit: {}/{}/{}", - metrics.summary.healthy, metrics.summary.warning, metrics.summary.critical - ))); - lines.push(Line::from(format!( - "Capacity used: {:.1} / {:.1} GiB", - metrics.summary.capacity_used_gb, metrics.summary.capacity_total_gb - ))); - - if let Some(issue) = 
metrics.issues.first() { - lines.push(Line::from(vec![ - Span::styled("Issue: ", Style::default().fg(Color::Yellow)), - Span::raw(issue.clone()), - ])); - } - } else { - lines.push(Line::from(format!( - "Host {} has no SMART data yet", - data.name - ))); - } - } - None => { - lines.push(Line::from("No hosts configured")); - } - } - - let paragraph = Paragraph::new(lines).wrap(Wrap { trim: true }).block(block); - - frame.render_widget(paragraph, area); -} diff --git a/dashboard/src/ui/services.rs b/dashboard/src/ui/services.rs index 9bcb71a..4e5a01f 100644 --- a/dashboard/src/ui/services.rs +++ b/dashboard/src/ui/services.rs @@ -1,54 +1,257 @@ -use ratatui::layout::Rect; +use ratatui::layout::{Constraint, Direction, Layout, Rect}; use ratatui::style::{Color, Modifier, Style}; use ratatui::text::{Line, Span}; -use ratatui::widgets::{Block, Borders, Paragraph, Wrap}; +use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, Wrap}; use ratatui::Frame; use crate::app::HostDisplayData; +use crate::data::metrics::{ServiceStatus, ServiceSummary}; pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) { - let block = Block::default() - .title("Services") - .borders(Borders::ALL) - .style(Style::default().fg(Color::Yellow)); - - let mut lines = Vec::new(); - match host { Some(data) => { if let Some(metrics) = data.services.as_ref() { - let summary = &metrics.summary; - lines.push(Line::from(vec![ - Span::styled("Host: ", Style::default().add_modifier(Modifier::BOLD)), - Span::raw(data.name.clone()), - ])); - lines.push(Line::from(format!( - "Services healthy/degraded/failed: {}/{}/{}", - summary.healthy, summary.degraded, summary.failed - ))); - lines.push(Line::from(format!( - "CPU top service: {:.1}%", - metrics - .services - .iter() - .map(|svc| svc.cpu_percent) - .fold(0.0_f32, f32::max) - ))); - lines.push(Line::from(format!( - "Total memory: {:.1} / {:.1} MiB", - summary.memory_used_mb, summary.memory_quota_mb - ))); + 
render_metrics(frame, data, metrics, area); } else { - lines.push(Line::from(format!( - "Host {} has no service metrics yet", - data.name - ))); + render_placeholder( + frame, + area, + &format!("Host {} has no service metrics yet", data.name), + ); } } - None => lines.push(Line::from("No hosts configured")), + None => render_placeholder(frame, area, "No hosts configured"), + } +} + +fn render_metrics( + frame: &mut Frame, + _host: &HostDisplayData, + metrics: &crate::data::metrics::ServiceMetrics, + area: Rect, +) { + let summary = &metrics.summary; + let color = summary_color(summary); + let disk_summary = format_disk_summary(summary.disk_used_gb, summary.disk_total_gb); + let title = format!( + "Services • ok:{} warn:{} fail:{} • Disk: {}", + summary.healthy, summary.degraded, summary.failed, disk_summary + ); + + let block = Block::default() + .title(Span::styled( + title, + Style::default().fg(color).add_modifier(Modifier::BOLD), + )) + .borders(Borders::ALL) + .border_style(Style::default().fg(color)) + .style(Style::default().fg(Color::White)); + + let inner = block.inner(area); + frame.render_widget(block, area); + + let chunks = Layout::default() + .direction(Direction::Vertical) + .constraints([Constraint::Length(2), Constraint::Min(1)]) + .split(inner); + + let mut summary_lines = Vec::new(); + summary_lines.push(Line::from(vec![ + Span::styled( + "Service memory: ", + Style::default().add_modifier(Modifier::BOLD), + ), + Span::raw(format_memory(summary)), + ])); + + let disk_text = if summary.disk_total_gb > 0.0 { + format!( + "{:.1} / {:.1} GiB", + summary.disk_used_gb, summary.disk_total_gb + ) + } else { + "—".to_string() + }; + + summary_lines.push(Line::from(vec![ + Span::styled( + "Disk usage: ", + Style::default().add_modifier(Modifier::BOLD), + ), + Span::raw(disk_text), + ])); + + summary_lines.push(Line::from(vec![ + Span::styled( + "Services tracked: ", + Style::default().add_modifier(Modifier::BOLD), + ), + 
Span::raw(metrics.services.len().to_string()), + ])); + + frame.render_widget( + Paragraph::new(summary_lines) + .wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + chunks[0], + ); + + if metrics.services.is_empty() { + frame.render_widget( + Paragraph::new("No services reported") + .wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + chunks[1], + ); + return; } - let paragraph = Paragraph::new(lines).wrap(Wrap { trim: true }).block(block); + let mut services = metrics.services.clone(); + services.sort_by(|a, b| { + status_weight(&a.status) + .cmp(&status_weight(&b.status)) + .then_with(|| a.name.cmp(&b.name)) + }); - frame.render_widget(paragraph, area); + let header = Row::new(vec![ + Cell::from(""), + Cell::from("Service"), + Cell::from("Memory"), + Cell::from("Disk"), + ]) + .style( + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + ); + + let rows = services.into_iter().map(|svc| { + let row_style = status_style(&svc.status); + Row::new(vec![ + Cell::from(status_symbol(&svc.status)), + Cell::from(format_service_name(&svc.name)), + Cell::from(format_memory_value(svc.memory_used_mb, svc.memory_quota_mb)), + Cell::from(format_disk_value(svc.disk_used_gb)), + ]) + .style(row_style) + }); + + let table = Table::new(rows) + .header(header) + .style(Style::default().fg(Color::White)) + .widths(&[ + Constraint::Length(1), + Constraint::Length(10), + Constraint::Length(12), + Constraint::Length(8), + ]) + .column_spacing(2); + + frame.render_widget(table, chunks[1]); } + +fn status_weight(status: &ServiceStatus) -> i32 { + match status { + ServiceStatus::Stopped => 0, + ServiceStatus::Degraded => 1, + ServiceStatus::Restarting => 2, + ServiceStatus::Running => 3, + } +} + +fn status_symbol(status: &ServiceStatus) -> &'static str { + match status { + ServiceStatus::Running => "✔", + ServiceStatus::Degraded => "!", + ServiceStatus::Restarting => "↻", + ServiceStatus::Stopped => "✖", + } +} + +fn 
status_style(status: &ServiceStatus) -> Style { + match status { + ServiceStatus::Running => Style::default().fg(Color::White), + ServiceStatus::Degraded => Style::default().fg(Color::Yellow), + ServiceStatus::Restarting => Style::default().fg(Color::Yellow), + ServiceStatus::Stopped => Style::default().fg(Color::Red), + } +} + +fn summary_color(summary: &ServiceSummary) -> Color { + if summary.failed > 0 { + Color::Red + } else if summary.degraded > 0 { + Color::Yellow + } else { + Color::Green + } +} + +fn format_memory(summary: &ServiceSummary) -> String { + if summary.memory_quota_mb > 0.0 { + format!( + "{:.1}/{:.1} MiB", + summary.memory_used_mb, summary.memory_quota_mb + ) + } else { + format!("{:.1} MiB used", summary.memory_used_mb) + } +} + +fn format_memory_value(used: f32, quota: f32) -> String { + if quota > 0.05 { + format!("{:.1}/{:.1} MiB", used, quota) + } else if used > 0.05 { + format!("{:.1} MiB", used) + } else { + "—".to_string() + } +} + +fn format_disk_summary(used: f32, total: f32) -> String { + if total > 0.05 { + format!("{:.1}/{:.1} GiB", used, total) + } else if used > 0.05 { + format!("{:.1} GiB", used) + } else { + "—".to_string() + } +} + +fn format_disk_value(used: f32) -> String { + if used >= 1.0 { + format!("{:.1} GiB", used) + } else if used >= 0.001 { // 1 MB or more + format!("{:.0} MiB", used * 1024.0) + } else if used > 0.0 { + format!("<1 MiB") + } else { + "—".to_string() + } +} + +fn format_service_name(name: &str) -> String { + let mut truncated = String::with_capacity(10); + for ch in name.chars().take(10) { + truncated.push(ch); + } + truncated +} + +fn render_placeholder(frame: &mut Frame, area: Rect, message: &str) { + let block = Block::default() + .title("Services") + .borders(Borders::ALL) + .border_style(Style::default().fg(Color::Yellow)) + .style(Style::default().fg(Color::White)); + let inner = block.inner(area); + frame.render_widget(block, area); + frame.render_widget( + Paragraph::new(Line::from(message)) + 
.wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + inner, + ); +} + + diff --git a/dashboard/src/ui/storage.rs b/dashboard/src/ui/storage.rs new file mode 100644 index 0000000..41aa78c --- /dev/null +++ b/dashboard/src/ui/storage.rs @@ -0,0 +1,196 @@ +use ratatui::layout::{Constraint, Direction, Layout, Rect}; +use ratatui::style::{Color, Modifier, Style}; +use ratatui::text::{Line, Span}; +use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, Wrap}; +use ratatui::Frame; + +use crate::app::HostDisplayData; +use crate::data::metrics::SmartMetrics; + +pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) { + match host { + Some(data) => { + if let Some(metrics) = data.smart.as_ref() { + render_metrics(frame, data, metrics, area); + } else { + render_placeholder( + frame, + area, + &format!("Host {} has no SMART data yet", data.name), + ); + } + } + None => render_placeholder(frame, area, "No hosts configured"), + } +} + +fn render_metrics(frame: &mut Frame, _host: &HostDisplayData, metrics: &SmartMetrics, area: Rect) { + let color = smart_status_color(&metrics.status); + let title = format!( + "Storage • ok:{} warn:{} crit:{}", + metrics.summary.healthy, metrics.summary.warning, metrics.summary.critical + ); + + let block = Block::default() + .title(Span::styled( + title, + Style::default().fg(color).add_modifier(Modifier::BOLD), + )) + .borders(Borders::ALL) + .border_style(Style::default().fg(color)) + .style(Style::default().fg(Color::White)); + + let inner = block.inner(area); + frame.render_widget(block, area); + + + let issue_count = metrics.issues.len(); + let body_constraints = if issue_count > 0 { + vec![Constraint::Min(1), Constraint::Length(2)] + } else { + vec![Constraint::Min(1)] + }; + + let body_chunks = Layout::default() + .direction(Direction::Vertical) + .constraints(body_constraints) + .split(inner); + + if metrics.drives.is_empty() { + frame.render_widget( + Paragraph::new("No drives 
reported") + .wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + body_chunks[0], + ); + } else { + let header = Row::new(vec![ + Cell::from("Drive"), + Cell::from("Temp"), + Cell::from("Wear"), + Cell::from("Spare"), + Cell::from("Hours"), + Cell::from("Capacity"), + Cell::from("Usage"), + ]) + .style( + Style::default() + .fg(Color::White) + .add_modifier(Modifier::BOLD), + ); + + let rows = metrics.drives.iter().map(|drive| { + Row::new(vec![ + Cell::from(format_drive_name(&drive.name)), + Cell::from(format_temperature(drive.temperature_c)), + Cell::from(format_percent(drive.wear_level)), + Cell::from(format_percent(drive.available_spare)), + Cell::from(drive.power_on_hours.to_string()), + Cell::from(format_capacity(drive.capacity_gb)), + Cell::from(format_usage(drive.used_gb, drive.capacity_gb)), + ]) + }); + + let table = Table::new(rows) + .header(header) + .style(Style::default().fg(Color::White)) + .widths(&[ + Constraint::Length(10), // Drive name + Constraint::Length(8), // Temp + Constraint::Length(8), // Wear + Constraint::Length(8), // Spare + Constraint::Length(10), // Hours + Constraint::Length(10), // Capacity + Constraint::Min(8), // Usage + ]) + .column_spacing(2); + + frame.render_widget(table, body_chunks[0]); + } + + if issue_count > 0 { + let issue_line = Line::from(vec![ + Span::styled("Issue: ", Style::default().fg(Color::Yellow)), + Span::styled( + metrics.issues[0].clone(), + Style::default().fg(Color::Yellow), + ), + ]); + + frame.render_widget( + Paragraph::new(issue_line) + .wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + body_chunks[1], + ); + } +} + +fn smart_status_color(status: &str) -> Color { + match status.to_uppercase().as_str() { + "CRITICAL" => Color::Red, + "WARNING" => Color::Yellow, + _ => Color::Green, + } +} + +fn format_temperature(value: f32) -> String { + if value.abs() < f32::EPSILON { + "—".to_string() + } else { + format!("{:.0}°C", value) + } +} + +fn 
format_percent(value: f32) -> String { + if value.abs() < f32::EPSILON { + "—".to_string() + } else { + format!("{:.0}%", value) + } +} + +fn format_drive_name(name: &str) -> String { + let mut truncated = String::with_capacity(10); + for ch in name.chars().take(10) { + truncated.push(ch); + } + truncated +} + +fn format_capacity(value: Option) -> String { + match value { + Some(gb) if gb > 0.0 => format!("{:.0}G", gb), + _ => "—".to_string(), + } +} + +fn format_usage(used: Option, capacity: Option) -> String { + match (used, capacity) { + (Some(used_gb), Some(total_gb)) if used_gb > 0.0 && total_gb > 0.0 => { + let percent = (used_gb / total_gb) * 100.0; + format!("{:.0}G ({:.0}%)", used_gb, percent) + } + (Some(used_gb), None) if used_gb > 0.0 => { + format!("{:.0}G", used_gb) + } + _ => "—".to_string(), + } +} + + +fn render_placeholder(frame: &mut Frame, area: Rect, message: &str) { + let block = Block::default() + .title("Storage") + .borders(Borders::ALL) + .border_style(Style::default().fg(Color::LightCyan)) + .style(Style::default().fg(Color::White)); + let inner = block.inner(area); + frame.render_widget(block, area); + frame.render_widget( + Paragraph::new(Line::from(message)) + .wrap(Wrap { trim: true }) + .style(Style::default().fg(Color::White)), + inner, + ); +} diff --git a/shared/src/envelope.rs b/shared/src/envelope.rs index a509afc..1847e52 100644 --- a/shared/src/envelope.rs +++ b/shared/src/envelope.rs @@ -17,3 +17,6 @@ pub struct MetricsEnvelope { #[serde(default)] pub metrics: Value, } + +// Alias for backward compatibility +pub type MessageEnvelope = MetricsEnvelope;