diff --git a/CLAUDE.md b/CLAUDE.md index 123e64f..d30d568 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -232,12 +232,12 @@ Agent (calculations + thresholds) → Status → Dashboard (display only) → Ta - Rate limiting: configurable (set to 0 for testing, 30 minutes for production) **Monitored Components:** -- system.cpu (load status) -- system.cpu_temp (temperature status) -- system.memory (usage status) -- system.services (service health status) -- storage.smart (drive health) -- backup.overall (backup status) +- system.cpu (load status) - SystemCollector +- system.memory (usage status) - SystemCollector +- system.cpu_temp (temperature status) - SystemCollector (disabled) +- system.services (service health status) - ServiceCollector +- storage.smart (drive health) - SmartCollector +- backup.overall (backup status) - BackupCollector ### Pure Auto-Discovery Implementation @@ -262,10 +262,24 @@ Agent (calculations + thresholds) → Status → Dashboard (display only) → Ta - [x] CPU temperature monitoring and notifications - [x] ZMQ message format standardization - [x] Removed all hardcoded dashboard thresholds +- [x] CPU thresholds restored to production values (5.0/8.0) +- [x] All collectors output standardized status strings (ok/warning/critical/unknown) +- [x] Dashboard connection loss detection with 5-second keep-alive +- [x] Removed excessive logging from agent +- [x] Fixed all compiler warnings in both agent and dashboard +- [x] **SystemCollector architecture refactoring completed (2025-10-12)** +- [x] Created SystemCollector for CPU load, memory, temperature, C-states +- [x] Moved system metrics from ServiceCollector to SystemCollector +- [x] Updated dashboard to parse and display SystemCollector data +- [x] Enhanced service notifications to include specific failure details +- [x] CPU temperature thresholds set to 100°C (effectively disabled) -**Testing Configuration (REVERT FOR PRODUCTION):** -- CPU thresholds lowered to 2.0/4.0 for easy testing -- Email rate limiting 
disabled (0 minutes) +**Production Configuration:** +- CPU load thresholds: Warning ≥ 5.0, Critical ≥ 8.0 +- CPU temperature thresholds: Warning ≥ 100°C, Critical ≥ 100°C (effectively disabled) +- Memory usage thresholds: Warning ≥ 80%, Critical ≥ 95% +- Connection timeout: 15 seconds (agents send data every 5 seconds) +- Email rate limiting: 30 minutes (set to 0 for testing) ### Development Guidelines diff --git a/agent/src/collectors/mod.rs b/agent/src/collectors/mod.rs index 4433afe..6a4d078 100644 --- a/agent/src/collectors/mod.rs +++ b/agent/src/collectors/mod.rs @@ -6,6 +6,7 @@ pub mod backup; pub mod error; pub mod service; pub mod smart; +pub mod system; pub use error::CollectorError; diff --git a/agent/src/collectors/service.rs b/agent/src/collectors/service.rs index 68a2778..8be0210 100644 --- a/agent/src/collectors/service.rs +++ b/agent/src/collectors/service.rs @@ -2,7 +2,6 @@ use async_trait::async_trait; use chrono::Utc; use serde::Serialize; use serde_json::json; -use std::collections::HashMap; use std::process::Stdio; use std::time::Duration; use tokio::fs; @@ -284,33 +283,6 @@ impl ServiceCollector { Ok(0.0) // No limit or couldn't parse } - async fn get_system_memory_info(&self) -> Result { - let meminfo = - fs::read_to_string("/proc/meminfo") - .await - .map_err(|e| CollectorError::IoError { - message: e.to_string(), - })?; - - let mut memory_info = HashMap::new(); - for line in meminfo.lines() { - if let Some((key, value)) = line.split_once(':') { - let value = value.trim().trim_end_matches(" kB"); - if let Ok(kb) = value.parse::() { - memory_info.insert(key.to_string(), kb); - } - } - } - - let total_kb = memory_info.get("MemTotal").copied().unwrap_or(0); - let available_kb = memory_info.get("MemAvailable").copied().unwrap_or(0); - let used_kb = total_kb.saturating_sub(available_kb); - - Ok(SystemMemoryInfo { - total_mb: total_kb as f32 / 1024.0, - used_mb: used_kb as f32 / 1024.0, - }) - } async fn get_disk_usage(&self) -> Result { let output 
= Command::new("df") @@ -363,59 +335,9 @@ impl ServiceCollector { }) } - async fn get_cpu_load(&self) -> Result<(f32, f32, f32), CollectorError> { - let loadavg = - fs::read_to_string("/proc/loadavg") - .await - .map_err(|e| CollectorError::IoError { - message: e.to_string(), - })?; - let parts: Vec<&str> = loadavg.split_whitespace().collect(); - if parts.len() < 3 { - return Err(CollectorError::ParseError { - message: "Unexpected /proc/loadavg format".to_string(), - }); - } - let parse = |s: &str| -> Result { - s.parse::().map_err(|e| CollectorError::ParseError { - message: format!("Failed to parse load average '{}': {}", s, e), - }) - }; - Ok((parse(parts[0])?, parse(parts[1])?, parse(parts[2])?)) - } - - fn determine_cpu_status(&self, cpu_load_5: f32) -> String { - if cpu_load_5 >= 8.0 { - "critical".to_string() - } else if cpu_load_5 >= 5.0 { - "warning".to_string() - } else { - "ok".to_string() - } - } - - fn determine_memory_status(&self, usage_percent: f32) -> String { - if usage_percent >= 95.0 { - "critical".to_string() - } else if usage_percent >= 80.0 { - "warning".to_string() - } else { - "ok".to_string() - } - } - - fn determine_cpu_temp_status(&self, temp_c: f32) -> String { - if temp_c >= 80.0 { - "critical".to_string() - } else if temp_c >= 70.0 { - "warning".to_string() - } else { - "ok".to_string() - } - } fn determine_services_status(&self, healthy: usize, degraded: usize, failed: usize) -> String { if failed > 0 { @@ -429,84 +351,6 @@ impl ServiceCollector { } } - async fn get_cpu_cstate_info(&self) -> Option> { - // Read C-state information to show all sleep state distributions - let mut cstate_times: Vec<(String, u64)> = Vec::new(); - let mut total_time = 0u64; - - // Check if C-state information is available - if let Ok(mut entries) = fs::read_dir("/sys/devices/system/cpu/cpu0/cpuidle").await { - while let Ok(Some(entry)) = entries.next_entry().await { - let state_path = entry.path(); - let name_path = state_path.join("name"); - let time_path 
= state_path.join("time"); - - if let (Ok(name), Ok(time_str)) = ( - fs::read_to_string(&name_path).await, - fs::read_to_string(&time_path).await - ) { - let name = name.trim().to_string(); - if let Ok(time) = time_str.trim().parse::() { - total_time += time; - cstate_times.push((name, time)); - } - } - } - - if total_time > 0 && !cstate_times.is_empty() { - // Sort by time spent (highest first) - cstate_times.sort_by(|a, b| b.1.cmp(&a.1)); - - // Format all C-states with percentages - let mut result = Vec::new(); - for (name, time) in cstate_times { - let percent = (time as f32 / total_time as f32) * 100.0; - if percent >= 0.1 { // Only show states with at least 0.1% time - result.push(format!("{}: {:.1}%", name, percent)); - } - } - - return Some(result); - } - } - - None - } - - async fn get_cpu_temperature_c(&self) -> Option { - let mut entries = fs::read_dir("/sys/class/thermal").await.ok()?; - let mut fallback: Option = None; - - while let Ok(Some(entry)) = entries.next_entry().await { - let path = entry.path(); - let type_path = path.join("type"); - let temp_path = path.join("temp"); - - let label = fs::read_to_string(&type_path).await.ok()?.to_lowercase(); - let raw = match fs::read_to_string(&temp_path).await { - Ok(value) => value, - Err(_) => continue, - }; - - let milli: f32 = match raw.trim().parse() { - Ok(value) => value, - Err(_) => continue, - }; - - let temp_c = milli / 1000.0; - if label.contains("cpu") || label.contains("pkg") { - if temp_c > 0.0 { - return Some(temp_c); - } - } - - if fallback.is_none() && temp_c > 0.0 { - fallback = Some(temp_c); - } - } - - fallback - } async fn get_gpu_metrics(&self) -> (Option, Option) { let output = Command::new("nvidia-smi") @@ -983,43 +827,21 @@ impl Collector for ServiceCollector { } } - // Get system memory info for quota calculation - let system_memory = self - .get_system_memory_info() - .await - .unwrap_or(SystemMemoryInfo { - total_mb: 0.0, - used_mb: 0.0, - }); let _disk_usage = 
self.get_disk_usage().await.unwrap_or(DiskUsage { total_gb: 0.0, used_gb: 0.0, }); - let (cpu_load_1, cpu_load_5, cpu_load_15) = - self.get_cpu_load().await.unwrap_or((0.0, 0.0, 0.0)); - let cpu_status = self.determine_cpu_status(cpu_load_5); - - // Calculate memory usage percentage and status - let memory_usage_percent = if system_memory.total_mb > 0.0 { - (system_memory.used_mb / system_memory.total_mb) * 100.0 - } else { - 0.0 - }; - let memory_status = self.determine_memory_status(memory_usage_percent); // Calculate overall services status let services_status = self.determine_services_status(healthy, degraded, failed); - let cpu_cstate_info = self.get_cpu_cstate_info().await; - let cpu_temp_c = self.get_cpu_temperature_c().await; - let cpu_temp_status = cpu_temp_c.map(|temp| self.determine_cpu_temp_status(temp)); let (gpu_load_percent, gpu_temp_c) = self.get_gpu_metrics().await; - // If no specific quotas are set, use system memory as reference + // If no specific quotas are set, use a default value if total_memory_quota == 0.0 { - total_memory_quota = system_memory.total_mb; + total_memory_quota = 8192.0; // Default 8GB for quota calculation } let service_metrics = json!({ @@ -1030,18 +852,8 @@ impl Collector for ServiceCollector { "services_status": services_status, "memory_used_mb": total_memory_used, "memory_quota_mb": total_memory_quota, - "system_memory_used_mb": system_memory.used_mb, - "system_memory_total_mb": system_memory.total_mb, - "memory_status": memory_status, "disk_used_gb": total_disk_used, "disk_total_gb": total_disk_used, // For services, total = used (no quota concept) - "cpu_load_1": cpu_load_1, - "cpu_load_5": cpu_load_5, - "cpu_load_15": cpu_load_15, - "cpu_status": cpu_status, - "cpu_cstate": cpu_cstate_info, - "cpu_temp_c": cpu_temp_c, - "cpu_temp_status": cpu_temp_status, "gpu_load_percent": gpu_load_percent, "gpu_temp_c": gpu_temp_c, }, @@ -1077,10 +889,6 @@ enum ServiceStatus { Stopped, } -struct SystemMemoryInfo { - total_mb: f32, 
- used_mb: f32, -} #[allow(dead_code)] struct DiskUsage { diff --git a/agent/src/collectors/system.rs b/agent/src/collectors/system.rs new file mode 100644 index 0000000..ffbd3f3 --- /dev/null +++ b/agent/src/collectors/system.rs @@ -0,0 +1,271 @@ +use async_trait::async_trait; +use serde_json::json; +use std::time::Duration; +use tokio::fs; +use tokio::process::Command; +use tracing::debug; + +use super::{Collector, CollectorError, CollectorOutput, AgentType}; + +pub struct SystemCollector { + enabled: bool, + interval: Duration, +} + +impl SystemCollector { + pub fn new(enabled: bool, interval_ms: u64) -> Self { + Self { + enabled, + interval: Duration::from_millis(interval_ms), + } + } + + async fn get_cpu_load(&self) -> Result<(f32, f32, f32), CollectorError> { + let output = Command::new("uptime") + .output() + .await + .map_err(|e| CollectorError::CommandFailed { + command: "uptime".to_string(), + message: e.to_string() + })?; + + let uptime_str = String::from_utf8_lossy(&output.stdout); + + // Parse load averages from uptime output + // Format with comma decimals: "... 
load average: 3,30, 3,17, 2,84" + if let Some(load_part) = uptime_str.split("load average:").nth(1) { + // Use regex or careful parsing for comma decimal separator locale + let load_str = load_part.trim(); + // Split on ", " to separate the three load values + let loads: Vec<&str> = load_str.split(", ").collect(); + if loads.len() >= 3 { + let load_1 = loads[0].trim().replace(',', ".").parse::<f32>() + .map_err(|_| CollectorError::ParseError { message: "Failed to parse 1min load".to_string() })?; + let load_5 = loads[1].trim().replace(',', ".").parse::<f32>() + .map_err(|_| CollectorError::ParseError { message: "Failed to parse 5min load".to_string() })?; + let load_15 = loads[2].trim().replace(',', ".").parse::<f32>() + .map_err(|_| CollectorError::ParseError { message: "Failed to parse 15min load".to_string() })?; + + return Ok((load_1, load_5, load_15)); + } + } + + Err(CollectorError::ParseError { message: "Failed to parse load averages".to_string() }) + } + + async fn get_cpu_temperature(&self) -> Option<f32> { + // Try to find CPU-specific thermal zones first (x86_pkg_temp, coretemp, etc.) 
+ for i in 0..10 { + let type_path = format!("/sys/class/thermal/thermal_zone{}/type", i); + let temp_path = format!("/sys/class/thermal/thermal_zone{}/temp", i); + + if let (Ok(zone_type), Ok(temp_str)) = ( + fs::read_to_string(&type_path).await, + fs::read_to_string(&temp_path).await, + ) { + let zone_type = zone_type.trim(); + if let Ok(temp_millic) = temp_str.trim().parse::<f32>() { + let temp_c = temp_millic / 1000.0; + // Look for reasonable temperatures first + if temp_c > 20.0 && temp_c < 150.0 { + // Prefer CPU package temperature zones + if zone_type == "x86_pkg_temp" || zone_type.contains("coretemp") { + debug!("Found CPU temperature: {}°C from {} ({})", temp_c, temp_path, zone_type); + return Some(temp_c); + } + } + } + } + } + + // Fallback: try any reasonable temperature if no CPU-specific zone found + for i in 0..10 { + let temp_path = format!("/sys/class/thermal/thermal_zone{}/temp", i); + if let Ok(temp_str) = fs::read_to_string(&temp_path).await { + if let Ok(temp_millic) = temp_str.trim().parse::<f32>() { + let temp_c = temp_millic / 1000.0; + if temp_c > 20.0 && temp_c < 150.0 { + debug!("Found fallback temperature: {}°C from {}", temp_c, temp_path); + return Some(temp_c); + } + } + } + } + None + } + + async fn get_memory_info(&self) -> Result<(f32, f32), CollectorError> { + let meminfo = fs::read_to_string("/proc/meminfo") + .await + .map_err(|e| CollectorError::IoError { message: format!("Failed to read /proc/meminfo: {}", e) })?; + + let mut total_kb = 0; + let mut available_kb = 0; + + for line in meminfo.lines() { + if line.starts_with("MemTotal:") { + if let Some(value) = line.split_whitespace().nth(1) { + total_kb = value.parse::<u64>().unwrap_or(0); + } + } else if line.starts_with("MemAvailable:") { + if let Some(value) = line.split_whitespace().nth(1) { + available_kb = value.parse::<u64>().unwrap_or(0); + } + } + } + + if total_kb == 0 { + return Err(CollectorError::ParseError { message: "Could not parse total memory".to_string() }); + } + + let 
total_mb = total_kb as f32 / 1024.0; + let used_mb = total_mb - (available_kb as f32 / 1024.0); + + Ok((used_mb, total_mb)) + } + + async fn get_cpu_cstate_info(&self) -> Option> { + // Read C-state information to show all sleep state distributions + let mut cstate_times: Vec<(String, u64)> = Vec::new(); + let mut total_time = 0u64; + + // Check if C-state information is available + if let Ok(mut entries) = fs::read_dir("/sys/devices/system/cpu/cpu0/cpuidle").await { + while let Ok(Some(entry)) = entries.next_entry().await { + let state_path = entry.path(); + let name_path = state_path.join("name"); + let time_path = state_path.join("time"); + + if let (Ok(name), Ok(time_str)) = ( + fs::read_to_string(&name_path).await, + fs::read_to_string(&time_path).await + ) { + let name = name.trim().to_string(); + if let Ok(time) = time_str.trim().parse::() { + total_time += time; + cstate_times.push((name, time)); + } + } + } + + if total_time > 0 && !cstate_times.is_empty() { + // Sort by time spent (highest first) + cstate_times.sort_by(|a, b| b.1.cmp(&a.1)); + + // Format all C-states with percentages + let mut result = Vec::new(); + for (name, time) in cstate_times { + let percent = (time as f32 / total_time as f32) * 100.0; + if percent >= 0.1 { // Only show states with at least 0.1% time + result.push(format!("{}: {:.1}%", name, percent)); + } + } + + return Some(result); + } + } + + None + } + + fn determine_cpu_status(&self, cpu_load_5: f32) -> String { + if cpu_load_5 >= 8.0 { + "critical".to_string() + } else if cpu_load_5 >= 5.0 { + "warning".to_string() + } else { + "ok".to_string() + } + } + + fn determine_cpu_temp_status(&self, temp_c: f32) -> String { + if temp_c >= 100.0 { + "critical".to_string() + } else if temp_c >= 100.0 { + "warning".to_string() + } else { + "ok".to_string() + } + } + + fn determine_memory_status(&self, usage_percent: f32) -> String { + if usage_percent >= 95.0 { + "critical".to_string() + } else if usage_percent >= 80.0 { + 
"warning".to_string() + } else { + "ok".to_string() + } + } +} + +#[async_trait] +impl Collector for SystemCollector { + fn name(&self) -> &str { + "system" + } + + fn agent_type(&self) -> AgentType { + AgentType::System + } + + fn collect_interval(&self) -> Duration { + self.interval + } + + async fn collect(&self) -> Result { + if !self.enabled { + return Err(CollectorError::ConfigError { message: "SystemCollector disabled".to_string() }); + } + + // Get CPU load averages + let (cpu_load_1, cpu_load_5, cpu_load_15) = self.get_cpu_load().await?; + let cpu_status = self.determine_cpu_status(cpu_load_5); + + // Get CPU temperature (optional) + let cpu_temp_c = self.get_cpu_temperature().await; + let cpu_temp_status = cpu_temp_c.map(|temp| self.determine_cpu_temp_status(temp)); + + // Get memory information + let (memory_used_mb, memory_total_mb) = self.get_memory_info().await?; + let memory_usage_percent = (memory_used_mb / memory_total_mb) * 100.0; + let memory_status = self.determine_memory_status(memory_usage_percent); + + // Get C-state information (optional) + let cpu_cstate_info = self.get_cpu_cstate_info().await; + + let mut system_metrics = json!({ + "summary": { + "cpu_load_1": cpu_load_1, + "cpu_load_5": cpu_load_5, + "cpu_load_15": cpu_load_15, + "cpu_status": cpu_status, + "memory_used_mb": memory_used_mb, + "memory_total_mb": memory_total_mb, + "memory_usage_percent": memory_usage_percent, + "memory_status": memory_status, + }, + "timestamp": chrono::Utc::now().timestamp() as u64, + }); + + // Add optional metrics if available + if let Some(temp) = cpu_temp_c { + system_metrics["summary"]["cpu_temp_c"] = json!(temp); + if let Some(status) = cpu_temp_status { + system_metrics["summary"]["cpu_temp_status"] = json!(status); + } + } + + if let Some(cstates) = cpu_cstate_info { + system_metrics["summary"]["cpu_cstate"] = json!(cstates); + } + + debug!("System metrics collected: CPU load {:.2}, Memory {:.1}%", + cpu_load_5, memory_usage_percent); + + 
Ok(CollectorOutput { + agent_type: AgentType::System, + data: system_metrics, + }) + } +} \ No newline at end of file diff --git a/agent/src/notifications.rs b/agent/src/notifications.rs index 7940346..9b947e0 100644 --- a/agent/src/notifications.rs +++ b/agent/src/notifications.rs @@ -35,6 +35,7 @@ pub struct StatusChange { pub old_status: String, pub new_status: String, pub timestamp: DateTime, + pub details: Option, } pub struct NotificationManager { @@ -53,6 +54,10 @@ impl NotificationManager { } pub fn update_status(&mut self, component: &str, metric: &str, status: &str) -> Option { + self.update_status_with_details(component, metric, status, None) + } + + pub fn update_status_with_details(&mut self, component: &str, metric: &str, status: &str, details: Option) -> Option { let key = format!("{}.{}", component, metric); let old_status = self.last_status.get(&key).cloned(); @@ -64,6 +69,7 @@ impl NotificationManager { old_status: old.clone(), new_status: status.to_string(), timestamp: Utc::now(), + details, }; self.last_status.insert(key, status.to_string()); @@ -154,26 +160,34 @@ impl NotificationManager { } fn format_body(&self, change: &StatusChange) -> String { - format!( + let mut body = format!( "Status Change Alert\n\ \n\ Host: {}\n\ Component: {}\n\ Metric: {}\n\ Status Change: {} → {}\n\ - Time: {}\n\ - \n\ - --\n\ - CM Dashboard Agent\n\ - Generated at {}", + Time: {}", gethostname::gethostname().to_string_lossy(), change.component, change.metric, change.old_status, change.new_status, - change.timestamp.with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST"), + change.timestamp.with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST") + ); + + if let Some(details) = &change.details { + body.push_str(&format!("\n\nDetails:\n{}", details)); + } + + body.push_str(&format!( + "\n\n--\n\ + CM Dashboard Agent\n\ + Generated at {}", Utc::now().with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST") - ) + )); + + body } async fn 
send_email(&self, subject: &str, body: &str) -> Result<(), Box> { diff --git a/agent/src/simple_agent.rs b/agent/src/simple_agent.rs index 3557d38..96f63d9 100644 --- a/agent/src/simple_agent.rs +++ b/agent/src/simple_agent.rs @@ -9,6 +9,7 @@ use crate::collectors::{ backup::BackupCollector, service::ServiceCollector, smart::SmartCollector, + system::SystemCollector, Collector }; use cm_dashboard_shared::envelope::AgentType; @@ -60,6 +61,11 @@ impl SimpleAgent { warn!("No storage devices found - SMART monitoring disabled"); } + // System collector + let system_collector = SystemCollector::new(true, 5000); + collectors.push(Box::new(system_collector)); + info!("System monitoring: CPU, memory, temperature, C-states"); + // Service collector let services = AutoDiscovery::discover_services().await; let service_list = if !services.is_empty() { @@ -161,32 +167,10 @@ impl SimpleAgent { match output.agent_type { AgentType::Service => { if let Some(summary) = output.data.get("summary") { - // Check CPU status - if let Some(cpu_status) = summary.get("cpu_status").and_then(|v| v.as_str()) { - if let Some(change) = self.notification_manager.update_status("system", "cpu", cpu_status) { - info!("CPU status change detected: {} -> {}", change.old_status, change.new_status); - self.notification_manager.send_notification(change).await; - } - } - - // Check memory status - if let Some(memory_status) = summary.get("memory_status").and_then(|v| v.as_str()) { - if let Some(change) = self.notification_manager.update_status("system", "memory", memory_status) { - self.notification_manager.send_notification(change).await; - } - } - - // Check CPU temperature status - if let Some(cpu_temp_status) = summary.get("cpu_temp_status").and_then(|v| v.as_str()) { - if let Some(change) = self.notification_manager.update_status("system", "cpu_temp", cpu_temp_status) { - info!("CPU temp status change detected: {} -> {}", change.old_status, change.new_status); - 
self.notification_manager.send_notification(change).await; - } - } - // Check services status if let Some(services_status) = summary.get("services_status").and_then(|v| v.as_str()) { - if let Some(change) = self.notification_manager.update_status("system", "services", services_status) { + let details = self.build_service_failure_details(output); + if let Some(change) = self.notification_manager.update_status_with_details("system", "services", services_status, details) { self.notification_manager.send_notification(change).await; } } @@ -205,6 +189,33 @@ impl SimpleAgent { } } } + AgentType::System => { + if let Some(summary) = output.data.get("summary") { + // Check CPU status + if let Some(cpu_status) = summary.get("cpu_status").and_then(|v| v.as_str()) { + if let Some(change) = self.notification_manager.update_status("system", "cpu", cpu_status) { + info!("CPU status change detected: {} -> {}", change.old_status, change.new_status); + self.notification_manager.send_notification(change).await; + } + } + + // Check memory status + if let Some(memory_status) = summary.get("memory_status").and_then(|v| v.as_str()) { + if let Some(change) = self.notification_manager.update_status("system", "memory", memory_status) { + info!("Memory status change detected: {} -> {}", change.old_status, change.new_status); + self.notification_manager.send_notification(change).await; + } + } + + // Check CPU temp status (optional) + if let Some(cpu_temp_status) = summary.get("cpu_temp_status").and_then(|v| v.as_str()) { + if let Some(change) = self.notification_manager.update_status("system", "cpu_temp", cpu_temp_status) { + info!("CPU temp status change detected: {} -> {}", change.old_status, change.new_status); + self.notification_manager.send_notification(change).await; + } + } + } + } AgentType::Backup => { if let Some(status) = output.data.get("overall_status") { let status_str = match status.as_str() { @@ -220,4 +231,69 @@ impl SimpleAgent { } } } + + fn 
build_service_failure_details(&self, output: &crate::collectors::CollectorOutput) -> Option<String> { + if let Some(services) = output.data.get("services").and_then(|v| v.as_array()) { + let mut failed_services = Vec::new(); + let mut degraded_services = Vec::new(); + + for service in services { + if let (Some(name), Some(status)) = ( + service.get("name").and_then(|v| v.as_str()), + service.get("status").and_then(|v| v.as_str()) + ) { + match status { + "Stopped" => { + let memory = service.get("memory_used_mb") + .and_then(|v| v.as_f64()) + .unwrap_or(0.0); + let disk = service.get("disk_used_gb") + .and_then(|v| v.as_f64()) + .unwrap_or(0.0); + failed_services.push(format!("{} (stopped, was using {:.1}MB RAM, {:.1}GB disk)", + name, memory, disk)); + }, + "Degraded" | "Restarting" => { + let memory = service.get("memory_used_mb") + .and_then(|v| v.as_f64()) + .unwrap_or(0.0); + let disk = service.get("disk_used_gb") + .and_then(|v| v.as_f64()) + .unwrap_or(0.0); + degraded_services.push(format!("{} ({}, using {:.1}MB RAM, {:.1}GB disk)", + name, status.to_lowercase(), memory, disk)); + }, + _ => {} + } + } + } + + if !failed_services.is_empty() || !degraded_services.is_empty() { + let mut details = String::new(); + + if !failed_services.is_empty() { + details.push_str("Failed services:\n"); + for service in &failed_services { + details.push_str(&format!("- {}\n", service)); + } + } + + if !degraded_services.is_empty() { + if !details.is_empty() { + details.push('\n'); + } + details.push_str("Degraded services:\n"); + for service in &degraded_services { + details.push_str(&format!("- {}\n", service)); + } + } + + Some(details.trim_end().to_string()) + } else { + None + } + } else { + None + } + } } \ No newline at end of file diff --git a/dashboard/src/app.rs b/dashboard/src/app.rs index 0a5c5a3..358da6c 100644 --- a/dashboard/src/app.rs +++ b/dashboard/src/app.rs @@ -10,7 +10,11 @@ use gethostname::gethostname; use crate::config; use crate::data::config::{AppConfig, 
DataSourceKind, HostTarget, ZmqConfig}; use crate::data::history::MetricsHistory; -use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics}; +use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics, SystemMetrics}; + +// Host connection timeout - if no data received for this duration, mark as timeout +// Keep-alive mechanism: agents send data every 5 seconds, timeout after 15 seconds +const HOST_CONNECTION_TIMEOUT: Duration = Duration::from_secs(15); /// Shared application settings derived from the CLI arguments. #[derive(Debug, Clone)] @@ -32,11 +36,22 @@ impl AppOptions { struct HostRuntimeState { last_success: Option>, last_error: Option, + connection_status: ConnectionStatus, smart: Option, services: Option, + system: Option, backup: Option, } +#[derive(Debug, Clone, Default)] +pub enum ConnectionStatus { + #[default] + Unknown, + Connected, + Timeout, + Error, +} + /// Top-level application state container. #[derive(Debug)] pub struct App { @@ -100,6 +115,10 @@ impl App { pub fn on_tick(&mut self) { self.tick_count = self.tick_count.saturating_add(1); self.last_tick = Instant::now(); + + // Check for host connection timeouts + self.check_host_timeouts(); + let host_count = self.hosts.len(); let retention = self.history.retention(); self.status = format!( @@ -193,8 +212,10 @@ impl App { name: host.name.clone(), last_success: state.last_success.clone(), last_error: state.last_error.clone(), + connection_status: state.connection_status.clone(), smart: state.smart.clone(), services: state.services.clone(), + system: state.system.clone(), backup: state.backup.clone(), }) }) @@ -209,8 +230,10 @@ impl App { name: host.name.clone(), last_success: state.last_success.clone(), last_error: state.last_error.clone(), + connection_status: state.connection_status.clone(), smart: state.smart.clone(), services: state.services.clone(), + system: state.system.clone(), backup: state.backup.clone(), }) }) @@ -237,6 +260,7 @@ impl App { host, smart, 
services, + system, backup, timestamp, } => { @@ -245,6 +269,7 @@ impl App { let state = self.host_states.entry(host.clone()).or_default(); state.last_success = Some(timestamp); state.last_error = None; + state.connection_status = ConnectionStatus::Connected; if let Some(mut smart_metrics) = smart { if smart_metrics.timestamp != timestamp { @@ -267,6 +292,16 @@ impl App { state.services = Some(snapshot); } + if let Some(system_metrics) = system { + // Convert timestamp format (u64 to DateTime) + let system_snapshot = SystemMetrics { + summary: system_metrics.summary, + timestamp: system_metrics.timestamp, + }; + self.history.record_system(system_snapshot.clone()); + state.system = Some(system_snapshot); + } + if let Some(mut backup_metrics) = backup { if backup_metrics.timestamp != timestamp { backup_metrics.timestamp = timestamp; @@ -291,12 +326,37 @@ impl App { self.ensure_host_entry(&host); let state = self.host_states.entry(host.clone()).or_default(); state.last_error = Some(format!("{} at {}", error, timestamp.format("%H:%M:%S"))); + state.connection_status = ConnectionStatus::Error; self.status = format!("Fetch failed • host: {} • {}", host, error); } } } + fn check_host_timeouts(&mut self) { + let now = Utc::now(); + + for (host_name, state) in self.host_states.iter_mut() { + if let Some(last_success) = state.last_success { + let duration_since_last = now.signed_duration_since(last_success); + + if duration_since_last > chrono::Duration::from_std(HOST_CONNECTION_TIMEOUT).unwrap() { + // Host has timed out (missed keep-alive) + if !matches!(state.connection_status, ConnectionStatus::Timeout) { + state.connection_status = ConnectionStatus::Timeout; + state.last_error = Some(format!("Keep-alive timeout (no data for {}s)", duration_since_last.num_seconds())); + } + } else { + // Host is connected + state.connection_status = ConnectionStatus::Connected; + } + } else { + // No data ever received from this host + state.connection_status = ConnectionStatus::Unknown; 
+ } + } + } + pub fn help_visible(&self) -> bool { self.show_help } @@ -511,8 +571,10 @@ pub struct HostDisplayData { pub name: String, pub last_success: Option>, pub last_error: Option, + pub connection_status: ConnectionStatus, pub smart: Option, pub services: Option, + pub system: Option, pub backup: Option, } @@ -545,6 +607,7 @@ pub enum AppEvent { host: String, smart: Option, services: Option, + system: Option, backup: Option, timestamp: DateTime, }, diff --git a/dashboard/src/data/history.rs b/dashboard/src/data/history.rs index a90fa17..a36a852 100644 --- a/dashboard/src/data/history.rs +++ b/dashboard/src/data/history.rs @@ -5,7 +5,7 @@ use std::time::Duration; use chrono::{DateTime, Utc}; -use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics}; +use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics, SystemMetrics}; /// Ring buffer for retaining recent samples for trend analysis. #[derive(Debug)] @@ -13,6 +13,7 @@ pub struct MetricsHistory { capacity: usize, smart: VecDeque<(DateTime, SmartMetrics)>, services: VecDeque<(DateTime, ServiceMetrics)>, + system: VecDeque<(DateTime, SystemMetrics)>, backups: VecDeque<(DateTime, BackupMetrics)>, } @@ -22,6 +23,7 @@ impl MetricsHistory { capacity, smart: VecDeque::with_capacity(capacity), services: VecDeque::with_capacity(capacity), + system: VecDeque::with_capacity(capacity), backups: VecDeque::with_capacity(capacity), } } @@ -36,6 +38,11 @@ impl MetricsHistory { Self::push_with_limit(&mut self.services, entry, self.capacity); } + pub fn record_system(&mut self, metrics: SystemMetrics) { + let entry = (Utc::now(), metrics); + Self::push_with_limit(&mut self.system, entry, self.capacity); + } + pub fn record_backup(&mut self, metrics: BackupMetrics) { let entry = (Utc::now(), metrics); Self::push_with_limit(&mut self.backups, entry, self.capacity); diff --git a/dashboard/src/data/metrics.rs b/dashboard/src/data/metrics.rs index 16da7c5..ab018ae 100644 --- 
a/dashboard/src/data/metrics.rs +++ b/dashboard/src/data/metrics.rs @@ -32,6 +32,32 @@ pub struct DriveSummary { pub capacity_used_gb: f32, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SystemMetrics { + pub summary: SystemSummary, + pub timestamp: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SystemSummary { + pub cpu_load_1: f32, + pub cpu_load_5: f32, + pub cpu_load_15: f32, + #[serde(default)] + pub cpu_status: Option, + pub memory_used_mb: f32, + pub memory_total_mb: f32, + pub memory_usage_percent: f32, + #[serde(default)] + pub memory_status: Option, + #[serde(default)] + pub cpu_temp_c: Option, + #[serde(default)] + pub cpu_temp_status: Option, + #[serde(default)] + pub cpu_cstate: Option>, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ServiceMetrics { pub summary: ServiceSummary, diff --git a/dashboard/src/main.rs b/dashboard/src/main.rs index ccb8e2b..2040f80 100644 --- a/dashboard/src/main.rs +++ b/dashboard/src/main.rs @@ -12,7 +12,7 @@ use std::sync::{ }; use std::time::Duration; -use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics}; +use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics, SystemMetrics}; use anyhow::{anyhow, Context, Result}; use chrono::{TimeZone, Utc}; use clap::{ArgAction, Parser, Subcommand}; @@ -316,6 +316,7 @@ fn handle_zmq_message( host, smart: Some(metrics), services: None, + system: None, backup: None, timestamp, }); @@ -335,6 +336,7 @@ fn handle_zmq_message( host, smart: None, services: Some(metrics), + system: None, backup: None, timestamp, }); @@ -348,12 +350,33 @@ fn handle_zmq_message( }); } }, + AgentType::System => match serde_json::from_value::(payload.clone()) { + Ok(metrics) => { + let _ = sender.send(AppEvent::MetricsUpdated { + host, + smart: None, + services: None, + system: Some(metrics), + backup: None, + timestamp, + }); + } + Err(error) => { + warn!(%error, "failed to parse system metrics"); + let _ = 
sender.send(AppEvent::MetricsFailed { + host, + error: format!("system metrics parse error: {error:#}"), + timestamp, + }); + } + }, AgentType::Backup => match serde_json::from_value::(payload.clone()) { Ok(metrics) => { let _ = sender.send(AppEvent::MetricsUpdated { host, smart: None, services: None, + system: None, backup: Some(metrics), timestamp, }); diff --git a/dashboard/src/ui/alerts.rs b/dashboard/src/ui/alerts.rs index c80ec51..b582a1d 100644 --- a/dashboard/src/ui/alerts.rs +++ b/dashboard/src/ui/alerts.rs @@ -2,8 +2,8 @@ use chrono::{DateTime, Utc}; use ratatui::layout::Rect; use ratatui::Frame; -use crate::app::HostDisplayData; -use crate::ui::system::{evaluate_performance, PerfSeverity}; +use crate::app::{HostDisplayData, ConnectionStatus}; +// Removed: evaluate_performance and PerfSeverity no longer needed use crate::ui::widget::{render_widget_data, WidgetData, WidgetStatus, StatusLevel}; pub fn render(frame: &mut Frame, hosts: &[HostDisplayData], area: Rect) { @@ -99,6 +99,14 @@ fn classify_hosts(hosts: &[HostDisplayData]) -> (AlertSeverity, usize, usize, us } fn host_severity(host: &HostDisplayData) -> AlertSeverity { + // Check connection status first + match host.connection_status { + ConnectionStatus::Error => return AlertSeverity::Critical, + ConnectionStatus::Timeout => return AlertSeverity::Warning, + ConnectionStatus::Unknown => return AlertSeverity::Unknown, + ConnectionStatus::Connected => {}, // Continue with other checks + } + if host.last_error.is_some() { return AlertSeverity::Critical; } @@ -120,12 +128,13 @@ fn host_severity(host: &HostDisplayData) -> AlertSeverity { return AlertSeverity::Warning; } - let (perf_severity, _) = evaluate_performance(&services.summary); - match perf_severity { - PerfSeverity::Critical => return AlertSeverity::Critical, - PerfSeverity::Warning => return AlertSeverity::Warning, - PerfSeverity::Ok => {} - } + // TODO: Update to use agent-provided system statuses instead of evaluate_performance + // let 
(perf_severity, _) = evaluate_performance(&services.summary); + // match perf_severity { + // PerfSeverity::Critical => return AlertSeverity::Critical, + // PerfSeverity::Warning => return AlertSeverity::Warning, + // PerfSeverity::Ok => {} + // } } if let Some(backup) = host.backup.as_ref() { @@ -144,6 +153,30 @@ fn host_severity(host: &HostDisplayData) -> AlertSeverity { } fn host_status(host: &HostDisplayData) -> (String, AlertSeverity, bool) { + // Check connection status first + match host.connection_status { + ConnectionStatus::Error => { + let msg = if let Some(error) = &host.last_error { + format!("Connection error: {}", error) + } else { + "Connection error".to_string() + }; + return (msg, AlertSeverity::Critical, true); + }, + ConnectionStatus::Timeout => { + let msg = if let Some(error) = &host.last_error { + format!("Keep-alive timeout: {}", error) + } else { + "Keep-alive timeout".to_string() + }; + return (msg, AlertSeverity::Warning, true); + }, + ConnectionStatus::Unknown => { + return ("No data received".to_string(), AlertSeverity::Unknown, true); + }, + ConnectionStatus::Connected => {}, // Continue with other checks + } + if let Some(error) = &host.last_error { return (format!("error: {}", error), AlertSeverity::Critical, true); } @@ -177,26 +210,27 @@ fn host_status(host: &HostDisplayData) -> (String, AlertSeverity, bool) { ); } - let (perf_severity, reason) = evaluate_performance(&services.summary); - if let Some(reason_text) = reason { - match perf_severity { - PerfSeverity::Critical => { - return ( - format!("critical: {}", reason_text), - AlertSeverity::Critical, - true, - ); - } - PerfSeverity::Warning => { - return ( - format!("warning: {}", reason_text), - AlertSeverity::Warning, - true, - ); - } - PerfSeverity::Ok => {} - } - } + // TODO: Update to use agent-provided system statuses instead of evaluate_performance + // let (perf_severity, reason) = evaluate_performance(&services.summary); + // if let Some(reason_text) = reason { + // 
match perf_severity { + // PerfSeverity::Critical => { + // return ( + // format!("critical: {}", reason_text), + // AlertSeverity::Critical, + // true, + // ); + // } + // PerfSeverity::Warning => { + // return ( + // format!("warning: {}", reason_text), + // AlertSeverity::Warning, + // true, + // ); + // } + // PerfSeverity::Ok => {} + // } + // } } if let Some(backup) = host.backup.as_ref() { diff --git a/dashboard/src/ui/backup.rs b/dashboard/src/ui/backup.rs index 23c8702..37035e7 100644 --- a/dashboard/src/ui/backup.rs +++ b/dashboard/src/ui/backup.rs @@ -3,20 +3,32 @@ use ratatui::Frame; use crate::app::HostDisplayData; use crate::data::metrics::BackupMetrics; -use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, WidgetData, WidgetStatus, StatusLevel}; +use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, connection_status_message, WidgetData, WidgetStatus, StatusLevel}; +use crate::app::ConnectionStatus; pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) { match host { Some(data) => { - if let Some(metrics) = data.backup.as_ref() { - render_metrics(frame, data, metrics, area); - } else { - render_placeholder( - frame, - area, - "Backups", - &format!("Host {} awaiting backup metrics", data.name), - ); + match (&data.connection_status, data.backup.as_ref()) { + (ConnectionStatus::Connected, Some(metrics)) => { + render_metrics(frame, data, metrics, area); + } + (ConnectionStatus::Connected, None) => { + render_placeholder( + frame, + area, + "Backups", + &format!("Host {} awaiting backup metrics", data.name), + ); + } + (status, _) => { + render_placeholder( + frame, + area, + "Backups", + &format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)), + ); + } } } None => render_placeholder(frame, area, "Backups", "No hosts configured"), diff --git a/dashboard/src/ui/services.rs b/dashboard/src/ui/services.rs index 
8acc5ce..dbb4861 100644 --- a/dashboard/src/ui/services.rs +++ b/dashboard/src/ui/services.rs @@ -3,20 +3,32 @@ use ratatui::Frame; use crate::app::HostDisplayData; use crate::data::metrics::ServiceStatus; -use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, WidgetData, WidgetStatus, StatusLevel}; +use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, connection_status_message, WidgetData, WidgetStatus, StatusLevel}; +use crate::app::ConnectionStatus; pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) { match host { Some(data) => { - if let Some(metrics) = data.services.as_ref() { - render_metrics(frame, data, metrics, area); - } else { - render_placeholder( - frame, - area, - "Services", - &format!("Host {} has no service metrics yet", data.name), - ); + match (&data.connection_status, data.services.as_ref()) { + (ConnectionStatus::Connected, Some(metrics)) => { + render_metrics(frame, data, metrics, area); + } + (ConnectionStatus::Connected, None) => { + render_placeholder( + frame, + area, + "Services", + &format!("Host {} has no service metrics yet", data.name), + ); + } + (status, _) => { + render_placeholder( + frame, + area, + "Services", + &format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)), + ); + } } } None => render_placeholder(frame, area, "Services", "No hosts configured"), diff --git a/dashboard/src/ui/storage.rs b/dashboard/src/ui/storage.rs index 7c9ed14..b34e402 100644 --- a/dashboard/src/ui/storage.rs +++ b/dashboard/src/ui/storage.rs @@ -3,20 +3,32 @@ use ratatui::Frame; use crate::app::HostDisplayData; use crate::data::metrics::SmartMetrics; -use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, WidgetData, WidgetStatus, StatusLevel}; +use crate::ui::widget::{render_placeholder, render_widget_data, status_level_from_agent_status, connection_status_message, 
WidgetData, WidgetStatus, StatusLevel}; +use crate::app::ConnectionStatus; pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) { match host { Some(data) => { - if let Some(metrics) = data.smart.as_ref() { - render_metrics(frame, data, metrics, area); - } else { - render_placeholder( - frame, - area, - "Storage", - &format!("Host {} has no SMART data yet", data.name), - ); + match (&data.connection_status, data.smart.as_ref()) { + (ConnectionStatus::Connected, Some(metrics)) => { + render_metrics(frame, data, metrics, area); + } + (ConnectionStatus::Connected, None) => { + render_placeholder( + frame, + area, + "Storage", + &format!("Host {} has no SMART data yet", data.name), + ); + } + (status, _) => { + render_placeholder( + frame, + area, + "Storage", + &format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)), + ); + } } } None => render_placeholder(frame, area, "Storage", "No hosts configured"), diff --git a/dashboard/src/ui/system.rs b/dashboard/src/ui/system.rs index 1b43bf4..1de4891 100644 --- a/dashboard/src/ui/system.rs +++ b/dashboard/src/ui/system.rs @@ -2,24 +2,36 @@ use ratatui::layout::Rect; use ratatui::Frame; use crate::app::HostDisplayData; -use crate::data::metrics::{ServiceMetrics, ServiceSummary}; +use crate::data::metrics::SystemMetrics; use crate::ui::widget::{ render_placeholder, render_combined_widget_data, - status_level_from_agent_status, WidgetDataSet, WidgetStatus, StatusLevel, + status_level_from_agent_status, connection_status_message, WidgetDataSet, WidgetStatus, StatusLevel, }; +use crate::app::ConnectionStatus; pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) { match host { Some(data) => { - if let Some(metrics) = data.services.as_ref() { - render_metrics(frame, data, metrics, area); - } else { - render_placeholder( - frame, - area, - "System", - &format!("Host {} awaiting service metrics", data.name), - ); + match (&data.connection_status, 
data.system.as_ref()) { + (ConnectionStatus::Connected, Some(metrics)) => { + render_metrics(frame, data, metrics, area); + } + (ConnectionStatus::Connected, None) => { + render_placeholder( + frame, + area, + "System", + &format!("Host {} awaiting system metrics", data.name), + ); + } + (status, _) => { + render_placeholder( + frame, + area, + "System", + &format!("Host {}: {}", data.name, connection_status_message(status, &data.last_error)), + ); + } } } None => render_placeholder(frame, area, "System", "No hosts configured"), @@ -29,30 +41,12 @@ pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) { fn render_metrics( frame: &mut Frame, _host: &HostDisplayData, - metrics: &ServiceMetrics, + metrics: &SystemMetrics, area: Rect, ) { let summary = &metrics.summary; - let system_total = if summary.system_memory_total_mb > 0.0 { - summary.system_memory_total_mb - } else { - summary.memory_quota_mb - }; - let system_used = if summary.system_memory_used_mb > 0.0 { - summary.system_memory_used_mb - } else { - summary.memory_used_mb - }; - let _usage_ratio = if system_total > 0.0 { - (system_used / system_total) * 100.0 - } else { - 0.0 - }; - - let (perf_severity, _reason) = evaluate_performance(summary); - // Dashboard should NOT calculate border colors - agent is the source of truth - - // Use agent-calculated statuses instead of dashboard calculations + + // Use agent-calculated statuses let memory_status = status_level_from_agent_status(summary.memory_status.as_ref()); let cpu_status = status_level_from_agent_status(summary.cpu_status.as_ref()); // Dashboard should NOT calculate colors - agent is the source of truth @@ -62,7 +56,7 @@ fn render_metrics( memory_dataset.add_row( Some(WidgetStatus::new(memory_status)), vec![], - vec![format!("{:.1} / {:.1} GB", system_used / 1000.0, system_total / 1000.0)], + vec![format!("{:.1} / {:.1} GB", summary.memory_used_mb / 1000.0, summary.memory_total_mb / 1000.0)], ); // CPU dataset - use 
agent-calculated status @@ -140,30 +134,24 @@ fn render_metrics( ); } - // GPU dataset - // GPU status should come from agent when available - let gpu_status = StatusLevel::Unknown; // Default until agent provides gpu_status + // GPU dataset - GPU data remains in ServiceMetrics, not SystemMetrics + let gpu_status = StatusLevel::Unknown; // GPU not available in SystemMetrics let mut gpu_dataset = WidgetDataSet::new(vec!["GPU load".to_string(), "GPU temp".to_string()], Some(WidgetStatus::new(gpu_status))); gpu_dataset.add_row( Some(WidgetStatus::new(gpu_status)), vec![], vec![ - summary - .gpu_load_percent - .map(|value| format_optional_percent(Some(value))) - .unwrap_or_else(|| "—".to_string()), - summary - .gpu_temp_c - .map(|value| format_optional_metric(Some(value), "°C")) - .unwrap_or_else(|| "—".to_string()), + "—".to_string(), // GPU data not in SystemMetrics + "—".to_string(), // GPU data not in SystemMetrics ], ); - // Determine overall widget status based on worst case - let overall_status_level = match perf_severity { - PerfSeverity::Critical => StatusLevel::Error, - PerfSeverity::Warning => StatusLevel::Warning, - PerfSeverity::Ok => StatusLevel::Ok, + // Determine overall widget status based on worst case from agent statuses + let overall_status_level = match (memory_status, cpu_status) { + (StatusLevel::Error, _) | (_, StatusLevel::Error) => StatusLevel::Error, + (StatusLevel::Warning, _) | (_, StatusLevel::Warning) => StatusLevel::Warning, + (StatusLevel::Ok, StatusLevel::Ok) => StatusLevel::Ok, + _ => StatusLevel::Unknown, }; let overall_status = Some(WidgetStatus::new(overall_status_level)); @@ -171,13 +159,6 @@ fn render_metrics( render_combined_widget_data(frame, area, "System".to_string(), overall_status, vec![memory_dataset, cpu_dataset, cstate_dataset, gpu_dataset]); } -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub(crate) enum PerfSeverity { - Ok, - Warning, - Critical, -} - fn format_optional_metric(value: Option, unit: &str) -> 
String { match value { Some(number) => format!("{:.1}{}", number, unit), @@ -191,62 +172,3 @@ fn format_optional_percent(value: Option) -> String { None => "—".to_string(), } } - - -pub(crate) fn evaluate_performance(summary: &ServiceSummary) -> (PerfSeverity, Option) { - let mem_percent = if summary.system_memory_total_mb > 0.0 { - (summary.system_memory_used_mb / summary.system_memory_total_mb) * 100.0 - } else if summary.memory_quota_mb > 0.0 { - (summary.memory_used_mb / summary.memory_quota_mb) * 100.0 - } else { - 0.0 - }; - - let mut severity = PerfSeverity::Ok; - let mut reason: Option = None; - - let mut consider = |level: PerfSeverity, message: String| { - if level > severity { - severity = level; - reason = Some(message); - } - }; - - // Use agent's memory status instead of hardcoded thresholds - if let Some(memory_status) = &summary.memory_status { - match memory_status.as_str() { - "critical" => consider(PerfSeverity::Critical, format!("RAM {:.0}%", mem_percent)), - "warning" => consider(PerfSeverity::Warning, format!("RAM {:.0}%", mem_percent)), - _ => {} // "ok" - no alert needed - } - } - - // Use agent's CPU status instead of hardcoded thresholds - if let Some(cpu_status) = &summary.cpu_status { - match cpu_status.as_str() { - "critical" => consider(PerfSeverity::Critical, format!("CPU load {:.2}", summary.cpu_load_5)), - "warning" => consider(PerfSeverity::Warning, format!("CPU load {:.2}", summary.cpu_load_5)), - _ => {} // "ok" - no alert needed - } - } - - // Use agent's CPU temperature status instead of hardcoded thresholds - if let Some(cpu_temp_status) = &summary.cpu_temp_status { - if let Some(temp) = summary.cpu_temp_c { - match cpu_temp_status.as_str() { - "critical" => consider(PerfSeverity::Critical, format!("CPU temp {:.0}°C", temp)), - "warning" => consider(PerfSeverity::Warning, format!("CPU temp {:.0}°C", temp)), - _ => {} // "ok" - no alert needed - } - } - } - - // TODO: GPU status should come from agent, not calculated here with 
hardcoded thresholds - // For now, remove hardcoded GPU thresholds until agent provides gpu_status - - if severity == PerfSeverity::Ok { - (PerfSeverity::Ok, None) - } else { - (severity, reason) - } -} diff --git a/dashboard/src/ui/widget.rs b/dashboard/src/ui/widget.rs index eefed32..ebb1e5c 100644 --- a/dashboard/src/ui/widget.rs +++ b/dashboard/src/ui/widget.rs @@ -37,6 +37,28 @@ pub fn status_level_from_agent_status(agent_status: Option<&String>) -> StatusLe } } +pub fn connection_status_message(connection_status: &crate::app::ConnectionStatus, last_error: &Option) -> String { + use crate::app::ConnectionStatus; + match connection_status { + ConnectionStatus::Connected => "Connected".to_string(), + ConnectionStatus::Timeout => { + if let Some(error) = last_error { + format!("Timeout: {}", error) + } else { + "Keep-alive timeout".to_string() + } + }, + ConnectionStatus::Error => { + if let Some(error) = last_error { + format!("Error: {}", error) + } else { + "Connection error".to_string() + } + }, + ConnectionStatus::Unknown => "No data received".to_string(), + } +} + pub fn render_placeholder(frame: &mut Frame, area: Rect, title: &str, message: &str) { diff --git a/shared/src/envelope.rs b/shared/src/envelope.rs index 1847e52..2b920b7 100644 --- a/shared/src/envelope.rs +++ b/shared/src/envelope.rs @@ -6,6 +6,7 @@ use serde_json::Value; pub enum AgentType { Smart, Service, + System, Backup, }