diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 3cb1c8e..262d1ea 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.140" +version = "0.1.141" edition = "2021" [dependencies] diff --git a/agent/src/agent.rs b/agent/src/agent.rs index 24b236c..a968e94 100644 --- a/agent/src/agent.rs +++ b/agent/src/agent.rs @@ -26,6 +26,16 @@ pub struct Agent { collectors: Vec>, notification_manager: NotificationManager, service_tracker: UserStoppedServiceTracker, + previous_status: Option<SystemStatus>, +} + +/// Track system component status for change detection +#[derive(Debug, Clone)] +struct SystemStatus { + cpu_load_status: cm_dashboard_shared::Status, + cpu_temperature_status: cm_dashboard_shared::Status, + memory_usage_status: cm_dashboard_shared::Status, + // Add more as needed } impl Agent { @@ -91,6 +101,7 @@ impl Agent { collectors, notification_manager, service_tracker, + previous_status: None, }) } @@ -157,6 +168,11 @@ impl Agent { } } + // Check for status changes and send notifications + if let Err(e) = self.check_status_changes_and_notify(&agent_data).await { + error!("Failed to check status changes: {}", e); + } + // Broadcast the structured data via ZMQ if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { error!("Failed to broadcast agent data: {}", e); @@ -167,6 +183,84 @@ impl Agent { Ok(()) } + /// Check for status changes and send notifications + async fn check_status_changes_and_notify(&mut self, agent_data: &AgentData) -> Result<()> { + // Extract current status + let current_status = SystemStatus { + cpu_load_status: agent_data.system.cpu.load_status.clone(), + cpu_temperature_status: agent_data.system.cpu.temperature_status.clone(), + memory_usage_status: agent_data.system.memory.usage_status.clone(), + }; + + // Check for status changes + if let Some(previous) = self.previous_status.clone() { + self.check_and_notify_status_change( + "CPU Load", + &previous.cpu_load_status, 
+ &current_status.cpu_load_status, + format!("CPU load: {:.1}", agent_data.system.cpu.load_1min) + ).await?; + + self.check_and_notify_status_change( + "CPU Temperature", + &previous.cpu_temperature_status, + &current_status.cpu_temperature_status, + format!("CPU temperature: {}°C", + agent_data.system.cpu.temperature_celsius.unwrap_or(0.0) as i32) + ).await?; + + self.check_and_notify_status_change( + "Memory Usage", + &previous.memory_usage_status, + &current_status.memory_usage_status, + format!("Memory usage: {:.1}%", agent_data.system.memory.usage_percent) + ).await?; + } + + // Store current status for next comparison + self.previous_status = Some(current_status); + Ok(()) + } + + /// Check individual status change and send notification if degraded + async fn check_and_notify_status_change( + &mut self, + component: &str, + previous: &cm_dashboard_shared::Status, + current: &cm_dashboard_shared::Status, + details: String + ) -> Result<()> { + use cm_dashboard_shared::Status; + + // Only notify on status degradation (OK → Warning/Critical, Warning → Critical) + let should_notify = match (previous, current) { + (Status::Ok, Status::Warning) => true, + (Status::Ok, Status::Critical) => true, + (Status::Warning, Status::Critical) => true, + _ => false, + }; + + if should_notify { + let subject = format!("{} {} Alert", self.hostname, component); + let body = format!( + "Alert: {} status changed from {:?} to {:?}\n\nDetails: {}\n\nTime: {}", + component, + previous, + current, + details, + chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC") + ); + + info!("Sending notification: {} - {:?} → {:?}", component, previous, current); + + if let Err(e) = self.notification_manager.send_direct_email(&subject, &body).await { + error!("Failed to send notification for {}: {}", component, e); + } + } + + Ok(()) + } + /// Handle incoming commands from dashboard async fn handle_commands(&mut self) -> Result<()> { // Try to receive a command (non-blocking) diff --git a/agent/src/collectors/cpu.rs 
b/agent/src/collectors/cpu.rs index 06340d9..6c870de 100644 --- a/agent/src/collectors/cpu.rs +++ b/agent/src/collectors/cpu.rs @@ -179,6 +179,14 @@ impl Collector for CpuCollector { ); } + // Calculate status using thresholds + agent_data.system.cpu.load_status = self.calculate_load_status(agent_data.system.cpu.load_1min); + agent_data.system.cpu.temperature_status = if let Some(temp) = agent_data.system.cpu.temperature_celsius { + self.calculate_temperature_status(temp) + } else { + Status::Unknown + }; + Ok(()) } } diff --git a/agent/src/collectors/disk.rs b/agent/src/collectors/disk.rs index 7d8af82..267b7fb 100644 --- a/agent/src/collectors/disk.rs +++ b/agent/src/collectors/disk.rs @@ -1,6 +1,6 @@ use anyhow::Result; use async_trait::async_trait; -use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, HysteresisThresholds}; +use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, HysteresisThresholds, Status}; use crate::config::DiskConfig; use std::process::Command; @@ -418,6 +418,7 @@ impl DiskCollector { usage_percent: fs.usage_percent, used_gb: fs.used_bytes as f32 / (1024.0 * 1024.0 * 1024.0), total_gb: fs.total_bytes as f32 / (1024.0 * 1024.0 * 1024.0), + usage_status: self.calculate_filesystem_usage_status(fs.usage_percent), } }).collect(); @@ -430,6 +431,12 @@ impl DiskCollector { temperature_celsius: smart.and_then(|s| s.temperature_celsius), wear_percent: smart.and_then(|s| s.wear_percent), filesystems, + temperature_status: smart.and_then(|s| s.temperature_celsius) + .map(|temp| self.calculate_temperature_status(temp)) + .unwrap_or(Status::Unknown), + health_status: self.calculate_health_status( + smart.map(|s| s.health.as_str()).unwrap_or("UNKNOWN") + ), }); } @@ -466,6 +473,32 @@ impl DiskCollector { Ok(()) } + + /// Calculate filesystem usage status + fn calculate_filesystem_usage_status(&self, usage_percent: f32) -> Status { + // Use standard filesystem warning/critical thresholds + if usage_percent >= 95.0 
{ + Status::Critical + } else if usage_percent >= 85.0 { + Status::Warning + } else { + Status::Ok + } + } + + /// Calculate drive temperature status + fn calculate_temperature_status(&self, temperature: f32) -> Status { + self.temperature_thresholds.evaluate(temperature) + } + + /// Calculate drive health status + fn calculate_health_status(&self, health: &str) -> Status { + match health { + "PASSED" => Status::Ok, + "FAILED" => Status::Critical, + _ => Status::Unknown, + } + } } #[async_trait] diff --git a/agent/src/collectors/memory.rs b/agent/src/collectors/memory.rs index 36ea0c5..61d0d87 100644 --- a/agent/src/collectors/memory.rs +++ b/agent/src/collectors/memory.rs @@ -1,5 +1,5 @@ use async_trait::async_trait; -use cm_dashboard_shared::{AgentData, TmpfsData, HysteresisThresholds}; +use cm_dashboard_shared::{AgentData, TmpfsData, HysteresisThresholds, Status}; use tracing::debug; @@ -187,6 +187,11 @@ impl MemoryCollector { "/tmp" | "/var/tmp" | "/dev/shm" | "/run" | "/var/log" ) || mount_point.starts_with("/run/user/") // User session tmpfs } + + /// Calculate memory usage status based on thresholds + fn calculate_memory_status(&self, usage_percent: f32) -> Status { + self.usage_thresholds.evaluate(usage_percent) + } } #[async_trait] @@ -215,6 +220,9 @@ impl Collector for MemoryCollector { ); } + // Calculate status using thresholds + agent_data.system.memory.usage_status = self.calculate_memory_status(agent_data.system.memory.usage_percent); + Ok(()) } } diff --git a/agent/src/collectors/nixos.rs b/agent/src/collectors/nixos.rs index 6286b9d..aef3729 100644 --- a/agent/src/collectors/nixos.rs +++ b/agent/src/collectors/nixos.rs @@ -32,6 +32,9 @@ impl NixOSCollector { // Set agent version from environment or Nix store path agent_data.agent_version = self.get_agent_version().await; + // Set NixOS build/generation information + agent_data.build_version = self.get_nixos_generation().await; + // Set current timestamp agent_data.timestamp = 
chrono::Utc::now().timestamp() as u64; diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index 2b5cb41..52e552c 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.140" +version = "0.1.141" edition = "2021" [dependencies] diff --git a/dashboard/src/ui/widgets/system.rs b/dashboard/src/ui/widgets/system.rs index 942f480..abf74cb 100644 --- a/dashboard/src/ui/widgets/system.rs +++ b/dashboard/src/ui/widgets/system.rs @@ -138,6 +138,9 @@ impl Widget for SystemWidget { // Extract agent version self.agent_hash = Some(agent_data.agent_version.clone()); + + // Extract build version + self.nixos_build = agent_data.build_version.clone(); // Extract CPU data directly let cpu = &agent_data.system.cpu; diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 289fe75..1a279a5 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.140" +version = "0.1.141" edition = "2021" [dependencies] diff --git a/shared/src/agent_data.rs b/shared/src/agent_data.rs index 2707900..d5f5236 100644 --- a/shared/src/agent_data.rs +++ b/shared/src/agent_data.rs @@ -1,10 +1,12 @@ use serde::{Deserialize, Serialize}; +use crate::Status; /// Complete structured data from an agent #[derive(Debug, Clone, Serialize, Deserialize)] pub struct AgentData { pub hostname: String, pub agent_version: String, + pub build_version: Option, pub timestamp: u64, pub system: SystemData, pub services: Vec, @@ -27,6 +29,8 @@ pub struct CpuData { pub load_15min: f32, pub frequency_mhz: f32, pub temperature_celsius: Option, + pub load_status: Status, + pub temperature_status: Status, } /// Memory monitoring data @@ -39,6 +43,7 @@ pub struct MemoryData { pub swap_total_gb: f32, pub swap_used_gb: f32, pub tmpfs: Vec, + pub usage_status: Status, } /// Tmpfs filesystem data @@ -65,6 +70,8 @@ pub struct DriveData { pub temperature_celsius: Option, pub wear_percent: Option, pub 
filesystems: Vec<FilesystemData>, + pub temperature_status: Status, + pub health_status: Status, } /// Filesystem on a drive @@ -74,6 +81,7 @@ pub struct FilesystemData { pub usage_percent: f32, pub used_gb: f32, pub total_gb: f32, + pub usage_status: Status, } /// Storage pool (MergerFS, RAID, etc.) @@ -125,6 +133,7 @@ impl AgentData { Self { hostname, agent_version, + build_version: None, timestamp: chrono::Utc::now().timestamp() as u64, system: SystemData { cpu: CpuData { @@ -133,6 +142,8 @@ impl AgentData { load_15min: 0.0, frequency_mhz: 0.0, temperature_celsius: None, + load_status: Status::Unknown, + temperature_status: Status::Unknown, }, memory: MemoryData { usage_percent: 0.0, @@ -142,6 +153,7 @@ impl AgentData { swap_total_gb: 0.0, swap_used_gb: 0.0, tmpfs: Vec::new(), + usage_status: Status::Unknown, }, storage: StorageData { drives: Vec::new(), diff --git a/shared/src/metrics.rs b/shared/src/metrics.rs index 1df9bce..913344f 100644 --- a/shared/src/metrics.rs +++ b/shared/src/metrics.rs @@ -131,6 +131,17 @@ impl HysteresisThresholds { } } + /// Evaluate value against thresholds to determine status + pub fn evaluate(&self, value: f32) -> Status { + if value >= self.critical_high { + Status::Critical + } else if value >= self.warning_high { + Status::Warning + } else { + Status::Ok + } + } + pub fn with_custom_gaps(warning_high: f32, warning_gap: f32, critical_high: f32, critical_gap: f32) -> Self { Self { warning_high,