Implement hysteresis for metric status changes to prevent flapping
Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
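For context, a minimal self-contained sketch of the hysteresis idea introduced here: a value must cross a threshold to raise the status, but must drop below that threshold minus a recovery gap before the status falls back, so readings oscillating right at the boundary no longer flap. The Hysteresis struct, the status() function, and the 80/95/10 numbers below are illustrative only; the actual implementation uses HysteresisThresholds and StatusTracker from cm_dashboard_shared, as shown in the diff.

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Status {
    Ok,
    Warning,
    Critical,
}

/// Illustrative thresholds: escalate at warning/critical,
/// recover only below the threshold minus a gap.
struct Hysteresis {
    warning: f32,
    critical: f32,
    gap: f32,
}

impl Hysteresis {
    /// New status from the current value and the previous status.
    fn status(&self, value: f32, previous: Status) -> Status {
        // Escalation uses the plain thresholds.
        let raised = if value >= self.critical {
            Status::Critical
        } else if value >= self.warning {
            Status::Warning
        } else {
            Status::Ok
        };

        // Recovery requires dropping below threshold - gap; otherwise
        // the previous (higher) status is kept.
        match (previous, raised) {
            (Status::Critical, s) if s != Status::Critical && value >= self.critical - self.gap => {
                Status::Critical
            }
            (Status::Warning, Status::Ok) if value >= self.warning - self.gap => Status::Warning,
            _ => raised,
        }
    }
}

fn main() {
    // Example CPU-load-style thresholds: warn at 80, critical at 95, 10-point recovery gap.
    let h = Hysteresis { warning: 80.0, critical: 95.0, gap: 10.0 };
    let mut status = Status::Ok;

    // A value bouncing around the warning threshold no longer flaps:
    for value in [79.0_f32, 81.0, 79.5, 78.0, 69.0] {
        status = h.status(value, status);
        println!("{value} -> {status:?}");
    }
    // 79 -> Ok, 81 -> Warning, 79.5 -> Warning, 78 -> Warning, 69 -> Ok
}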
@@ -1,11 +1,11 @@
|
||||
use anyhow::Result;
|
||||
use gethostname::gethostname;
|
||||
use std::time::Duration;
|
||||
use tokio::time::interval;
|
||||
use tracing::{info, error, debug};
|
||||
use gethostname::gethostname;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
use crate::communication::{AgentCommand, ZmqHandler};
|
||||
use crate::config::AgentConfig;
|
||||
use crate::communication::{ZmqHandler, AgentCommand};
|
||||
use crate::metrics::MetricCollectionManager;
|
||||
use crate::notifications::NotificationManager;
|
||||
use cm_dashboard_shared::{Metric, MetricMessage};
|
||||
@@ -22,28 +22,31 @@ impl Agent {
|
||||
pub async fn new(config_path: Option<String>) -> Result<Self> {
|
||||
let hostname = gethostname().to_string_lossy().to_string();
|
||||
info!("Initializing agent for host: {}", hostname);
|
||||
|
||||
|
||||
// Load configuration
|
||||
let config = if let Some(path) = config_path {
|
||||
AgentConfig::load_from_file(&path)?
|
||||
} else {
|
||||
AgentConfig::default()
|
||||
};
|
||||
|
||||
|
||||
info!("Agent configuration loaded");
|
||||
|
||||
|
||||
// Initialize ZMQ communication
|
||||
let zmq_handler = ZmqHandler::new(&config.zmq).await?;
|
||||
info!("ZMQ communication initialized on port {}", config.zmq.publisher_port);
|
||||
|
||||
info!(
|
||||
"ZMQ communication initialized on port {}",
|
||||
config.zmq.publisher_port
|
||||
);
|
||||
|
||||
// Initialize metric collection manager with cache config
|
||||
let metric_manager = MetricCollectionManager::new(&config.collectors, &config).await?;
|
||||
info!("Metric collection manager initialized");
|
||||
|
||||
|
||||
// Initialize notification manager
|
||||
let notification_manager = NotificationManager::new(&config.notifications, &hostname)?;
|
||||
info!("Notification manager initialized");
|
||||
|
||||
|
||||
Ok(Self {
|
||||
hostname,
|
||||
config,
|
||||
@@ -52,10 +55,10 @@ impl Agent {
|
||||
notification_manager,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> {
|
||||
info!("Starting agent main loop with separated collection and transmission");
|
||||
|
||||
|
||||
// CRITICAL: Collect ALL data immediately at startup before entering the loop
|
||||
info!("Performing initial FORCE collection of all metrics at startup");
|
||||
if let Err(e) = self.collect_all_metrics_force().await {
|
||||
@@ -63,12 +66,13 @@ impl Agent {
|
||||
} else {
|
||||
info!("Initial metric collection completed - all data cached and ready");
|
||||
}
|
||||
|
||||
|
||||
// Separate intervals for collection and transmission
|
||||
let mut collection_interval = interval(Duration::from_secs(self.config.collection_interval_seconds));
|
||||
let mut collection_interval =
|
||||
interval(Duration::from_secs(self.config.collection_interval_seconds));
|
||||
let mut transmission_interval = interval(Duration::from_secs(1)); // ZMQ broadcast every 1 second
|
||||
let mut notification_check_interval = interval(Duration::from_secs(30)); // Check notifications every 30s
|
||||
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = collection_interval.tick() => {
|
||||
@@ -99,84 +103,93 @@ impl Agent {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
info!("Agent main loop stopped");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
async fn collect_all_metrics_force(&mut self) -> Result<()> {
|
||||
info!("Starting FORCE metric collection for startup");
|
||||
|
||||
|
||||
// Force collect all metrics from all collectors immediately
|
||||
let metrics = self.metric_manager.collect_all_metrics_force().await?;
|
||||
|
||||
|
||||
if metrics.is_empty() {
|
||||
error!("No metrics collected during force collection!");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
||||
info!("Force collected and cached {} metrics", metrics.len());
|
||||
|
||||
|
||||
// Check for status changes and send notifications
|
||||
self.check_status_changes(&metrics).await;
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
async fn collect_metrics_only(&mut self) -> Result<()> {
|
||||
debug!("Starting metric collection cycle (cache only)");
|
||||
|
||||
|
||||
// Collect all metrics from all collectors and cache them
|
||||
let metrics = self.metric_manager.collect_all_metrics().await?;
|
||||
|
||||
|
||||
if metrics.is_empty() {
|
||||
debug!("No metrics collected this cycle");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
||||
debug!("Collected and cached {} metrics", metrics.len());
|
||||
|
||||
|
||||
// Check for status changes and send notifications
|
||||
self.check_status_changes(&metrics).await;
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
async fn broadcast_all_cached_metrics(&mut self) -> Result<()> {
|
||||
debug!("Broadcasting all cached metrics via ZMQ");
|
||||
|
||||
|
||||
// Get all cached metrics from the metric manager
|
||||
let cached_metrics = self.metric_manager.get_all_cached_metrics().await?;
|
||||
|
||||
|
||||
if cached_metrics.is_empty() {
|
||||
debug!("No cached metrics to broadcast");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
||||
debug!("Broadcasting {} cached metrics", cached_metrics.len());
|
||||
|
||||
|
||||
// Create and send message with all cached data
|
||||
let message = MetricMessage::new(self.hostname.clone(), cached_metrics);
|
||||
self.zmq_handler.publish_metrics(&message).await?;
|
||||
|
||||
|
||||
debug!("Cached metrics broadcasted successfully");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
async fn check_status_changes(&mut self, metrics: &[Metric]) {
|
||||
for metric in metrics {
|
||||
if let Some(status_change) = self.notification_manager.update_metric_status(&metric.name, metric.status) {
|
||||
info!("Status change detected for {}: {:?} -> {:?}",
|
||||
metric.name, status_change.old_status, status_change.new_status);
|
||||
|
||||
if let Some(status_change) = self
|
||||
.notification_manager
|
||||
.update_metric_status(&metric.name, metric.status)
|
||||
{
|
||||
info!(
|
||||
"Status change detected for {}: {:?} -> {:?}",
|
||||
metric.name, status_change.old_status, status_change.new_status
|
||||
);
|
||||
|
||||
// Send notification for status change
|
||||
if let Err(e) = self.notification_manager.send_status_change_notification(status_change, metric).await {
|
||||
if let Err(e) = self
|
||||
.notification_manager
|
||||
.send_status_change_notification(status_change, metric)
|
||||
.await
|
||||
{
|
||||
error!("Failed to send notification: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
async fn handle_commands(&mut self) -> Result<()> {
|
||||
// Try to receive commands (non-blocking)
|
||||
match self.zmq_handler.try_receive_command() {
|
||||
@@ -193,7 +206,7 @@ impl Agent {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
async fn process_command(&mut self, command: AgentCommand) -> Result<()> {
|
||||
match command {
|
||||
AgentCommand::CollectNow => {
|
||||
@@ -209,7 +222,10 @@ impl Agent {
|
||||
info!("Interval change requested but not implemented yet");
|
||||
}
|
||||
AgentCommand::ToggleCollector { name, enabled } => {
|
||||
info!("Processing ToggleCollector command: {} -> {}", name, enabled);
|
||||
info!(
|
||||
"Processing ToggleCollector command: {} -> {}",
|
||||
name, enabled
|
||||
);
|
||||
// Note: This would require dynamic collector management
|
||||
info!("Collector toggle requested but not implemented yet");
|
||||
}
|
||||
@@ -220,4 +236,4 @@ impl Agent {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
agent/src/cache/cached_metric.rs
@@ -8,4 +8,4 @@ pub struct CachedMetric {
|
||||
pub collected_at: Instant,
|
||||
pub access_count: u64,
|
||||
pub tier: Option<CacheTier>,
|
||||
}
|
||||
}
|
||||
|
||||
agent/src/cache/manager.rs
@@ -11,10 +11,8 @@ pub struct MetricCacheManager {
|
||||
impl MetricCacheManager {
|
||||
pub fn new(config: CacheConfig) -> Self {
|
||||
let cache = Arc::new(ConfigurableCache::new(config.clone()));
|
||||
|
||||
Self {
|
||||
cache,
|
||||
}
|
||||
|
||||
Self { cache }
|
||||
}
|
||||
|
||||
/// Start background cache management tasks
|
||||
@@ -32,5 +30,4 @@ impl MetricCacheManager {
|
||||
pub async fn get_all_cached_metrics(&self) -> Vec<Metric> {
|
||||
self.cache.get_all_cached_metrics().await
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
33
agent/src/cache/mod.rs
vendored
33
agent/src/cache/mod.rs
vendored
@@ -4,11 +4,11 @@ use std::time::Instant;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::warn;
|
||||
|
||||
mod manager;
|
||||
mod cached_metric;
|
||||
mod manager;
|
||||
|
||||
pub use manager::MetricCacheManager;
|
||||
pub use cached_metric::CachedMetric;
|
||||
pub use manager::MetricCacheManager;
|
||||
|
||||
/// Central cache for individual metrics with configurable tiers
|
||||
pub struct ConfigurableCache {
|
||||
@@ -31,7 +31,7 @@ impl ConfigurableCache {
|
||||
}
|
||||
|
||||
let mut cache = self.cache.write().await;
|
||||
|
||||
|
||||
// Enforce max entries limit
|
||||
if cache.len() >= self.config.max_entries {
|
||||
self.cleanup_old_entries(&mut cache).await;
|
||||
@@ -45,11 +45,10 @@ impl ConfigurableCache {
|
||||
};
|
||||
|
||||
cache.insert(metric.name.clone(), cached_metric);
|
||||
|
||||
|
||||
// Cached metric (debug logging disabled for performance)
|
||||
}
|
||||
|
||||
|
||||
/// Get all cached metrics (including expired ones) for broadcasting
|
||||
pub async fn get_all_cached_metrics(&self) -> Vec<Metric> {
|
||||
if !self.config.enabled {
|
||||
@@ -58,44 +57,46 @@ impl ConfigurableCache {
|
||||
|
||||
let cache = self.cache.read().await;
|
||||
let mut all_metrics = Vec::new();
|
||||
|
||||
|
||||
for cached_metric in cache.values() {
|
||||
all_metrics.push(cached_metric.metric.clone());
|
||||
}
|
||||
|
||||
|
||||
all_metrics
|
||||
}
|
||||
|
||||
/// Background cleanup of old entries
|
||||
async fn cleanup_old_entries(&self, cache: &mut HashMap<String, CachedMetric>) {
|
||||
let mut to_remove = Vec::new();
|
||||
|
||||
|
||||
for (metric_name, cached_metric) in cache.iter() {
|
||||
let cache_interval = self.config.get_cache_interval(metric_name);
|
||||
let elapsed = cached_metric.collected_at.elapsed().as_secs();
|
||||
|
||||
|
||||
// Remove entries that are way past their expiration (2x interval)
|
||||
if elapsed > cache_interval * 2 {
|
||||
to_remove.push(metric_name.clone());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for metric_name in to_remove {
|
||||
cache.remove(&metric_name);
|
||||
}
|
||||
|
||||
|
||||
// If still too many entries, remove least recently accessed
|
||||
if cache.len() >= self.config.max_entries {
|
||||
let mut entries: Vec<_> = cache.iter().map(|(k, v)| (k.clone(), v.access_count)).collect();
|
||||
let mut entries: Vec<_> = cache
|
||||
.iter()
|
||||
.map(|(k, v)| (k.clone(), v.access_count))
|
||||
.collect();
|
||||
entries.sort_by_key(|(_, access_count)| *access_count);
|
||||
|
||||
|
||||
let excess = cache.len() - (self.config.max_entries * 3 / 4); // Remove 25%
|
||||
for (metric_name, _) in entries.iter().take(excess) {
|
||||
cache.remove(metric_name);
|
||||
}
|
||||
|
||||
|
||||
warn!("Cache cleanup removed {} entries due to size limit", excess);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||
use chrono::Utc;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use tokio::fs;
|
||||
@@ -18,7 +18,8 @@ pub struct BackupCollector {
|
||||
impl BackupCollector {
|
||||
pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
|
||||
Self {
|
||||
backup_status_file: backup_status_file.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
|
||||
backup_status_file: backup_status_file
|
||||
.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
|
||||
max_age_hours,
|
||||
}
|
||||
}
|
||||
@@ -43,10 +44,16 @@ impl BackupCollector {
|
||||
Ok(dt) => dt.with_timezone(&Utc),
|
||||
Err(_) => {
|
||||
// Try parsing as naive datetime and assume UTC
|
||||
match chrono::NaiveDateTime::parse_from_str(&backup_status.start_time, "%Y-%m-%dT%H:%M:%S%.f") {
|
||||
match chrono::NaiveDateTime::parse_from_str(
|
||||
&backup_status.start_time,
|
||||
"%Y-%m-%dT%H:%M:%S%.f",
|
||||
) {
|
||||
Ok(naive_dt) => naive_dt.and_utc(),
|
||||
Err(_) => {
|
||||
error!("Failed to parse backup timestamp: {}", backup_status.start_time);
|
||||
error!(
|
||||
"Failed to parse backup timestamp: {}",
|
||||
backup_status.start_time
|
||||
);
|
||||
return Status::Unknown;
|
||||
}
|
||||
}
|
||||
@@ -63,7 +70,7 @@ impl BackupCollector {
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
},
|
||||
}
|
||||
"failed" => Status::Critical,
|
||||
"running" => Status::Ok, // Currently running is OK
|
||||
_ => Status::Unknown,
|
||||
@@ -78,7 +85,7 @@ impl BackupCollector {
|
||||
} else {
|
||||
Status::Critical
|
||||
}
|
||||
},
|
||||
}
|
||||
"failed" => Status::Critical,
|
||||
"disabled" => Status::Warning, // Service intentionally disabled
|
||||
"running" => Status::Ok,
|
||||
@@ -97,7 +104,7 @@ impl Collector for BackupCollector {
|
||||
"backup"
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let backup_status = self.read_backup_status().await?;
|
||||
let mut metrics = Vec::new();
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
@@ -114,7 +121,10 @@ impl Collector for BackupCollector {
|
||||
}),
|
||||
status: overall_status,
|
||||
timestamp,
|
||||
description: Some(format!("Backup: {} at {}", backup_status.status, backup_status.start_time)),
|
||||
description: Some(format!(
|
||||
"Backup: {} at {}",
|
||||
backup_status.status, backup_status.start_time
|
||||
)),
|
||||
unit: None,
|
||||
});
|
||||
|
||||
@@ -129,14 +139,18 @@ impl Collector for BackupCollector {
|
||||
});
|
||||
|
||||
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
|
||||
let last_updated_dt_result = chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
||||
.map(|dt| dt.with_timezone(&Utc))
|
||||
.or_else(|_| {
|
||||
// Try parsing as naive datetime and assume UTC
|
||||
chrono::NaiveDateTime::parse_from_str(&backup_status.last_updated, "%Y-%m-%dT%H:%M:%S%.f")
|
||||
let last_updated_dt_result =
|
||||
chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
||||
.map(|dt| dt.with_timezone(&Utc))
|
||||
.or_else(|_| {
|
||||
// Try parsing as naive datetime and assume UTC
|
||||
chrono::NaiveDateTime::parse_from_str(
|
||||
&backup_status.last_updated,
|
||||
"%Y-%m-%dT%H:%M:%S%.f",
|
||||
)
|
||||
.map(|naive_dt| naive_dt.and_utc())
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
if let Ok(last_updated_dt) = last_updated_dt_result {
|
||||
metrics.push(Metric {
|
||||
name: "backup_last_run_timestamp".to_string(),
|
||||
@@ -147,13 +161,16 @@ impl Collector for BackupCollector {
|
||||
unit: Some("unix_timestamp".to_string()),
|
||||
});
|
||||
} else {
|
||||
error!("Failed to parse backup timestamp for last_run_timestamp: {}", backup_status.last_updated);
|
||||
error!(
|
||||
"Failed to parse backup timestamp for last_run_timestamp: {}",
|
||||
backup_status.last_updated
|
||||
);
|
||||
}
|
||||
|
||||
// Individual service metrics
|
||||
for (service_name, service) in &backup_status.services {
|
||||
let service_status = self.calculate_service_status(service);
|
||||
|
||||
|
||||
// Service status
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_status", service_name),
|
||||
@@ -165,7 +182,10 @@ impl Collector for BackupCollector {
|
||||
}),
|
||||
status: service_status,
|
||||
timestamp,
|
||||
description: Some(format!("Backup service {} status: {}", service_name, service.status)),
|
||||
description: Some(format!(
|
||||
"Backup service {} status: {}",
|
||||
service_name, service.status
|
||||
)),
|
||||
unit: None,
|
||||
});
|
||||
|
||||
@@ -173,7 +193,11 @@ impl Collector for BackupCollector {
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_exit_code", service_name),
|
||||
value: MetricValue::Integer(service.exit_code),
|
||||
status: if service.exit_code == 0 { Status::Ok } else { Status::Critical },
|
||||
status: if service.exit_code == 0 {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Critical
|
||||
},
|
||||
timestamp,
|
||||
description: Some(format!("Exit code for backup service {}", service_name)),
|
||||
unit: None,
|
||||
@@ -222,7 +246,9 @@ impl Collector for BackupCollector {
|
||||
});
|
||||
|
||||
// Calculate total repository size
|
||||
let total_size_bytes: u64 = backup_status.services.values()
|
||||
let total_size_bytes: u64 = backup_status
|
||||
.services
|
||||
.values()
|
||||
.map(|s| s.repo_size_bytes)
|
||||
.sum();
|
||||
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
|
||||
@@ -301,7 +327,6 @@ impl Collector for BackupCollector {
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Add standalone disk identification metrics from TOML fields
|
||||
@@ -372,7 +397,7 @@ pub struct DiskSpace {
|
||||
pub used_gb: f64,
|
||||
pub available_gb: f64,
|
||||
pub usage_percent: f64,
|
||||
// Optional disk identification fields
|
||||
// Optional disk identification fields
|
||||
pub product_name: Option<String>,
|
||||
pub serial_number: Option<String>,
|
||||
}
|
||||
@@ -384,4 +409,4 @@ pub struct ServiceStatus {
|
||||
pub repo_path: String,
|
||||
pub archive_count: i64,
|
||||
pub repo_size_bytes: u64,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
@@ -17,41 +17,44 @@ use crate::config::CpuConfig;
|
||||
pub struct CpuCollector {
|
||||
config: CpuConfig,
|
||||
name: String,
|
||||
load_thresholds: HysteresisThresholds,
|
||||
temperature_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
impl CpuCollector {
|
||||
pub fn new(config: CpuConfig) -> Self {
|
||||
// Create hysteresis thresholds with 10% gap for recovery
|
||||
let load_thresholds = HysteresisThresholds::new(
|
||||
config.load_warning_threshold,
|
||||
config.load_critical_threshold,
|
||||
);
|
||||
|
||||
let temperature_thresholds = HysteresisThresholds::new(
|
||||
config.temperature_warning_threshold,
|
||||
config.temperature_critical_threshold,
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
name: "cpu".to_string(),
|
||||
load_thresholds,
|
||||
temperature_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate CPU load status using configured thresholds
|
||||
fn calculate_load_status(&self, load: f32) -> Status {
|
||||
if load >= self.config.load_critical_threshold {
|
||||
Status::Critical
|
||||
} else if load >= self.config.load_warning_threshold {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
/// Calculate CPU load status using hysteresis thresholds
|
||||
fn calculate_load_status(&self, metric_name: &str, load: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, load, &self.load_thresholds)
|
||||
}
|
||||
|
||||
/// Calculate CPU temperature status using configured thresholds
|
||||
fn calculate_temperature_status(&self, temp: f32) -> Status {
|
||||
if temp >= self.config.temperature_critical_threshold {
|
||||
Status::Critical
|
||||
} else if temp >= self.config.temperature_warning_threshold {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
/// Calculate CPU temperature status using hysteresis thresholds
|
||||
fn calculate_temperature_status(&self, metric_name: &str, temp: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, temp, &self.temperature_thresholds)
|
||||
}
|
||||
|
||||
/// Collect CPU load averages from /proc/loadavg
|
||||
/// Format: "0.52 0.58 0.59 1/257 12345"
|
||||
async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/loadavg")?;
|
||||
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
||||
|
||||
@@ -68,7 +71,7 @@ impl CpuCollector {
|
||||
|
||||
// Only apply thresholds to 5-minute load average
|
||||
let load_1min_status = Status::Ok; // No alerting on 1min
|
||||
let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts
|
||||
let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker); // Only 5min triggers alerts
|
||||
let load_15min_status = Status::Ok; // No alerting on 15min
|
||||
|
||||
Ok(vec![
|
||||
@@ -95,14 +98,14 @@ impl CpuCollector {
|
||||
|
||||
/// Collect CPU temperature from thermal zones
|
||||
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
|
||||
async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
|
||||
async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
|
||||
// Try x86_pkg_temp first (Intel CPU package temperature)
|
||||
if let Ok(temp) = self
|
||||
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
|
||||
.await
|
||||
{
|
||||
let temp_celsius = temp as f32 / 1000.0;
|
||||
let status = self.calculate_temperature_status(temp_celsius);
|
||||
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
@@ -120,7 +123,7 @@ impl CpuCollector {
|
||||
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
||||
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
||||
let temp_celsius = temp as f32 / 1000.0;
|
||||
let status = self.calculate_temperature_status(temp_celsius);
|
||||
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
@@ -200,17 +203,17 @@ impl Collector for CpuCollector {
|
||||
&self.name
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
debug!("Collecting CPU metrics");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
|
||||
|
||||
// Collect load averages (always available)
|
||||
metrics.extend(self.collect_load_averages().await?);
|
||||
metrics.extend(self.collect_load_averages(status_tracker).await?);
|
||||
|
||||
// Collect temperature (optional)
|
||||
if let Some(temp_metric) = self.collect_temperature().await? {
|
||||
if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
|
||||
metrics.push(temp_metric);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use crate::config::DiskConfig;
|
||||
use std::fs;
|
||||
@@ -28,11 +28,28 @@ struct MountedDisk {
|
||||
/// Disk usage collector for monitoring filesystem sizes
|
||||
pub struct DiskCollector {
|
||||
config: DiskConfig,
|
||||
temperature_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
impl DiskCollector {
|
||||
pub fn new(config: DiskConfig) -> Self {
|
||||
Self { config }
|
||||
// Create hysteresis thresholds for disk temperature
|
||||
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||
60.0, // warning at 60°C
|
||||
5.0, // 5°C gap for recovery
|
||||
70.0, // critical at 70°C
|
||||
5.0, // 5°C gap for recovery
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
temperature_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate disk temperature status using hysteresis thresholds
|
||||
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
|
||||
}
|
||||
|
||||
/// Resolve UUID to actual device path
|
||||
@@ -203,12 +220,6 @@ impl DiskCollector {
|
||||
Ok((total_bytes, used_bytes))
|
||||
}
|
||||
|
||||
/// Get root filesystem disk usage
|
||||
fn get_root_filesystem_usage(&self) -> Result<(u64, u64, f32)> {
|
||||
let (total_bytes, used_bytes) = self.get_filesystem_info("/")?;
|
||||
let usage_percent = (used_bytes as f64 / total_bytes as f64) * 100.0;
|
||||
Ok((total_bytes, used_bytes, usage_percent as f32))
|
||||
}
|
||||
|
||||
|
||||
/// Get the physical device for a given device (resolves symlinks, gets parent device)
|
||||
@@ -339,7 +350,7 @@ impl Collector for DiskCollector {
|
||||
"disk"
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Collecting multi-disk metrics");
|
||||
|
||||
@@ -497,13 +508,8 @@ impl Collector for DiskCollector {
|
||||
});
|
||||
|
||||
if temperature > 0.0 {
|
||||
let temp_status = if temperature >= 70.0 {
|
||||
Status::Critical
|
||||
} else if temperature >= 60.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
let metric_name = format!("disk_smart_{}_temperature", device_name);
|
||||
let temp_status = self.calculate_temperature_status(&metric_name, temperature, status_tracker);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_smart_{}_temperature", device_name),
|
||||
|
||||
@@ -4,7 +4,7 @@ use thiserror::Error;
|
||||
pub enum CollectorError {
|
||||
#[error("Failed to read system file {path}: {error}")]
|
||||
SystemRead { path: String, error: String },
|
||||
|
||||
|
||||
#[error("Failed to parse value '{value}': {error}")]
|
||||
Parse { value: String, error: String },
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError, utils};
|
||||
use super::{utils, Collector, CollectorError};
|
||||
use crate::config::MemoryConfig;
|
||||
|
||||
/// Extremely efficient memory metrics collector
|
||||
///
|
||||
///
|
||||
/// EFFICIENCY OPTIMIZATIONS:
|
||||
/// - Single /proc/meminfo read for all memory metrics
|
||||
/// - Minimal string parsing with split operations
|
||||
@@ -17,6 +17,7 @@ use crate::config::MemoryConfig;
|
||||
pub struct MemoryCollector {
|
||||
config: MemoryConfig,
|
||||
name: String,
|
||||
usage_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
/// Memory information parsed from /proc/meminfo
|
||||
@@ -33,36 +34,38 @@ struct MemoryInfo {
|
||||
|
||||
impl MemoryCollector {
|
||||
pub fn new(config: MemoryConfig) -> Self {
|
||||
// Create hysteresis thresholds with 5% gap for memory usage
|
||||
let usage_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||
config.usage_warning_percent,
|
||||
5.0, // 5% gap for warning recovery
|
||||
config.usage_critical_percent,
|
||||
5.0, // 5% gap for critical recovery
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
name: "memory".to_string(),
|
||||
|
||||
usage_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate memory usage status using configured thresholds
|
||||
fn calculate_usage_status(&self, usage_percent: f32) -> Status {
|
||||
if usage_percent >= self.config.usage_critical_percent {
|
||||
Status::Critical
|
||||
} else if usage_percent >= self.config.usage_warning_percent {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
|
||||
/// Calculate memory usage status using hysteresis thresholds
|
||||
fn calculate_usage_status(&self, metric_name: &str, usage_percent: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, usage_percent, &self.usage_thresholds)
|
||||
}
|
||||
|
||||
|
||||
/// Parse /proc/meminfo efficiently
|
||||
/// Format: "MemTotal: 16384000 kB"
|
||||
async fn parse_meminfo(&self) -> Result<MemoryInfo, CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/meminfo")?;
|
||||
let mut info = MemoryInfo::default();
|
||||
|
||||
|
||||
// Parse each line efficiently - only extract what we need
|
||||
for line in content.lines() {
|
||||
if let Some(colon_pos) = line.find(':') {
|
||||
let key = &line[..colon_pos];
|
||||
let value_part = &line[colon_pos + 1..];
|
||||
|
||||
|
||||
// Extract number from value part (format: " 12345 kB")
|
||||
if let Some(number_str) = value_part.split_whitespace().next() {
|
||||
if let Ok(value_kb) = utils::parse_u64(number_str) {
|
||||
@@ -80,7 +83,7 @@ impl MemoryCollector {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate that we got essential fields
|
||||
if info.total_kb == 0 {
|
||||
return Err(CollectorError::Parse {
|
||||
@@ -88,87 +91,105 @@ impl MemoryCollector {
|
||||
error: "MemTotal not found or zero in /proc/meminfo".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
// If MemAvailable is not available (older kernels), calculate it
|
||||
if info.available_kb == 0 {
|
||||
info.available_kb = info.free_kb + info.buffers_kb + info.cached_kb;
|
||||
}
|
||||
|
||||
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
|
||||
/// Convert KB to GB efficiently (avoiding floating point in hot path)
|
||||
fn kb_to_gb(kb: u64) -> f32 {
|
||||
kb as f32 / 1_048_576.0 // 1024 * 1024
|
||||
}
|
||||
|
||||
|
||||
/// Calculate memory metrics from parsed info
|
||||
fn calculate_metrics(&self, info: &MemoryInfo) -> Vec<Metric> {
|
||||
fn calculate_metrics(&self, info: &MemoryInfo, status_tracker: &mut StatusTracker) -> Vec<Metric> {
|
||||
let mut metrics = Vec::with_capacity(6);
|
||||
|
||||
|
||||
// Calculate derived values
|
||||
let used_kb = info.total_kb - info.available_kb;
|
||||
let usage_percent = (used_kb as f32 / info.total_kb as f32) * 100.0;
|
||||
let usage_status = self.calculate_usage_status(usage_percent);
|
||||
|
||||
let usage_status = self.calculate_usage_status(registry::MEMORY_USAGE_PERCENT, usage_percent, status_tracker);
|
||||
|
||||
let swap_used_kb = info.swap_total_kb - info.swap_free_kb;
|
||||
|
||||
|
||||
// Convert to GB for metrics
|
||||
let total_gb = Self::kb_to_gb(info.total_kb);
|
||||
let used_gb = Self::kb_to_gb(used_kb);
|
||||
let available_gb = Self::kb_to_gb(info.available_kb);
|
||||
let swap_total_gb = Self::kb_to_gb(info.swap_total_kb);
|
||||
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
|
||||
|
||||
|
||||
// Memory usage percentage (primary metric with status)
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_USAGE_PERCENT.to_string(),
|
||||
MetricValue::Float(usage_percent),
|
||||
usage_status,
|
||||
).with_description("Memory usage percentage".to_string())
|
||||
.with_unit("%".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_USAGE_PERCENT.to_string(),
|
||||
MetricValue::Float(usage_percent),
|
||||
usage_status,
|
||||
)
|
||||
.with_description("Memory usage percentage".to_string())
|
||||
.with_unit("%".to_string()),
|
||||
);
|
||||
|
||||
// Total memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(total_gb),
|
||||
Status::Ok, // Total memory doesn't have status
|
||||
).with_description("Total system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(total_gb),
|
||||
Status::Ok, // Total memory doesn't have status
|
||||
)
|
||||
.with_description("Total system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Used memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_USED_GB.to_string(),
|
||||
MetricValue::Float(used_gb),
|
||||
Status::Ok, // Used memory absolute value doesn't have status
|
||||
).with_description("Used system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_USED_GB.to_string(),
|
||||
MetricValue::Float(used_gb),
|
||||
Status::Ok, // Used memory absolute value doesn't have status
|
||||
)
|
||||
.with_description("Used system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Available memory
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_AVAILABLE_GB.to_string(),
|
||||
MetricValue::Float(available_gb),
|
||||
Status::Ok, // Available memory absolute value doesn't have status
|
||||
).with_description("Available system memory".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_AVAILABLE_GB.to_string(),
|
||||
MetricValue::Float(available_gb),
|
||||
Status::Ok, // Available memory absolute value doesn't have status
|
||||
)
|
||||
.with_description("Available system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Swap metrics (only if swap exists)
|
||||
if info.swap_total_kb > 0 {
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(swap_total_gb),
|
||||
Status::Ok,
|
||||
).with_description("Total swap space".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
|
||||
metrics.push(Metric::new(
|
||||
registry::MEMORY_SWAP_USED_GB.to_string(),
|
||||
MetricValue::Float(swap_used_gb),
|
||||
Status::Ok,
|
||||
).with_description("Used swap space".to_string())
|
||||
.with_unit("GB".to_string()));
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(swap_total_gb),
|
||||
Status::Ok,
|
||||
)
|
||||
.with_description("Total swap space".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_SWAP_USED_GB.to_string(),
|
||||
MetricValue::Float(swap_used_gb),
|
||||
Status::Ok,
|
||||
)
|
||||
.with_description("Used swap space".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
metrics
|
||||
}
|
||||
}
|
||||
@@ -178,34 +199,39 @@ impl Collector for MemoryCollector {
|
||||
fn name(&self) -> &str {
|
||||
&self.name
|
||||
}
|
||||
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
|
||||
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
debug!("Collecting memory metrics");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
|
||||
// Parse memory info from /proc/meminfo
|
||||
let info = self.parse_meminfo().await?;
|
||||
|
||||
|
||||
// Calculate all metrics from parsed info
|
||||
let metrics = self.calculate_metrics(&info);
|
||||
|
||||
let metrics = self.calculate_metrics(&info, status_tracker);
|
||||
|
||||
let duration = start.elapsed();
|
||||
debug!("Memory collection completed in {:?} with {} metrics", duration, metrics.len());
|
||||
|
||||
debug!(
|
||||
"Memory collection completed in {:?} with {} metrics",
|
||||
duration,
|
||||
metrics.len()
|
||||
);
|
||||
|
||||
// Efficiency check: warn if collection takes too long
|
||||
if duration.as_millis() > 1 {
|
||||
debug!("Memory collection took {}ms - consider optimization", duration.as_millis());
|
||||
debug!(
|
||||
"Memory collection took {}ms - consider optimization",
|
||||
duration.as_millis()
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
// Store performance metrics
|
||||
// Performance tracking handled by cache system
|
||||
|
||||
|
||||
Ok(metrics)
|
||||
}
|
||||
|
||||
|
||||
fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
|
||||
None // Performance tracking handled by cache system
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,7 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::Metric;
|
||||
use cm_dashboard_shared::{Metric, StatusTracker};
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod cpu;
|
||||
pub mod memory;
|
||||
pub mod disk;
|
||||
pub mod systemd;
|
||||
pub mod backup;
|
||||
pub mod error;
|
||||
|
||||
pub use error::CollectorError;
|
||||
|
||||
/// Performance metrics for a collector
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PerformanceMetrics {
|
||||
@@ -18,69 +9,78 @@ pub struct PerformanceMetrics {
|
||||
pub collection_efficiency_percent: f32,
|
||||
}
|
||||
|
||||
pub mod backup;
|
||||
pub mod cpu;
|
||||
pub mod disk;
|
||||
pub mod error;
|
||||
pub mod memory;
|
||||
pub mod systemd;
|
||||
|
||||
pub use error::CollectorError;
|
||||
|
||||
|
||||
/// Base trait for all collectors with extreme efficiency requirements
|
||||
#[async_trait]
|
||||
pub trait Collector: Send + Sync {
|
||||
/// Name of this collector
|
||||
fn name(&self) -> &str;
|
||||
|
||||
|
||||
/// Collect all metrics this collector provides
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError>;
|
||||
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError>;
|
||||
|
||||
/// Get performance metrics for monitoring collector efficiency
|
||||
fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
|
||||
None
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// CPU efficiency rules for all collectors
|
||||
pub mod efficiency {
|
||||
/// CRITICAL: All collectors must follow these efficiency rules to minimize system impact
|
||||
|
||||
/// 1. FILE READING RULES
|
||||
/// - Read entire files in single syscall when possible
|
||||
/// - Use BufReader only for very large files (>4KB)
|
||||
/// - Never read files character by character
|
||||
/// - Cache file descriptors when safe (immutable paths)
|
||||
|
||||
/// 2. PARSING RULES
|
||||
/// - Use split() instead of regex for simple patterns
|
||||
/// - Parse numbers with from_str() not complex parsing
|
||||
/// - Avoid string allocations in hot paths
|
||||
/// - Use str::trim() before parsing numbers
|
||||
|
||||
/// 3. MEMORY ALLOCATION RULES
|
||||
/// - Reuse Vec buffers when possible
|
||||
/// - Pre-allocate collections with known sizes
|
||||
/// - Use str slices instead of String when possible
|
||||
/// - Avoid clone() in hot paths
|
||||
|
||||
/// 4. SYSTEM CALL RULES
|
||||
/// - Minimize syscalls - prefer single reads over multiple
|
||||
/// - Use /proc filesystem efficiently
|
||||
/// - Avoid spawning processes when /proc data available
|
||||
/// - Cache static data (like CPU count)
|
||||
|
||||
/// 5. ERROR HANDLING RULES
|
||||
/// - Use Result<> but minimize allocation in error paths
|
||||
/// - Log errors at debug level only to avoid I/O overhead
|
||||
/// - Graceful degradation - missing metrics better than failing
|
||||
/// - Never panic in collectors
|
||||
|
||||
/// 6. CONCURRENCY RULES
|
||||
/// - Collectors must be thread-safe but avoid locks
|
||||
/// - Use atomic operations for simple counters
|
||||
/// - Avoid shared mutable state between collections
|
||||
/// - Each collection should be independent
|
||||
|
||||
pub const PERFORMANCE_TARGET_OVERHEAD_PERCENT: f32 = 0.1;
|
||||
//! CRITICAL: All collectors must follow these efficiency rules to minimize system impact
|
||||
//!
|
||||
//! # FILE READING RULES
|
||||
//! - Read entire files in single syscall when possible
|
||||
//! - Use BufReader only for very large files (>4KB)
|
||||
//! - Never read files character by character
|
||||
//! - Cache file descriptors when safe (immutable paths)
|
||||
//!
|
||||
//! # PARSING RULES
|
||||
//! - Use split() instead of regex for simple patterns
|
||||
//! - Parse numbers with from_str() not complex parsing
|
||||
//! - Avoid string allocations in hot paths
|
||||
//! - Use str::trim() before parsing numbers
|
||||
//!
|
||||
//! # MEMORY ALLOCATION RULES
|
||||
//! - Reuse Vec buffers when possible
|
||||
//! - Pre-allocate collections with known sizes
|
||||
//! - Use str slices instead of String when possible
|
||||
//! - Avoid clone() in hot paths
|
||||
//!
|
||||
//! # SYSTEM CALL RULES
|
||||
//! - Minimize syscalls - prefer single reads over multiple
|
||||
//! - Use /proc filesystem efficiently
|
||||
//! - Avoid spawning processes when /proc data available
|
||||
//! - Cache static data (like CPU count)
|
||||
//!
|
||||
//! # ERROR HANDLING RULES
|
||||
//! - Use Result<> but minimize allocation in error paths
|
||||
//! - Log errors at debug level only to avoid I/O overhead
|
||||
//! - Graceful degradation - missing metrics better than failing
|
||||
//! - Never panic in collectors
|
||||
//!
|
||||
//! # CONCURRENCY RULES
|
||||
//! - Collectors must be thread-safe but avoid locks
|
||||
//! - Use atomic operations for simple counters
|
||||
//! - Avoid shared mutable state between collections
|
||||
//! - Each collection should be independent
|
||||
}
|
||||
|
||||
/// Utility functions for efficient system data collection
|
||||
pub mod utils {
|
||||
use std::fs;
|
||||
use super::CollectorError;
|
||||
|
||||
use std::fs;
|
||||
|
||||
/// Read entire file content efficiently
|
||||
pub fn read_proc_file(path: &str) -> Result<String, CollectorError> {
|
||||
fs::read_to_string(path).map_err(|e| CollectorError::SystemRead {
|
||||
@@ -88,25 +88,25 @@ pub mod utils {
|
||||
error: e.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Parse float from string slice efficiently
|
||||
pub fn parse_f32(s: &str) -> Result<f32, CollectorError> {
|
||||
s.trim().parse().map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
s.trim()
|
||||
.parse()
|
||||
.map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Parse integer from string slice efficiently
|
||||
pub fn parse_u64(s: &str) -> Result<u64, CollectorError> {
|
||||
s.trim().parse().map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
s.trim()
|
||||
.parse()
|
||||
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||
value: s.to_string(),
|
||||
error: e.to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Split string and get nth element safely
|
||||
pub fn split_nth<'a>(s: &'a str, delimiter: char, n: usize) -> Option<&'a str> {
|
||||
s.split(delimiter).nth(n)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||
use std::process::Command;
|
||||
use std::sync::RwLock;
|
||||
use std::time::Instant;
|
||||
@@ -401,7 +401,7 @@ impl Collector for SystemdCollector {
|
||||
"systemd"
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Collecting systemd services metrics");
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::{MetricMessage, MessageEnvelope};
|
||||
use tracing::{info, debug};
|
||||
use cm_dashboard_shared::{MessageEnvelope, MetricMessage};
|
||||
use tracing::{debug, info};
|
||||
use zmq::{Context, Socket, SocketType};
|
||||
|
||||
use crate::config::ZmqConfig;
|
||||
@@ -15,75 +15,69 @@ pub struct ZmqHandler {
|
||||
impl ZmqHandler {
|
||||
pub async fn new(config: &ZmqConfig) -> Result<Self> {
|
||||
let context = Context::new();
|
||||
|
||||
|
||||
// Create publisher socket for metrics
|
||||
let publisher = context.socket(SocketType::PUB)?;
|
||||
let pub_bind_address = format!("tcp://{}:{}", config.bind_address, config.publisher_port);
|
||||
publisher.bind(&pub_bind_address)?;
|
||||
|
||||
|
||||
info!("ZMQ publisher bound to {}", pub_bind_address);
|
||||
|
||||
|
||||
// Set socket options for efficiency
|
||||
publisher.set_sndhwm(1000)?; // High water mark for outbound messages
|
||||
publisher.set_linger(1000)?; // Linger time on close
|
||||
|
||||
|
||||
// Create command receiver socket (PULL socket to receive commands from dashboard)
|
||||
let command_receiver = context.socket(SocketType::PULL)?;
|
||||
let cmd_bind_address = format!("tcp://{}:{}", config.bind_address, config.command_port);
|
||||
command_receiver.bind(&cmd_bind_address)?;
|
||||
|
||||
|
||||
info!("ZMQ command receiver bound to {}", cmd_bind_address);
|
||||
|
||||
|
||||
// Set non-blocking mode for command receiver
|
||||
command_receiver.set_rcvtimeo(0)?; // Non-blocking receive
|
||||
command_receiver.set_linger(1000)?;
|
||||
|
||||
|
||||
Ok(Self {
|
||||
publisher,
|
||||
command_receiver,
|
||||
config: config.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
/// Publish metrics message via ZMQ
|
||||
pub async fn publish_metrics(&self, message: &MetricMessage) -> Result<()> {
|
||||
debug!("Publishing {} metrics for host {}", message.metrics.len(), message.hostname);
|
||||
|
||||
debug!(
|
||||
"Publishing {} metrics for host {}",
|
||||
message.metrics.len(),
|
||||
message.hostname
|
||||
);
|
||||
|
||||
// Create message envelope
|
||||
let envelope = MessageEnvelope::metrics(message.clone())
|
||||
.map_err(|e| anyhow::anyhow!("Failed to create message envelope: {}", e))?;
|
||||
|
||||
|
||||
// Serialize envelope
|
||||
let serialized = serde_json::to_vec(&envelope)?;
|
||||
|
||||
|
||||
// Send via ZMQ
|
||||
self.publisher.send(&serialized, 0)?;
|
||||
|
||||
|
||||
debug!("Published metrics message ({} bytes)", serialized.len());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Send heartbeat (placeholder for future use)
|
||||
pub async fn send_heartbeat(&self) -> Result<()> {
|
||||
let envelope = MessageEnvelope::heartbeat()
|
||||
.map_err(|e| anyhow::anyhow!("Failed to create heartbeat envelope: {}", e))?;
|
||||
|
||||
let serialized = serde_json::to_vec(&envelope)?;
|
||||
self.publisher.send(&serialized, 0)?;
|
||||
|
||||
debug!("Sent heartbeat");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Try to receive a command (non-blocking)
|
||||
pub fn try_receive_command(&self) -> Result<Option<AgentCommand>> {
|
||||
match self.command_receiver.recv_bytes(zmq::DONTWAIT) {
|
||||
Ok(bytes) => {
|
||||
debug!("Received command message ({} bytes)", bytes.len());
|
||||
|
||||
|
||||
let command: AgentCommand = serde_json::from_slice(&bytes)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to deserialize command: {}", e))?;
|
||||
|
||||
|
||||
debug!("Parsed command: {:?}", command);
|
||||
Ok(Some(command))
|
||||
}
|
||||
@@ -107,4 +101,4 @@ pub enum AgentCommand {
|
||||
ToggleCollector { name: String, enabled: bool },
|
||||
/// Request status/health check
|
||||
Ping,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
use anyhow::{Context, Result};
|
||||
use std::path::Path;
|
||||
use std::fs;
|
||||
use crate::config::AgentConfig;
|
||||
use anyhow::{Context, Result};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
pub fn load_config<P: AsRef<Path>>(path: P) -> Result<AgentConfig> {
|
||||
let path = path.as_ref();
|
||||
let content = fs::read_to_string(path)
|
||||
.with_context(|| format!("Failed to read config file: {}", path.display()))?;
|
||||
|
||||
|
||||
let config: AgentConfig = toml::from_str(&content)
|
||||
.with_context(|| format!("Failed to parse config file: {}", path.display()))?;
|
||||
|
||||
config.validate()
|
||||
|
||||
config
|
||||
.validate()
|
||||
.with_context(|| format!("Invalid configuration in file: {}", path.display()))?;
|
||||
|
||||
|
||||
Ok(config)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::CacheConfig;
|
||||
use gethostname::gethostname;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::Path;
|
||||
|
||||
|
||||
@@ -1,114 +1,126 @@
|
||||
use anyhow::{bail, Result};
|
||||
use crate::config::AgentConfig;
|
||||
use anyhow::{bail, Result};
|
||||
|
||||
pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
||||
// Validate ZMQ configuration
|
||||
if config.zmq.publisher_port == 0 {
|
||||
bail!("ZMQ publisher port cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.command_port == 0 {
|
||||
bail!("ZMQ command port cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.publisher_port == config.zmq.command_port {
|
||||
bail!("ZMQ publisher and command ports cannot be the same");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.bind_address.is_empty() {
|
||||
bail!("ZMQ bind address cannot be empty");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.timeout_ms == 0 {
|
||||
bail!("ZMQ timeout cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
// Validate collection interval
|
||||
if config.collection_interval_seconds == 0 {
|
||||
bail!("Collection interval cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
// Validate CPU thresholds
|
||||
if config.collectors.cpu.enabled {
|
||||
if config.collectors.cpu.load_warning_threshold <= 0.0 {
|
||||
bail!("CPU load warning threshold must be positive");
|
||||
}
|
||||
|
||||
if config.collectors.cpu.load_critical_threshold <= config.collectors.cpu.load_warning_threshold {
|
||||
|
||||
if config.collectors.cpu.load_critical_threshold
|
||||
<= config.collectors.cpu.load_warning_threshold
|
||||
{
|
||||
bail!("CPU load critical threshold must be greater than warning threshold");
|
||||
}
|
||||
|
||||
|
||||
if config.collectors.cpu.temperature_warning_threshold <= 0.0 {
|
||||
bail!("CPU temperature warning threshold must be positive");
|
||||
}
|
||||
|
||||
if config.collectors.cpu.temperature_critical_threshold <= config.collectors.cpu.temperature_warning_threshold {
|
||||
|
||||
if config.collectors.cpu.temperature_critical_threshold
|
||||
<= config.collectors.cpu.temperature_warning_threshold
|
||||
{
|
||||
bail!("CPU temperature critical threshold must be greater than warning threshold");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate memory thresholds
|
||||
if config.collectors.memory.enabled {
|
||||
if config.collectors.memory.usage_warning_percent <= 0.0 || config.collectors.memory.usage_warning_percent > 100.0 {
|
||||
if config.collectors.memory.usage_warning_percent <= 0.0
|
||||
|| config.collectors.memory.usage_warning_percent > 100.0
|
||||
{
|
||||
bail!("Memory usage warning threshold must be between 0 and 100");
|
||||
}
|
||||
|
||||
if config.collectors.memory.usage_critical_percent <= config.collectors.memory.usage_warning_percent
|
||||
|| config.collectors.memory.usage_critical_percent > 100.0 {
|
||||
|
||||
if config.collectors.memory.usage_critical_percent
|
||||
<= config.collectors.memory.usage_warning_percent
|
||||
|| config.collectors.memory.usage_critical_percent > 100.0
|
||||
{
|
||||
bail!("Memory usage critical threshold must be between warning threshold and 100");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate disk thresholds
|
||||
if config.collectors.disk.enabled {
|
||||
if config.collectors.disk.usage_warning_percent <= 0.0 || config.collectors.disk.usage_warning_percent > 100.0 {
|
||||
if config.collectors.disk.usage_warning_percent <= 0.0
|
||||
|| config.collectors.disk.usage_warning_percent > 100.0
|
||||
{
|
||||
bail!("Disk usage warning threshold must be between 0 and 100");
|
||||
}
|
||||
|
||||
if config.collectors.disk.usage_critical_percent <= config.collectors.disk.usage_warning_percent
|
||||
|| config.collectors.disk.usage_critical_percent > 100.0 {
|
||||
|
||||
if config.collectors.disk.usage_critical_percent
|
||||
<= config.collectors.disk.usage_warning_percent
|
||||
|| config.collectors.disk.usage_critical_percent > 100.0
|
||||
{
|
||||
bail!("Disk usage critical threshold must be between warning threshold and 100");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate SMTP configuration
|
||||
if config.notifications.enabled {
|
||||
if config.notifications.smtp_host.is_empty() {
|
||||
bail!("SMTP host cannot be empty when notifications are enabled");
|
||||
}
|
||||
|
||||
|
||||
if config.notifications.smtp_port == 0 {
|
||||
bail!("SMTP port cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.notifications.from_email.is_empty() {
|
||||
bail!("From email cannot be empty when notifications are enabled");
|
||||
}
|
||||
|
||||
|
||||
if config.notifications.to_email.is_empty() {
|
||||
bail!("To email cannot be empty when notifications are enabled");
|
||||
}
|
||||
|
||||
|
||||
// Basic email validation
|
||||
if !config.notifications.from_email.contains('@') {
|
||||
bail!("From email must contain @ symbol");
|
||||
}
|
||||
|
||||
|
||||
if !config.notifications.to_email.contains('@') {
|
||||
bail!("To email must contain @ symbol");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate cache configuration
|
||||
if config.cache.enabled {
|
||||
if config.cache.default_ttl_seconds == 0 {
|
||||
bail!("Cache TTL cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.cache.max_entries == 0 {
|
||||
bail!("Cache max entries cannot be 0");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use tracing::{info, error};
|
||||
use tracing::{error, info};
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
mod agent;
|
||||
mod cache;
|
||||
mod config;
|
||||
mod communication;
|
||||
mod metrics;
|
||||
mod collectors;
|
||||
mod communication;
|
||||
mod config;
|
||||
mod metrics;
|
||||
mod notifications;
|
||||
mod utils;
|
||||
|
||||
@@ -22,7 +22,7 @@ struct Cli {
|
||||
/// Increase logging verbosity (-v, -vv)
|
||||
#[arg(short, long, action = clap::ArgAction::Count)]
|
||||
verbose: u8,
|
||||
|
||||
|
||||
/// Configuration file path
|
||||
#[arg(short, long)]
|
||||
config: Option<String>,
|
||||
@@ -31,32 +31,32 @@ struct Cli {
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let cli = Cli::parse();
|
||||
|
||||
|
||||
// Setup logging
|
||||
let log_level = match cli.verbose {
|
||||
0 => "info",
|
||||
1 => "debug",
|
||||
1 => "debug",
|
||||
_ => "trace",
|
||||
};
|
||||
|
||||
|
||||
tracing_subscriber::fmt()
|
||||
.with_env_filter(EnvFilter::from_default_env().add_directive(log_level.parse()?))
|
||||
.init();
|
||||
|
||||
|
||||
info!("CM Dashboard Agent starting with individual metrics architecture...");
|
||||
|
||||
|
||||
// Create and run agent
|
||||
let mut agent = Agent::new(cli.config).await?;
|
||||
|
||||
|
||||
// Setup graceful shutdown channel
|
||||
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
|
||||
let ctrl_c = async {
|
||||
tokio::signal::ctrl_c()
|
||||
.await
|
||||
.expect("failed to install Ctrl+C handler");
|
||||
};
|
||||
|
||||
|
||||
// Run agent with graceful shutdown
|
||||
tokio::select! {
|
||||
result = agent.run(shutdown_rx) => {
|
||||
@@ -72,7 +72,7 @@ async fn main() -> Result<()> {
|
||||
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
info!("Agent shutdown complete");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::Metric;
|
||||
use cm_dashboard_shared::{Metric, StatusTracker};
|
||||
use std::collections::HashMap;
|
||||
use std::time::Instant;
|
||||
use tracing::{debug, error, info};
|
||||
@@ -16,6 +16,7 @@ pub struct MetricCollectionManager {
|
||||
collectors: Vec<Box<dyn Collector>>,
|
||||
cache_manager: MetricCacheManager,
|
||||
last_collection_times: HashMap<String, Instant>,
|
||||
status_tracker: StatusTracker,
|
||||
}
|
||||
|
||||
impl MetricCollectionManager {
|
||||
@@ -117,6 +118,7 @@ impl MetricCollectionManager {
|
||||
collectors,
|
||||
cache_manager,
|
||||
last_collection_times: HashMap::new(),
|
||||
status_tracker: StatusTracker::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -134,7 +136,7 @@ impl MetricCollectionManager {
|
||||
for collector in &self.collectors {
|
||||
let collector_name = collector.name();
|
||||
|
||||
match collector.collect().await {
|
||||
match collector.collect(&mut self.status_tracker).await {
|
||||
Ok(metrics) => {
|
||||
info!(
|
||||
"Force collected {} metrics from {} collector",
|
||||
@@ -200,7 +202,7 @@ impl MetricCollectionManager {
|
||||
|
||||
if should_collect {
|
||||
collecting_fresh.insert(collector_name.to_string());
|
||||
match collector.collect().await {
|
||||
match collector.collect(&mut self.status_tracker).await {
|
||||
Ok(metrics) => {
|
||||
// Collector returned fresh metrics (debug logging disabled for performance)
|
||||
|
||||
|
||||
@@ -3,41 +3,42 @@
|
||||
/// System information utilities
|
||||
pub mod system {
|
||||
use std::fs;
|
||||
|
||||
|
||||
/// Get number of CPU cores efficiently
|
||||
pub fn get_cpu_count() -> Result<usize, std::io::Error> {
|
||||
// Try /proc/cpuinfo first (most reliable)
|
||||
if let Ok(content) = fs::read_to_string("/proc/cpuinfo") {
|
||||
let count = content.lines()
|
||||
let count = content
|
||||
.lines()
|
||||
.filter(|line| line.starts_with("processor"))
|
||||
.count();
|
||||
|
||||
|
||||
if count > 0 {
|
||||
return Ok(count);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Fallback to nproc equivalent
|
||||
match std::thread::available_parallelism() {
|
||||
Ok(count) => Ok(count.get()),
|
||||
Err(_) => Ok(1), // Default to 1 core if all else fails
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Check if running in container
|
||||
pub fn is_container() -> bool {
|
||||
// Check for common container indicators
|
||||
fs::metadata("/.dockerenv").is_ok() ||
|
||||
fs::read_to_string("/proc/1/cgroup")
|
||||
.map(|content| content.contains("docker") || content.contains("containerd"))
|
||||
.unwrap_or(false)
|
||||
fs::metadata("/.dockerenv").is_ok()
|
||||
|| fs::read_to_string("/proc/1/cgroup")
|
||||
.map(|content| content.contains("docker") || content.contains("containerd"))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
||||
/// Time utilities
|
||||
pub mod time {
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
|
||||
/// Measure execution time of a closure
|
||||
pub fn measure_time<F, R>(f: F) -> (R, Duration)
|
||||
where
|
||||
@@ -54,14 +55,14 @@ pub mod time {
|
||||
pub mod perf {
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::warn;
|
||||
|
||||
|
||||
/// Performance monitor for critical operations
|
||||
pub struct PerfMonitor {
|
||||
operation: String,
|
||||
start: Instant,
|
||||
warning_threshold: Duration,
|
||||
}
|
||||
|
||||
|
||||
impl PerfMonitor {
|
||||
pub fn new(operation: &str, warning_threshold: Duration) -> Self {
|
||||
Self {
|
||||
@@ -70,12 +71,12 @@ pub mod perf {
|
||||
warning_threshold,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn new_ms(operation: &str, warning_threshold_ms: u64) -> Self {
|
||||
Self::new(operation, Duration::from_millis(warning_threshold_ms))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl Drop for PerfMonitor {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed();
|
||||
@@ -87,4 +88,4 @@ pub mod perf {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||