Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions
--- a/shared/src/cache.rs
+++ b/shared/src/cache.rs
@@ -24,29 +24,47 @@ pub struct CacheConfig {
 impl Default for CacheConfig {
    fn default() -> Self {
        let mut tiers = HashMap::new();
-        tiers.insert("realtime".to_string(), CacheTier {
-            interval_seconds: 2,
-            description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)".to_string(),
-        });
-        tiers.insert("disk_light".to_string(), CacheTier {
-            interval_seconds: 60,
-            description: "Light disk operations - 1 minute (service status checks)".to_string(),
-        });
-        tiers.insert("disk_medium".to_string(), CacheTier {
-            interval_seconds: 300,
-            description: "Medium disk operations - 5 minutes (disk usage, service disk)".to_string(),
-        });
-        tiers.insert("disk_heavy".to_string(), CacheTier {
-            interval_seconds: 900,
-            description: "Heavy disk operations - 15 minutes (SMART data, backup status)".to_string(),
-        });
-        tiers.insert("static".to_string(), CacheTier {
-            interval_seconds: 3600,
-            description: "Hardware info that rarely changes - 1 hour".to_string(),
-        });
+        tiers.insert(
+            "realtime".to_string(),
+            CacheTier {
+                interval_seconds: 2,
+                description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)"
+                    .to_string(),
+            },
+        );
+        tiers.insert(
+            "disk_light".to_string(),
+            CacheTier {
+                interval_seconds: 10,
+                description: "Light disk operations - 10 seconds (service status checks)".to_string(),
+            },
+        );
+        tiers.insert(
+            "disk_medium".to_string(),
+            CacheTier {
+                interval_seconds: 60,
+                description: "Medium disk operations - 1 minute (disk usage, service disk)"
+                    .to_string(),
+            },
+        );
+        tiers.insert(
+            "disk_heavy".to_string(),
+            CacheTier {
+                interval_seconds: 60,
+                description: "Heavy disk operations - 1 minute (backup status)"
+                    .to_string(),
+            },
+        );
+        tiers.insert(
+            "static".to_string(),
+            CacheTier {
+                interval_seconds: 600,
+                description: "SMART data operations - 10 minutes".to_string(),
+            },
+        );

        let mut metric_assignments = HashMap::new();
-        
+
        // REALTIME (2s) - Memory/CPU operations, no disk I/O
        metric_assignments.insert("cpu_load_*".to_string(), "realtime".to_string());
        metric_assignments.insert("cpu_temperature_*".to_string(), "realtime".to_string());
@@ -55,22 +73,24 @@ impl Default for CacheConfig {
        metric_assignments.insert("service_*_cpu_percent".to_string(), "realtime".to_string());
        metric_assignments.insert("service_*_memory_mb".to_string(), "realtime".to_string());
        metric_assignments.insert("network_*".to_string(), "realtime".to_string());
-        
+
        // DISK_LIGHT (10s) - Light disk operations: service status checks
        metric_assignments.insert("service_*_status".to_string(), "disk_light".to_string());
-        
+
        // DISK_MEDIUM (1min) - Medium disk operations: du commands, disk usage
        metric_assignments.insert("service_*_disk_gb".to_string(), "disk_medium".to_string());
        metric_assignments.insert("disk_tmp_*".to_string(), "disk_medium".to_string());
        metric_assignments.insert("disk_*_usage_*".to_string(), "disk_medium".to_string());
        metric_assignments.insert("disk_*_size_*".to_string(), "disk_medium".to_string());
-        
-        // DISK_HEAVY (15min) - Heavy disk operations: SMART data, backup status
-        metric_assignments.insert("disk_*_temperature".to_string(), "disk_heavy".to_string());
-        metric_assignments.insert("disk_*_wear_percent".to_string(), "disk_heavy".to_string());
-        metric_assignments.insert("smart_*".to_string(), "disk_heavy".to_string());
+
+        // DISK_HEAVY (1min) - Heavy disk operations: backup status
        metric_assignments.insert("backup_*".to_string(), "disk_heavy".to_string());

+        // STATIC (10min) - SMART data operations
+        metric_assignments.insert("disk_*_temperature".to_string(), "static".to_string());
+        metric_assignments.insert("disk_*_wear_percent".to_string(), "static".to_string());
+        metric_assignments.insert("smart_*".to_string(), "static".to_string());
+
        Self {
            enabled: true,
            default_ttl_seconds: 30,
@@ -101,11 +121,11 @@ impl CacheConfig {
        if pattern.contains('*') {
            // Convert pattern to regex-like matching
            let pattern_parts: Vec<&str> = pattern.split('*').collect();
-            
+
            if pattern_parts.len() == 2 {
                let prefix = pattern_parts[0];
                let suffix = pattern_parts[1];
-                
+
                if suffix.is_empty() {
                    // Pattern like "cpu_*" - just check prefix
                    metric_name.starts_with(prefix)
@@ -118,9 +138,9 @@ impl CacheConfig {
                }
            } else {
                // More complex patterns - for now, just check if all parts are present
-                pattern_parts.iter().all(|part| {
-                    part.is_empty() || metric_name.contains(part)
-                })
+                pattern_parts
+                    .iter()
+                    .all(|part| part.is_empty() || metric_name.contains(part))
            }
        } else {
            metric_name == pattern
@@ -142,7 +162,7 @@ mod tests {
    #[test]
    fn test_pattern_matching() {
        let config = CacheConfig::default();
-        
+
        assert!(config.matches_pattern("cpu_load_1min", "cpu_load_*"));
        assert!(config.matches_pattern("service_nginx_disk_gb", "service_*_disk_gb"));
        assert!(!config.matches_pattern("memory_usage_percent", "cpu_load_*"));
@@ -151,21 +171,21 @@ mod tests {
    #[test]
    fn test_tier_assignment() {
        let config = CacheConfig::default();
-        
+
        // Realtime (2s) - CPU/Memory operations
        assert_eq!(config.get_cache_interval("cpu_load_1min"), 2);
        assert_eq!(config.get_cache_interval("memory_usage_percent"), 2);
        assert_eq!(config.get_cache_interval("service_nginx_cpu_percent"), 2);
-        
-        // Disk light (60s) - Service status
-        assert_eq!(config.get_cache_interval("service_nginx_status"), 60);
-        
-        // Disk medium (300s) - Disk usage  
-        assert_eq!(config.get_cache_interval("service_nginx_disk_gb"), 300);
-        assert_eq!(config.get_cache_interval("disk_tmp_usage_percent"), 300);
-        
-        // Disk heavy (900s) - SMART data
-        assert_eq!(config.get_cache_interval("disk_nvme0_temperature"), 900);
-        assert_eq!(config.get_cache_interval("smart_nvme0_wear_percent"), 900);
+
+        // Disk light (10s) - Service status
+        assert_eq!(config.get_cache_interval("service_nginx_status"), 10);
+
+        // Disk medium (60s) - Disk usage
+        assert_eq!(config.get_cache_interval("service_nginx_disk_gb"), 60);
+        assert_eq!(config.get_cache_interval("disk_tmp_usage_percent"), 60);
+
+        // Static (600s) - SMART data
+        assert_eq!(config.get_cache_interval("disk_nvme0_temperature"), 600);
+        assert_eq!(config.get_cache_interval("smart_nvme0_wear_percent"), 600);
    }
-}
+}
--- a/shared/src/error.rs
+++ b/shared/src/error.rs
@@ -4,10 +4,10 @@ use thiserror::Error;
 pub enum SharedError {
    #[error("Serialization error: {message}")]
    Serialization { message: String },
-    
+
    #[error("Invalid metric value: {message}")]
    InvalidMetric { message: String },
-    
+
    #[error("Protocol error: {message}")]
    Protocol { message: String },
 }
@@ -18,4 +18,4 @@ impl From<serde_json::Error> for SharedError {
            message: err.to_string(),
        }
    }
-}
+}
--- a/shared/src/lib.rs
+++ b/shared/src/lib.rs
@@ -6,4 +6,4 @@ pub mod protocol;
 pub use cache::*;
 pub use error::*;
 pub use metrics::*;
-pub use protocol::*;
+pub use protocol::*;
--- a/shared/src/metrics.rs
+++ b/shared/src/metrics.rs
@@ -1,5 +1,6 @@
-use serde::{Deserialize, Serialize};
 use chrono::Utc;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;

 /// Individual metric with value, status, and metadata
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -23,12 +24,12 @@ impl Metric {
            unit: None,
        }
    }
-    
+
    /// Consumes the metric and returns it with `description` set to
    /// `Some(description)`, so calls can be chained builder-style.
    pub fn with_description(self, description: String) -> Self {
        let mut metric = self;
        metric.description = Some(description);
        metric
    }
-    
+
    pub fn with_unit(mut self, unit: String) -> Self {
        self.unit = Some(unit);
        self
@@ -52,7 +53,7 @@ impl MetricValue {
            _ => None,
        }
    }
-    
+
    pub fn as_i64(&self) -> Option<i64> {
        match self {
            MetricValue::Integer(i) => Some(*i),
@@ -60,7 +61,7 @@ impl MetricValue {
            _ => None,
        }
    }
-    
+
    pub fn as_string(&self) -> String {
        match self {
            MetricValue::String(s) => s.clone(),
@@ -69,7 +70,7 @@ impl MetricValue {
            MetricValue::Boolean(b) => b.to_string(),
        }
    }
-    
+
    pub fn as_bool(&self) -> Option<bool> {
        match self {
            MetricValue::Boolean(b) => Some(*b),
@@ -100,6 +101,118 @@ impl Default for Status {
    }
 }

+/// Hysteresis thresholds for preventing status flapping.
+///
+/// Each severity level has a trigger threshold (`*_high`) and a separate
+/// recovery threshold (`*_low`). A status is entered when the value reaches
+/// the trigger threshold and is only left again once the value drops below
+/// the recovery threshold. The constructors derive each `*_low` as
+/// `*_high - gap`; the fields are public and are not validated.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HysteresisThresholds {
+    /// Warning trigger - status becomes warning when value >= this
+    /// (and the value is still below `critical_high`)
+    pub warning_high: f32,
+    /// Warning recovery - status returns to ok only when value < this
+    pub warning_low: f32,
+    /// Critical trigger - status becomes critical when value >= this
+    pub critical_high: f32,
+    /// Critical recovery - critical status is left only when value < this;
+    /// the result is then warning, or ok if the value is also < `warning_low`
+    pub critical_low: f32,
+}
+
+impl HysteresisThresholds {
+    /// Builds thresholds with the default recovery gap of 10% of each
+    /// trigger value.
+    ///
+    /// `warning_low` becomes `warning_high - warning_high * 0.1` and
+    /// `critical_low` becomes `critical_high - critical_high * 0.1`.
+    pub fn new(warning_high: f32, critical_high: f32) -> Self {
+        Self::with_custom_gaps(
+            warning_high,
+            warning_high * 0.1,
+            critical_high,
+            critical_high * 0.1,
+        )
+    }
+
+    /// Builds thresholds with explicit recovery gaps.
+    ///
+    /// Each recovery threshold is the matching trigger threshold minus its
+    /// gap. The gaps are used as given; no range checks are performed.
+    pub fn with_custom_gaps(
+        warning_high: f32,
+        warning_gap: f32,
+        critical_high: f32,
+        critical_gap: f32,
+    ) -> Self {
+        let warning_low = warning_high - warning_gap;
+        let critical_low = critical_high - critical_gap;
+
+        Self {
+            warning_high,
+            warning_low,
+            critical_high,
+            critical_low,
+        }
+    }
+
+    /// Calculate status with hysteresis based on current value and previous status.
+    ///
+    /// * From `Ok` or `Unknown` (first measurement): only the trigger
+    ///   thresholds apply - `Critical` at `critical_high`, `Warning` at
+    ///   `warning_high`, otherwise `Ok`.
+    /// * From `Warning`: escalates to `Critical` at `critical_high`, recovers
+    ///   to `Ok` only below `warning_low`, otherwise stays `Warning`.
+    /// * From `Critical`: stays `Critical` until the value is below
+    ///   `critical_low`; it then becomes `Ok` if the value is also below
+    ///   `warning_low`, otherwise `Warning`.
+    pub fn calculate_status(&self, value: f32, previous_status: Status) -> Status {
+        match previous_status {
+            // No elevated state to hold on to: plain trigger thresholds.
+            Status::Ok | Status::Unknown => self.status_from_triggers(value),
+
+            Status::Warning if value >= self.critical_high => Status::Critical,
+            Status::Warning if value < self.warning_low => Status::Ok,
+            Status::Warning => Status::Warning,
+
+            // Leaving critical requires dropping below the critical recovery
+            // threshold; how far it drops decides between warning and ok.
+            Status::Critical if value < self.critical_low && value < self.warning_low => {
+                Status::Ok
+            }
+            Status::Critical if value < self.critical_low => Status::Warning,
+            Status::Critical => Status::Critical,
+        }
+    }
+
+    /// Classifies `value` against the trigger thresholds only, ignoring the
+    /// recovery thresholds.
+    fn status_from_triggers(&self, value: f32) -> Status {
+        if value >= self.critical_high {
+            Status::Critical
+        } else if value >= self.warning_high {
+            Status::Warning
+        } else {
+            Status::Ok
+        }
+    }
+}
+
+/// Status tracker for hysteresis - tracks previous status per metric.
+///
+/// Metrics that have never been recorded are reported as `Status::Unknown`.
+#[derive(Debug, Default)]
+pub struct StatusTracker {
+    /// Most recently recorded status, keyed by metric name.
+    previous_statuses: HashMap<String, Status>,
+}
+
+impl StatusTracker {
+    /// Creates an empty tracker with no recorded statuses.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Get previous status for a metric.
+    ///
+    /// Returns `Status::Unknown` when nothing has been recorded for
+    /// `metric_name` yet.
+    pub fn get_previous_status(&self, metric_name: &str) -> Status {
+        self.previous_statuses
+            .get(metric_name)
+            .copied()
+            .unwrap_or(Status::Unknown)
+    }
+
+    /// Update status for a metric, replacing any previously recorded value.
+    pub fn update_status(&mut self, metric_name: String, status: Status) {
+        self.previous_statuses.insert(metric_name, status);
+    }
+
+    /// Calculate status with hysteresis.
+    ///
+    /// Looks up the status last recorded for `metric_name` (or
+    /// `Status::Unknown` on first sight), feeds it together with `value` into
+    /// `thresholds.calculate_status`, records the result and returns it.
+    pub fn calculate_with_hysteresis(
+        &mut self,
+        metric_name: &str,
+        value: f32,
+        thresholds: &HysteresisThresholds,
+    ) -> Status {
+        // Already-tracked metric: update the slot in place so the key is not
+        // re-allocated on every collection cycle.
+        if let Some(tracked) = self.previous_statuses.get_mut(metric_name) {
+            *tracked = thresholds.calculate_status(value, *tracked);
+            return *tracked;
+        }
+
+        // First measurement for this metric: allocate the key once.
+        let first_status = thresholds.calculate_status(value, Status::Unknown);
+        self.previous_statuses
+            .insert(metric_name.to_string(), first_status);
+        first_status
+    }
+}
+
 /// Metric name registry - constants for all metric names
 pub mod registry {
    // CPU metrics
@@ -109,7 +222,7 @@ pub mod registry {
    pub const CPU_TEMPERATURE_CELSIUS: &str = "cpu_temperature_celsius";
    pub const CPU_FREQUENCY_MHZ: &str = "cpu_frequency_mhz";
    pub const CPU_USAGE_PERCENT: &str = "cpu_usage_percent";
-    
+
    // Memory metrics
    pub const MEMORY_USAGE_PERCENT: &str = "memory_usage_percent";
    pub const MEMORY_TOTAL_GB: &str = "memory_total_gb";
@@ -117,7 +230,7 @@ pub mod registry {
    pub const MEMORY_AVAILABLE_GB: &str = "memory_available_gb";
    pub const MEMORY_SWAP_TOTAL_GB: &str = "memory_swap_total_gb";
    pub const MEMORY_SWAP_USED_GB: &str = "memory_swap_used_gb";
-    
+
    // Disk metrics (template - actual names include device)
    pub const DISK_USAGE_PERCENT_TEMPLATE: &str = "disk_{device}_usage_percent";
    pub const DISK_TEMPERATURE_CELSIUS_TEMPLATE: &str = "disk_{device}_temperature_celsius";
@@ -125,37 +238,37 @@ pub mod registry {
    pub const DISK_SPARE_PERCENT_TEMPLATE: &str = "disk_{device}_spare_percent";
    pub const DISK_HOURS_TEMPLATE: &str = "disk_{device}_hours";
    pub const DISK_CAPACITY_GB_TEMPLATE: &str = "disk_{device}_capacity_gb";
-    
+
    // Service metrics (template - actual names include service)
    pub const SERVICE_STATUS_TEMPLATE: &str = "service_{name}_status";
    pub const SERVICE_MEMORY_MB_TEMPLATE: &str = "service_{name}_memory_mb";
    pub const SERVICE_CPU_PERCENT_TEMPLATE: &str = "service_{name}_cpu_percent";
-    
+
    // Backup metrics
    pub const BACKUP_STATUS: &str = "backup_status";
    pub const BACKUP_LAST_RUN_TIMESTAMP: &str = "backup_last_run_timestamp";
    pub const BACKUP_SIZE_GB: &str = "backup_size_gb";
    pub const BACKUP_DURATION_MINUTES: &str = "backup_duration_minutes";
    pub const BACKUP_NEXT_SCHEDULED_TIMESTAMP: &str = "backup_next_scheduled_timestamp";
-    
+
    // Network metrics (template - actual names include interface)
    pub const NETWORK_RX_BYTES_TEMPLATE: &str = "network_{interface}_rx_bytes";
    pub const NETWORK_TX_BYTES_TEMPLATE: &str = "network_{interface}_tx_bytes";
    pub const NETWORK_RX_PACKETS_TEMPLATE: &str = "network_{interface}_rx_packets";
    pub const NETWORK_TX_PACKETS_TEMPLATE: &str = "network_{interface}_tx_packets";
-    
+
    /// Build a disk metric name by substituting `device` for every
    /// `{device}` placeholder in `template`.
    pub fn disk_metric(template: &str, device: &str) -> String {
        let segments: Vec<&str> = template.split("{device}").collect();
        segments.join(device)
    }
-    
+
    /// Build a service metric name by substituting `name` for every
    /// `{name}` placeholder in `template`.
    pub fn service_metric(template: &str, name: &str) -> String {
        let segments: Vec<&str> = template.split("{name}").collect();
        segments.join(name)
    }
-    
+
    /// Build a network metric name by substituting `interface` for every
    /// `{interface}` placeholder in `template`.
    pub fn network_metric(template: &str, interface: &str) -> String {
        let segments: Vec<&str> = template.split("{interface}").collect();
        segments.join(interface)
    }
-}
+}
--- a/shared/src/protocol.rs
+++ b/shared/src/protocol.rs
@@ -1,5 +1,5 @@
-use serde::{Deserialize, Serialize};
 use crate::metrics::Metric;
+use serde::{Deserialize, Serialize};

 /// Message sent from agent to dashboard via ZMQ
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -65,28 +65,28 @@ impl MessageEnvelope {
            payload: serde_json::to_vec(&message)?,
        })
    }
-    
+
    /// Wraps `command` in an envelope tagged `MessageType::Command`, with the
    /// command serialized to JSON bytes as the payload.
    ///
    /// Returns an error if JSON serialization fails.
    pub fn command(command: Command) -> Result<Self, crate::SharedError> {
        let payload = serde_json::to_vec(&command)?;
        Ok(Self {
            message_type: MessageType::Command,
            payload,
        })
    }
-    
+
    /// Wraps `response` in an envelope tagged `MessageType::CommandResponse`,
    /// with the response serialized to JSON bytes as the payload.
    ///
    /// Returns an error if JSON serialization fails.
    pub fn command_response(response: CommandResponse) -> Result<Self, crate::SharedError> {
        let payload = serde_json::to_vec(&response)?;
        Ok(Self {
            message_type: MessageType::CommandResponse,
            payload,
        })
    }
-    
+
    /// Builds an envelope tagged `MessageType::Heartbeat` with an empty payload.
    ///
    /// Nothing is serialized here, so this always returns `Ok`.
    pub fn heartbeat() -> Result<Self, crate::SharedError> {
        let envelope = Self {
            message_type: MessageType::Heartbeat,
            payload: Vec::new(),
        };
        Ok(envelope)
    }
-    
+
    pub fn decode_metrics(&self) -> Result<MetricMessage, crate::SharedError> {
        match self.message_type {
            MessageType::Metrics => Ok(serde_json::from_slice(&self.payload)?),
@@ -95,7 +95,7 @@ impl MessageEnvelope {
            }),
        }
    }
-    
+
    pub fn decode_command(&self) -> Result<Command, crate::SharedError> {
        match self.message_type {
            MessageType::Command => Ok(serde_json::from_slice(&self.payload)?),
@@ -104,7 +104,7 @@ impl MessageEnvelope {
            }),
        }
    }
-    
+
    pub fn decode_command_response(&self) -> Result<CommandResponse, crate::SharedError> {
        match self.message_type {
            MessageType::CommandResponse => Ok(serde_json::from_slice(&self.payload)?),
@@ -113,4 +113,4 @@ impl MessageEnvelope {
            }),
        }
    }
-}
+}