Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions
--- a/agent/src/collectors/mod.rs
+++ b/agent/src/collectors/mod.rs
@@ -1,16 +1,7 @@
 use async_trait::async_trait;
-use cm_dashboard_shared::Metric;
+use cm_dashboard_shared::{Metric, StatusTracker};
 use std::time::Duration;

-pub mod cpu;
-pub mod memory;
-pub mod disk;
-pub mod systemd;
-pub mod backup;
-pub mod error;
-
-pub use error::CollectorError;
-
 /// Performance metrics for a collector
 #[derive(Debug, Clone)]
 pub struct PerformanceMetrics {
@@ -18,69 +9,78 @@ pub struct PerformanceMetrics {
    pub collection_efficiency_percent: f32,
 }

+pub mod backup;
+pub mod cpu;
+pub mod disk;
+pub mod error;
+pub mod memory;
+pub mod systemd;
+
+pub use error::CollectorError;
+
+
 /// Base trait for all collectors with extreme efficiency requirements
 #[async_trait]
 pub trait Collector: Send + Sync {
    /// Name of this collector
    fn name(&self) -> &str;
-    
+
    /// Collect all metrics this collector provides
-    async fn collect(&self) -> Result<Vec<Metric>, CollectorError>;
-    
+    async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError>;
+
    /// Get performance metrics for monitoring collector efficiency
    fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
        None
    }
+
 }

 /// CPU efficiency rules for all collectors
 pub mod efficiency {
-    /// CRITICAL: All collectors must follow these efficiency rules to minimize system impact
-    
-    /// 1. FILE READING RULES
-    /// - Read entire files in single syscall when possible
-    /// - Use BufReader only for very large files (>4KB)
-    /// - Never read files character by character
-    /// - Cache file descriptors when safe (immutable paths)
-    
-    /// 2. PARSING RULES  
-    /// - Use split() instead of regex for simple patterns
-    /// - Parse numbers with from_str() not complex parsing
-    /// - Avoid string allocations in hot paths
-    /// - Use str::trim() before parsing numbers
-    
-    /// 3. MEMORY ALLOCATION RULES
-    /// - Reuse Vec buffers when possible
-    /// - Pre-allocate collections with known sizes
-    /// - Use str slices instead of String when possible
-    /// - Avoid clone() in hot paths
-    
-    /// 4. SYSTEM CALL RULES
-    /// - Minimize syscalls - prefer single reads over multiple
-    /// - Use /proc filesystem efficiently
-    /// - Avoid spawning processes when /proc data available
-    /// - Cache static data (like CPU count)
-    
-    /// 5. ERROR HANDLING RULES
-    /// - Use Result<> but minimize allocation in error paths
-    /// - Log errors at debug level only to avoid I/O overhead
-    /// - Graceful degradation - missing metrics better than failing
-    /// - Never panic in collectors
-    
-    /// 6. CONCURRENCY RULES
-    /// - Collectors must be thread-safe but avoid locks
-    /// - Use atomic operations for simple counters
-    /// - Avoid shared mutable state between collections
-    /// - Each collection should be independent
-    
-    pub const PERFORMANCE_TARGET_OVERHEAD_PERCENT: f32 = 0.1;
+    //! CRITICAL: All collectors must follow these efficiency rules to minimize system impact
+    //!
+    //! # FILE READING RULES
+    //! - Read entire files in single syscall when possible
+    //! - Use BufReader only for very large files (>4KB)
+    //! - Never read files character by character
+    //! - Cache file descriptors when safe (immutable paths)
+    //!
+    //! # PARSING RULES  
+    //! - Use split() instead of regex for simple patterns
+    //! - Parse numbers with from_str() not complex parsing
+    //! - Avoid string allocations in hot paths
+    //! - Use str::trim() before parsing numbers
+    //!
+    //! # MEMORY ALLOCATION RULES
+    //! - Reuse Vec buffers when possible
+    //! - Pre-allocate collections with known sizes
+    //! - Use str slices instead of String when possible
+    //! - Avoid clone() in hot paths
+    //!
+    //! # SYSTEM CALL RULES
+    //! - Minimize syscalls - prefer single reads over multiple
+    //! - Use /proc filesystem efficiently
+    //! - Avoid spawning processes when /proc data available
+    //! - Cache static data (like CPU count)
+    //!
+    //! # ERROR HANDLING RULES
+    //! - Use Result<> but minimize allocation in error paths
+    //! - Log errors at debug level only to avoid I/O overhead
+    //! - Graceful degradation - missing metrics better than failing
+    //! - Never panic in collectors
+    //!
+    //! # CONCURRENCY RULES
+    //! - Collectors must be thread-safe but avoid locks
+    //! - Use atomic operations for simple counters
+    //! - Avoid shared mutable state between collections
+    //! - Each collection should be independent
 }

 /// Utility functions for efficient system data collection
 pub mod utils {
-    use std::fs;
    use super::CollectorError;
-    
+    use std::fs;
+
    /// Read entire file content efficiently
    pub fn read_proc_file(path: &str) -> Result<String, CollectorError> {
        fs::read_to_string(path).map_err(|e| CollectorError::SystemRead {
@@ -88,25 +88,25 @@ pub mod utils {
            error: e.to_string(),
        })
    }
-    
+
    /// Parse float from string slice efficiently  
    pub fn parse_f32(s: &str) -> Result<f32, CollectorError> {
-        s.trim().parse().map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
-            value: s.to_string(),
-            error: e.to_string(),
-        })
+        s.trim()
+            .parse()
+            .map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
+                value: s.to_string(),
+                error: e.to_string(),
+            })
    }
-    
+
    /// Parse integer from string slice efficiently
    pub fn parse_u64(s: &str) -> Result<u64, CollectorError> {
-        s.trim().parse().map_err(|e: std::num::ParseIntError| CollectorError::Parse {
-            value: s.to_string(),
-            error: e.to_string(),
-        })
+        s.trim()
+            .parse()
+            .map_err(|e: std::num::ParseIntError| CollectorError::Parse {
+                value: s.to_string(),
+                error: e.to_string(),
+            })
    }
-    
-    /// Split string and get nth element safely
-    pub fn split_nth<'a>(s: &'a str, delimiter: char, n: usize) -> Option<&'a str> {
-        s.split(delimiter).nth(n)
-    }
-}
+
+}