Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions
--- a/agent/src/collectors/cpu.rs
+++ b/agent/src/collectors/cpu.rs
@@ -1,5 +1,5 @@
 use async_trait::async_trait;
-use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
+use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};

 use tracing::debug;

@@ -17,41 +17,44 @@ use crate::config::CpuConfig;
 pub struct CpuCollector {
    config: CpuConfig,
    name: String,
+    load_thresholds: HysteresisThresholds,
+    temperature_thresholds: HysteresisThresholds,
 }

 impl CpuCollector {
    pub fn new(config: CpuConfig) -> Self {
+        // Create hysteresis thresholds with 10% gap for recovery
+        let load_thresholds = HysteresisThresholds::new(
+            config.load_warning_threshold,
+            config.load_critical_threshold,
+        );
+        
+        let temperature_thresholds = HysteresisThresholds::new(
+            config.temperature_warning_threshold,
+            config.temperature_critical_threshold,
+        );
+        
        Self {
            config,
            name: "cpu".to_string(),
+            load_thresholds,
+            temperature_thresholds,
        }
    }

-    /// Calculate CPU load status using configured thresholds
-    fn calculate_load_status(&self, load: f32) -> Status {
-        if load >= self.config.load_critical_threshold {
-            Status::Critical
-        } else if load >= self.config.load_warning_threshold {
-            Status::Warning
-        } else {
-            Status::Ok
-        }
+    /// Calculate CPU load status using hysteresis thresholds
+    fn calculate_load_status(&self, metric_name: &str, load: f32, status_tracker: &mut StatusTracker) -> Status {
+        status_tracker.calculate_with_hysteresis(metric_name, load, &self.load_thresholds)
    }

-    /// Calculate CPU temperature status using configured thresholds
-    fn calculate_temperature_status(&self, temp: f32) -> Status {
-        if temp >= self.config.temperature_critical_threshold {
-            Status::Critical
-        } else if temp >= self.config.temperature_warning_threshold {
-            Status::Warning
-        } else {
-            Status::Ok
-        }
+    /// Calculate CPU temperature status using hysteresis thresholds
+    fn calculate_temperature_status(&self, metric_name: &str, temp: f32, status_tracker: &mut StatusTracker) -> Status {
+        status_tracker.calculate_with_hysteresis(metric_name, temp, &self.temperature_thresholds)
    }

    /// Collect CPU load averages from /proc/loadavg
    /// Format: "0.52 0.58 0.59 1/257 12345"
-    async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
+    async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
        let content = utils::read_proc_file("/proc/loadavg")?;
        let parts: Vec<&str> = content.trim().split_whitespace().collect();

@@ -68,7 +71,7 @@ impl CpuCollector {

        // Only apply thresholds to 5-minute load average
        let load_1min_status = Status::Ok;  // No alerting on 1min
-        let load_5min_status = self.calculate_load_status(load_5min);  // Only 5min triggers alerts
+        let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker);  // Only 5min triggers alerts
        let load_15min_status = Status::Ok;  // No alerting on 15min

        Ok(vec![
@@ -95,14 +98,14 @@ impl CpuCollector {

    /// Collect CPU temperature from thermal zones
    /// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
-    async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
+    async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
        // Try x86_pkg_temp first (Intel CPU package temperature)
        if let Ok(temp) = self
            .read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
            .await
        {
            let temp_celsius = temp as f32 / 1000.0;
-            let status = self.calculate_temperature_status(temp_celsius);
+            let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);

            return Ok(Some(
                Metric::new(
@@ -120,7 +123,7 @@ impl CpuCollector {
            let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
            if let Ok(temp) = self.read_thermal_zone(&path).await {
                let temp_celsius = temp as f32 / 1000.0;
-                let status = self.calculate_temperature_status(temp_celsius);
+                let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);

                return Ok(Some(
                    Metric::new(
@@ -200,17 +203,17 @@ impl Collector for CpuCollector {
        &self.name
    }

-    async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
+    async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
        debug!("Collecting CPU metrics");
        let start = std::time::Instant::now();

        let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency

        // Collect load averages (always available)
-        metrics.extend(self.collect_load_averages().await?);
+        metrics.extend(self.collect_load_averages(status_tracker).await?);

        // Collect temperature (optional)
-        if let Some(temp_metric) = self.collect_temperature().await? {
+        if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
            metrics.push(temp_metric);
        }