Fix CPU load alerting to only trigger on 5-minute load average

Only the 5-minute load average should trigger warning/critical alerts. 1-minute and 15-minute load averages now always show Status::Ok. Thresholds (Warning: 9.0, Critical: 10.0) apply only to cpu_load_5min metric.
2025-10-20 11:12:15 +02:00 · 2025-10-20 11:12:15 +02:00 · 28896d0b1b
commit 28896d0b1b
parent 47a7d5ae62
1 changed files with 96 additions and 73 deletions
--- a/agent/src/collectors/cpu.rs
+++ b/agent/src/collectors/cpu.rs
@ -1,13 +1,13 @@
 use async_trait::async_trait;
-use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
+use cm_dashboard_shared::{registry, Metric, MetricValue, Status};

 use tracing::debug;

-use super::{Collector, CollectorError, utils};
+use super::{utils, Collector, CollectorError};
 use crate::config::CpuConfig;

 /// Extremely efficient CPU metrics collector
-/// 
+///
 /// EFFICIENCY OPTIMIZATIONS:
 /// - Single /proc/loadavg read for all load metrics
 /// - Single /proc/stat read for CPU usage
@ -26,7 +26,7 @@ impl CpuCollector {
            name: "cpu".to_string(),
        }
    }
-    
+
    /// Calculate CPU load status using configured thresholds
    fn calculate_load_status(&self, load: f32) -> Status {
        if load >= self.config.load_critical_threshold {
@ -37,7 +37,7 @@ impl CpuCollector {
            Status::Ok
        }
    }
-    
+
    /// Calculate CPU temperature status using configured thresholds
    fn calculate_temperature_status(&self, temp: f32) -> Status {
        if temp >= self.config.temperature_critical_threshold {
@ -48,132 +48,150 @@ impl CpuCollector {
            Status::Ok
        }
    }
-    
+
    /// Collect CPU load averages from /proc/loadavg
    /// Format: "0.52 0.58 0.59 1/257 12345"
    async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
        let content = utils::read_proc_file("/proc/loadavg")?;
        let parts: Vec<&str> = content.trim().split_whitespace().collect();
-        
+
        if parts.len() < 3 {
            return Err(CollectorError::Parse {
                value: content,
                error: "Expected at least 3 values in /proc/loadavg".to_string(),
            });
        }
-        
+
        let load_1min = utils::parse_f32(parts[0])?;
        let load_5min = utils::parse_f32(parts[1])?;
        let load_15min = utils::parse_f32(parts[2])?;
-        
-        // Calculate status for each load average (use 1min for primary status)
-        let load_1min_status = self.calculate_load_status(load_1min);
-        let load_5min_status = self.calculate_load_status(load_5min);
-        let load_15min_status = self.calculate_load_status(load_15min);
-        
+
+        // Only apply thresholds to 5-minute load average
+        let load_1min_status = Status::Ok;  // No alerting on 1min
+        let load_5min_status = self.calculate_load_status(load_5min);  // Only 5min triggers alerts
+        let load_15min_status = Status::Ok;  // No alerting on 15min
+
        Ok(vec![
            Metric::new(
                registry::CPU_LOAD_1MIN.to_string(),
                MetricValue::Float(load_1min),
                load_1min_status,
-            ).with_description("CPU load average over 1 minute".to_string()),
-            
+            )
+            .with_description("CPU load average over 1 minute".to_string()),
            Metric::new(
                registry::CPU_LOAD_5MIN.to_string(),
                MetricValue::Float(load_5min),
                load_5min_status,
-            ).with_description("CPU load average over 5 minutes".to_string()),
-            
+            )
+            .with_description("CPU load average over 5 minutes".to_string()),
            Metric::new(
                registry::CPU_LOAD_15MIN.to_string(),
                MetricValue::Float(load_15min),
                load_15min_status,
-            ).with_description("CPU load average over 15 minutes".to_string()),
+            )
+            .with_description("CPU load average over 15 minutes".to_string()),
        ])
    }
-    
+
    /// Collect CPU temperature from thermal zones
    /// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
    async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
        // Try x86_pkg_temp first (Intel CPU package temperature)
-        if let Ok(temp) = self.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp").await {
+        if let Ok(temp) = self
+            .read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
+            .await
+        {
            let temp_celsius = temp as f32 / 1000.0;
            let status = self.calculate_temperature_status(temp_celsius);
-            
-            return Ok(Some(Metric::new(
-                registry::CPU_TEMPERATURE_CELSIUS.to_string(),
-                MetricValue::Float(temp_celsius),
-                status,
-            ).with_description("CPU package temperature".to_string())
-            .with_unit("°C".to_string())));
+
+            return Ok(Some(
+                Metric::new(
+                    registry::CPU_TEMPERATURE_CELSIUS.to_string(),
+                    MetricValue::Float(temp_celsius),
+                    status,
+                )
+                .with_description("CPU package temperature".to_string())
+                .with_unit("°C".to_string()),
+            ));
        }
-        
+
        // Fallback: try other thermal zones
        for zone_id in 0..10 {
            let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
            if let Ok(temp) = self.read_thermal_zone(&path).await {
                let temp_celsius = temp as f32 / 1000.0;
                let status = self.calculate_temperature_status(temp_celsius);
-                
-                return Ok(Some(Metric::new(
-                    registry::CPU_TEMPERATURE_CELSIUS.to_string(),
-                    MetricValue::Float(temp_celsius),
-                    status,
-                ).with_description(format!("CPU temperature from thermal_zone{}", zone_id))
-                .with_unit("°C".to_string())));
+
+                return Ok(Some(
+                    Metric::new(
+                        registry::CPU_TEMPERATURE_CELSIUS.to_string(),
+                        MetricValue::Float(temp_celsius),
+                        status,
+                    )
+                    .with_description(format!("CPU temperature from thermal_zone{}", zone_id))
+                    .with_unit("°C".to_string()),
+                ));
            }
        }
-        
+
        debug!("No CPU temperature sensors found");
        Ok(None)
    }
-    
+
    /// Read temperature from thermal zone efficiently
    async fn read_thermal_zone(&self, path: &str) -> Result<u64, CollectorError> {
        let content = utils::read_proc_file(path)?;
        utils::parse_u64(content.trim())
    }
-    
+
    /// Collect CPU frequency from /proc/cpuinfo or scaling governor
    async fn collect_frequency(&self) -> Result<Option<Metric>, CollectorError> {
        // Try scaling frequency first (more accurate for current frequency)
-        if let Ok(freq) = utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq") {
+        if let Ok(freq) =
+            utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq")
+        {
            if let Ok(freq_khz) = utils::parse_u64(freq.trim()) {
                let freq_mhz = freq_khz as f32 / 1000.0;
-                
-                return Ok(Some(Metric::new(
-                    registry::CPU_FREQUENCY_MHZ.to_string(),
-                    MetricValue::Float(freq_mhz),
-                    Status::Ok, // Frequency doesn't have status thresholds
-                ).with_description("Current CPU frequency".to_string())
-                .with_unit("MHz".to_string())));
+
+                return Ok(Some(
+                    Metric::new(
+                        registry::CPU_FREQUENCY_MHZ.to_string(),
+                        MetricValue::Float(freq_mhz),
+                        Status::Ok, // Frequency doesn't have status thresholds
+                    )
+                    .with_description("Current CPU frequency".to_string())
+                    .with_unit("MHz".to_string()),
+                ));
            }
        }
-        
+
        // Fallback: parse /proc/cpuinfo for base frequency
        if let Ok(content) = utils::read_proc_file("/proc/cpuinfo") {
            for line in content.lines() {
                if line.starts_with("cpu MHz") {
                    if let Some(freq_str) = line.split(':').nth(1) {
                        if let Ok(freq_mhz) = utils::parse_f32(freq_str) {
-                            return Ok(Some(Metric::new(
-                                registry::CPU_FREQUENCY_MHZ.to_string(),
-                                MetricValue::Float(freq_mhz),
-                                Status::Ok,
-                            ).with_description("CPU base frequency from /proc/cpuinfo".to_string())
-                            .with_unit("MHz".to_string())));
+                            return Ok(Some(
+                                Metric::new(
+                                    registry::CPU_FREQUENCY_MHZ.to_string(),
+                                    MetricValue::Float(freq_mhz),
+                                    Status::Ok,
+                                )
+                                .with_description(
+                                    "CPU base frequency from /proc/cpuinfo".to_string(),
+                                )
+                                .with_unit("MHz".to_string()),
+                            ));
                        }
                    }
                    break; // Only need first CPU entry
                }
            }
        }
-        
+
        debug!("CPU frequency not available");
        Ok(None)
    }
-    
-    
 }

 #[async_trait]
@ -181,43 +199,48 @@ impl Collector for CpuCollector {
    fn name(&self) -> &str {
        &self.name
    }
-    
+
    async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
-        
        debug!("Collecting CPU metrics");
        let start = std::time::Instant::now();
-        
+
        let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
-        
+
        // Collect load averages (always available)
        metrics.extend(self.collect_load_averages().await?);
-        
+
        // Collect temperature (optional)
        if let Some(temp_metric) = self.collect_temperature().await? {
            metrics.push(temp_metric);
        }
-        
+
        // Collect frequency (optional)
        if let Some(freq_metric) = self.collect_frequency().await? {
            metrics.push(freq_metric);
        }
-        
-        
+
        let duration = start.elapsed();
-        debug!("CPU collection completed in {:?} with {} metrics", duration, metrics.len());
-        
+        debug!(
+            "CPU collection completed in {:?} with {} metrics",
+            duration,
+            metrics.len()
+        );
+
        // Efficiency check: warn if collection takes too long
        if duration.as_millis() > 1 {
-            debug!("CPU collection took {}ms - consider optimization", duration.as_millis());
+            debug!(
+                "CPU collection took {}ms - consider optimization",
+                duration.as_millis()
+            );
        }
-        
-        // Store performance metrics  
+
+        // Store performance metrics
        // Performance tracking handled by cache system
-        
+
        Ok(metrics)
    }
-    
+
    fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
        None // Performance tracking handled by cache system
    }
-}
+}