From 28896d0b1bef38ffd37f4aa689e80cd3a38a2edb Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Mon, 20 Oct 2025 11:12:15 +0200 Subject: [PATCH] Fix CPU load alerting to only trigger on 5-minute load average Only the 5-minute load average should trigger warning/critical alerts. 1-minute and 15-minute load averages now always show Status::Ok. Thresholds (Warning: 9.0, Critical: 10.0) apply only to cpu_load_5min metric. --- agent/src/collectors/cpu.rs | 169 ++++++++++++++++++++---------------- 1 file changed, 96 insertions(+), 73 deletions(-) diff --git a/agent/src/collectors/cpu.rs b/agent/src/collectors/cpu.rs index 3da4ee5..48852e0 100644 --- a/agent/src/collectors/cpu.rs +++ b/agent/src/collectors/cpu.rs @@ -1,13 +1,13 @@ use async_trait::async_trait; -use cm_dashboard_shared::{Metric, MetricValue, Status, registry}; +use cm_dashboard_shared::{registry, Metric, MetricValue, Status}; use tracing::debug; -use super::{Collector, CollectorError, utils}; +use super::{utils, Collector, CollectorError}; use crate::config::CpuConfig; /// Extremely efficient CPU metrics collector -/// +/// /// EFFICIENCY OPTIMIZATIONS: /// - Single /proc/loadavg read for all load metrics /// - Single /proc/stat read for CPU usage @@ -26,7 +26,7 @@ impl CpuCollector { name: "cpu".to_string(), } } - + /// Calculate CPU load status using configured thresholds fn calculate_load_status(&self, load: f32) -> Status { if load >= self.config.load_critical_threshold { @@ -37,7 +37,7 @@ impl CpuCollector { Status::Ok } } - + /// Calculate CPU temperature status using configured thresholds fn calculate_temperature_status(&self, temp: f32) -> Status { if temp >= self.config.temperature_critical_threshold { @@ -48,132 +48,150 @@ impl CpuCollector { Status::Ok } } - + /// Collect CPU load averages from /proc/loadavg /// Format: "0.52 0.58 0.59 1/257 12345" async fn collect_load_averages(&self) -> Result, CollectorError> { let content = utils::read_proc_file("/proc/loadavg")?; let parts: Vec<&str> = content.trim().split_whitespace().collect(); - + if parts.len() < 3 { return Err(CollectorError::Parse { value: content, error: "Expected at least 3 values in /proc/loadavg".to_string(), }); } - + let load_1min = utils::parse_f32(parts[0])?; let load_5min = utils::parse_f32(parts[1])?; let load_15min = utils::parse_f32(parts[2])?; - - // Calculate status for each load average (use 1min for primary status) - let load_1min_status = self.calculate_load_status(load_1min); - let load_5min_status = self.calculate_load_status(load_5min); - let load_15min_status = self.calculate_load_status(load_15min); - + + // Only apply thresholds to 5-minute load average + let load_1min_status = Status::Ok; // No alerting on 1min + let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts + let load_15min_status = Status::Ok; // No alerting on 15min + Ok(vec![ Metric::new( registry::CPU_LOAD_1MIN.to_string(), MetricValue::Float(load_1min), load_1min_status, - ).with_description("CPU load average over 1 minute".to_string()), - + ) + .with_description("CPU load average over 1 minute".to_string()), Metric::new( registry::CPU_LOAD_5MIN.to_string(), MetricValue::Float(load_5min), load_5min_status, - ).with_description("CPU load average over 5 minutes".to_string()), - + ) + .with_description("CPU load average over 5 minutes".to_string()), Metric::new( registry::CPU_LOAD_15MIN.to_string(), MetricValue::Float(load_15min), load_15min_status, - ).with_description("CPU load average over 15 minutes".to_string()), + ) + .with_description("CPU load average over 15 minutes".to_string()), ]) } - + /// Collect CPU temperature from thermal zones /// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior) async fn collect_temperature(&self) -> Result, CollectorError> { // Try x86_pkg_temp first (Intel CPU package temperature) - if let Ok(temp) = self.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp").await { + if let Ok(temp) = self + .read_thermal_zone("/sys/class/thermal/thermal_zone0/temp") + .await + { let temp_celsius = temp as f32 / 1000.0; let status = self.calculate_temperature_status(temp_celsius); - - return Ok(Some(Metric::new( - registry::CPU_TEMPERATURE_CELSIUS.to_string(), - MetricValue::Float(temp_celsius), - status, - ).with_description("CPU package temperature".to_string()) - .with_unit("°C".to_string()))); + + return Ok(Some( + Metric::new( + registry::CPU_TEMPERATURE_CELSIUS.to_string(), + MetricValue::Float(temp_celsius), + status, + ) + .with_description("CPU package temperature".to_string()) + .with_unit("°C".to_string()), + )); } - + // Fallback: try other thermal zones for zone_id in 0..10 { let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id); if let Ok(temp) = self.read_thermal_zone(&path).await { let temp_celsius = temp as f32 / 1000.0; let status = self.calculate_temperature_status(temp_celsius); - - return Ok(Some(Metric::new( - registry::CPU_TEMPERATURE_CELSIUS.to_string(), - MetricValue::Float(temp_celsius), - status, - ).with_description(format!("CPU temperature from thermal_zone{}", zone_id)) - .with_unit("°C".to_string()))); + + return Ok(Some( + Metric::new( + registry::CPU_TEMPERATURE_CELSIUS.to_string(), + MetricValue::Float(temp_celsius), + status, + ) + .with_description(format!("CPU temperature from thermal_zone{}", zone_id)) + .with_unit("°C".to_string()), + )); } } - + debug!("No CPU temperature sensors found"); Ok(None) } - + /// Read temperature from thermal zone efficiently async fn read_thermal_zone(&self, path: &str) -> Result { let content = utils::read_proc_file(path)?; utils::parse_u64(content.trim()) } - + /// Collect CPU frequency from /proc/cpuinfo or scaling governor async fn collect_frequency(&self) -> Result, CollectorError> { // Try scaling frequency first (more accurate for current frequency) - if let Ok(freq) = utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq") { + if let Ok(freq) = + utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq") + { if let Ok(freq_khz) = utils::parse_u64(freq.trim()) { let freq_mhz = freq_khz as f32 / 1000.0; - - return Ok(Some(Metric::new( - registry::CPU_FREQUENCY_MHZ.to_string(), - MetricValue::Float(freq_mhz), - Status::Ok, // Frequency doesn't have status thresholds - ).with_description("Current CPU frequency".to_string()) - .with_unit("MHz".to_string()))); + + return Ok(Some( + Metric::new( + registry::CPU_FREQUENCY_MHZ.to_string(), + MetricValue::Float(freq_mhz), + Status::Ok, // Frequency doesn't have status thresholds + ) + .with_description("Current CPU frequency".to_string()) + .with_unit("MHz".to_string()), + )); } } - + // Fallback: parse /proc/cpuinfo for base frequency if let Ok(content) = utils::read_proc_file("/proc/cpuinfo") { for line in content.lines() { if line.starts_with("cpu MHz") { if let Some(freq_str) = line.split(':').nth(1) { if let Ok(freq_mhz) = utils::parse_f32(freq_str) { - return Ok(Some(Metric::new( - registry::CPU_FREQUENCY_MHZ.to_string(), - MetricValue::Float(freq_mhz), - Status::Ok, - ).with_description("CPU base frequency from /proc/cpuinfo".to_string()) - .with_unit("MHz".to_string()))); + return Ok(Some( + Metric::new( + registry::CPU_FREQUENCY_MHZ.to_string(), + MetricValue::Float(freq_mhz), + Status::Ok, + ) + .with_description( + "CPU base frequency from /proc/cpuinfo".to_string(), + ) + .with_unit("MHz".to_string()), + )); } } break; // Only need first CPU entry } } } - + debug!("CPU frequency not available"); Ok(None) } - - } #[async_trait] @@ -181,43 +199,48 @@ impl Collector for CpuCollector { fn name(&self) -> &str { &self.name } - + async fn collect(&self) -> Result, CollectorError> { - debug!("Collecting CPU metrics"); let start = std::time::Instant::now(); - + let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency - + // Collect load averages (always available) metrics.extend(self.collect_load_averages().await?); - + // Collect temperature (optional) if let Some(temp_metric) = self.collect_temperature().await? { metrics.push(temp_metric); } - + // Collect frequency (optional) if let Some(freq_metric) = self.collect_frequency().await? { metrics.push(freq_metric); } - - + let duration = start.elapsed(); - debug!("CPU collection completed in {:?} with {} metrics", duration, metrics.len()); - + debug!( + "CPU collection completed in {:?} with {} metrics", + duration, + metrics.len() + ); + // Efficiency check: warn if collection takes too long if duration.as_millis() > 1 { - debug!("CPU collection took {}ms - consider optimization", duration.as_millis()); + debug!( + "CPU collection took {}ms - consider optimization", + duration.as_millis() + ); } - - // Store performance metrics + + // Store performance metrics // Performance tracking handled by cache system - + Ok(metrics) } - + fn get_performance_metrics(&self) -> Option { None // Performance tracking handled by cache system } -} \ No newline at end of file +}