Fix CPU load alerting to only trigger on 5-minute load average

Only the 5-minute load average should trigger warning/critical alerts.
1-minute and 15-minute load averages now always show Status::Ok.

Thresholds (Warning: 9.0, Critical: 10.0) apply only to the cpu_load_5min metric.
This commit is contained in:
Christoffer Martinsson 2025-10-20 11:12:15 +02:00
parent 47a7d5ae62
commit 28896d0b1b

View File

@ -1,9 +1,9 @@
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
use tracing::debug;
use super::{Collector, CollectorError, utils};
use super::{utils, Collector, CollectorError};
use crate::config::CpuConfig;
/// Extremely efficient CPU metrics collector
@ -66,29 +66,30 @@ impl CpuCollector {
let load_5min = utils::parse_f32(parts[1])?;
let load_15min = utils::parse_f32(parts[2])?;
// Calculate status for each load average (use 1min for primary status)
let load_1min_status = self.calculate_load_status(load_1min);
let load_5min_status = self.calculate_load_status(load_5min);
let load_15min_status = self.calculate_load_status(load_15min);
// Only apply thresholds to 5-minute load average
let load_1min_status = Status::Ok; // No alerting on 1min
let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts
let load_15min_status = Status::Ok; // No alerting on 15min
Ok(vec![
Metric::new(
registry::CPU_LOAD_1MIN.to_string(),
MetricValue::Float(load_1min),
load_1min_status,
).with_description("CPU load average over 1 minute".to_string()),
)
.with_description("CPU load average over 1 minute".to_string()),
Metric::new(
registry::CPU_LOAD_5MIN.to_string(),
MetricValue::Float(load_5min),
load_5min_status,
).with_description("CPU load average over 5 minutes".to_string()),
)
.with_description("CPU load average over 5 minutes".to_string()),
Metric::new(
registry::CPU_LOAD_15MIN.to_string(),
MetricValue::Float(load_15min),
load_15min_status,
).with_description("CPU load average over 15 minutes".to_string()),
)
.with_description("CPU load average over 15 minutes".to_string()),
])
}
@ -96,16 +97,22 @@ impl CpuCollector {
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
// Try x86_pkg_temp first (Intel CPU package temperature)
if let Ok(temp) = self.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp").await {
if let Ok(temp) = self
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
.await
{
let temp_celsius = temp as f32 / 1000.0;
let status = self.calculate_temperature_status(temp_celsius);
return Ok(Some(Metric::new(
return Ok(Some(
Metric::new(
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
MetricValue::Float(temp_celsius),
status,
).with_description("CPU package temperature".to_string())
.with_unit("°C".to_string())));
)
.with_description("CPU package temperature".to_string())
.with_unit("°C".to_string()),
));
}
// Fallback: try other thermal zones
@ -115,12 +122,15 @@ impl CpuCollector {
let temp_celsius = temp as f32 / 1000.0;
let status = self.calculate_temperature_status(temp_celsius);
return Ok(Some(Metric::new(
return Ok(Some(
Metric::new(
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
MetricValue::Float(temp_celsius),
status,
).with_description(format!("CPU temperature from thermal_zone{}", zone_id))
.with_unit("°C".to_string())));
)
.with_description(format!("CPU temperature from thermal_zone{}", zone_id))
.with_unit("°C".to_string()),
));
}
}
@ -137,16 +147,21 @@ impl CpuCollector {
/// Collect CPU frequency from /proc/cpuinfo or scaling governor
async fn collect_frequency(&self) -> Result<Option<Metric>, CollectorError> {
// Try scaling frequency first (more accurate for current frequency)
if let Ok(freq) = utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq") {
if let Ok(freq) =
utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq")
{
if let Ok(freq_khz) = utils::parse_u64(freq.trim()) {
let freq_mhz = freq_khz as f32 / 1000.0;
return Ok(Some(Metric::new(
return Ok(Some(
Metric::new(
registry::CPU_FREQUENCY_MHZ.to_string(),
MetricValue::Float(freq_mhz),
Status::Ok, // Frequency doesn't have status thresholds
).with_description("Current CPU frequency".to_string())
.with_unit("MHz".to_string())));
)
.with_description("Current CPU frequency".to_string())
.with_unit("MHz".to_string()),
));
}
}
@ -156,12 +171,17 @@ impl CpuCollector {
if line.starts_with("cpu MHz") {
if let Some(freq_str) = line.split(':').nth(1) {
if let Ok(freq_mhz) = utils::parse_f32(freq_str) {
return Ok(Some(Metric::new(
return Ok(Some(
Metric::new(
registry::CPU_FREQUENCY_MHZ.to_string(),
MetricValue::Float(freq_mhz),
Status::Ok,
).with_description("CPU base frequency from /proc/cpuinfo".to_string())
.with_unit("MHz".to_string())));
)
.with_description(
"CPU base frequency from /proc/cpuinfo".to_string(),
)
.with_unit("MHz".to_string()),
));
}
}
break; // Only need first CPU entry
@ -172,8 +192,6 @@ impl CpuCollector {
debug!("CPU frequency not available");
Ok(None)
}
}
#[async_trait]
@ -183,7 +201,6 @@ impl Collector for CpuCollector {
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
debug!("Collecting CPU metrics");
let start = std::time::Instant::now();
@ -202,13 +219,19 @@ impl Collector for CpuCollector {
metrics.push(freq_metric);
}
let duration = start.elapsed();
debug!("CPU collection completed in {:?} with {} metrics", duration, metrics.len());
debug!(
"CPU collection completed in {:?} with {} metrics",
duration,
metrics.len()
);
// Efficiency check: warn if collection takes too long
if duration.as_millis() > 1 {
debug!("CPU collection took {}ms - consider optimization", duration.as_millis());
debug!(
"CPU collection took {}ms - consider optimization",
duration.as_millis()
);
}
// Store performance metrics