Fix CPU load alerting to only trigger on 5-minute load average
Only the 5-minute load average should trigger warning/critical alerts. The 1-minute and 15-minute load averages now always report Status::Ok. The thresholds (Warning: 9.0, Critical: 10.0) apply only to the cpu_load_5min metric.
This commit is contained in:
parent
47a7d5ae62
commit
28896d0b1b
@ -1,13 +1,13 @@
|
|||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cm_dashboard_shared::{Metric, MetricValue, Status, registry};
|
use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
|
||||||
|
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
use super::{Collector, CollectorError, utils};
|
use super::{utils, Collector, CollectorError};
|
||||||
use crate::config::CpuConfig;
|
use crate::config::CpuConfig;
|
||||||
|
|
||||||
/// Extremely efficient CPU metrics collector
|
/// Extremely efficient CPU metrics collector
|
||||||
///
|
///
|
||||||
/// EFFICIENCY OPTIMIZATIONS:
|
/// EFFICIENCY OPTIMIZATIONS:
|
||||||
/// - Single /proc/loadavg read for all load metrics
|
/// - Single /proc/loadavg read for all load metrics
|
||||||
/// - Single /proc/stat read for CPU usage
|
/// - Single /proc/stat read for CPU usage
|
||||||
@ -26,7 +26,7 @@ impl CpuCollector {
|
|||||||
name: "cpu".to_string(),
|
name: "cpu".to_string(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate CPU load status using configured thresholds
|
/// Calculate CPU load status using configured thresholds
|
||||||
fn calculate_load_status(&self, load: f32) -> Status {
|
fn calculate_load_status(&self, load: f32) -> Status {
|
||||||
if load >= self.config.load_critical_threshold {
|
if load >= self.config.load_critical_threshold {
|
||||||
@ -37,7 +37,7 @@ impl CpuCollector {
|
|||||||
Status::Ok
|
Status::Ok
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculate CPU temperature status using configured thresholds
|
/// Calculate CPU temperature status using configured thresholds
|
||||||
fn calculate_temperature_status(&self, temp: f32) -> Status {
|
fn calculate_temperature_status(&self, temp: f32) -> Status {
|
||||||
if temp >= self.config.temperature_critical_threshold {
|
if temp >= self.config.temperature_critical_threshold {
|
||||||
@ -48,132 +48,150 @@ impl CpuCollector {
|
|||||||
Status::Ok
|
Status::Ok
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Collect CPU load averages from /proc/loadavg
|
/// Collect CPU load averages from /proc/loadavg
|
||||||
/// Format: "0.52 0.58 0.59 1/257 12345"
|
/// Format: "0.52 0.58 0.59 1/257 12345"
|
||||||
async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
|
async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||||
let content = utils::read_proc_file("/proc/loadavg")?;
|
let content = utils::read_proc_file("/proc/loadavg")?;
|
||||||
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
||||||
|
|
||||||
if parts.len() < 3 {
|
if parts.len() < 3 {
|
||||||
return Err(CollectorError::Parse {
|
return Err(CollectorError::Parse {
|
||||||
value: content,
|
value: content,
|
||||||
error: "Expected at least 3 values in /proc/loadavg".to_string(),
|
error: "Expected at least 3 values in /proc/loadavg".to_string(),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let load_1min = utils::parse_f32(parts[0])?;
|
let load_1min = utils::parse_f32(parts[0])?;
|
||||||
let load_5min = utils::parse_f32(parts[1])?;
|
let load_5min = utils::parse_f32(parts[1])?;
|
||||||
let load_15min = utils::parse_f32(parts[2])?;
|
let load_15min = utils::parse_f32(parts[2])?;
|
||||||
|
|
||||||
// Calculate status for each load average (use 1min for primary status)
|
// Only apply thresholds to 5-minute load average
|
||||||
let load_1min_status = self.calculate_load_status(load_1min);
|
let load_1min_status = Status::Ok; // No alerting on 1min
|
||||||
let load_5min_status = self.calculate_load_status(load_5min);
|
let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts
|
||||||
let load_15min_status = self.calculate_load_status(load_15min);
|
let load_15min_status = Status::Ok; // No alerting on 15min
|
||||||
|
|
||||||
Ok(vec![
|
Ok(vec![
|
||||||
Metric::new(
|
Metric::new(
|
||||||
registry::CPU_LOAD_1MIN.to_string(),
|
registry::CPU_LOAD_1MIN.to_string(),
|
||||||
MetricValue::Float(load_1min),
|
MetricValue::Float(load_1min),
|
||||||
load_1min_status,
|
load_1min_status,
|
||||||
).with_description("CPU load average over 1 minute".to_string()),
|
)
|
||||||
|
.with_description("CPU load average over 1 minute".to_string()),
|
||||||
Metric::new(
|
Metric::new(
|
||||||
registry::CPU_LOAD_5MIN.to_string(),
|
registry::CPU_LOAD_5MIN.to_string(),
|
||||||
MetricValue::Float(load_5min),
|
MetricValue::Float(load_5min),
|
||||||
load_5min_status,
|
load_5min_status,
|
||||||
).with_description("CPU load average over 5 minutes".to_string()),
|
)
|
||||||
|
.with_description("CPU load average over 5 minutes".to_string()),
|
||||||
Metric::new(
|
Metric::new(
|
||||||
registry::CPU_LOAD_15MIN.to_string(),
|
registry::CPU_LOAD_15MIN.to_string(),
|
||||||
MetricValue::Float(load_15min),
|
MetricValue::Float(load_15min),
|
||||||
load_15min_status,
|
load_15min_status,
|
||||||
).with_description("CPU load average over 15 minutes".to_string()),
|
)
|
||||||
|
.with_description("CPU load average over 15 minutes".to_string()),
|
||||||
])
|
])
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Collect CPU temperature from thermal zones
|
/// Collect CPU temperature from thermal zones
|
||||||
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
|
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
|
||||||
async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
|
async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
|
||||||
// Try x86_pkg_temp first (Intel CPU package temperature)
|
// Try x86_pkg_temp first (Intel CPU package temperature)
|
||||||
if let Ok(temp) = self.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp").await {
|
if let Ok(temp) = self
|
||||||
|
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
|
||||||
|
.await
|
||||||
|
{
|
||||||
let temp_celsius = temp as f32 / 1000.0;
|
let temp_celsius = temp as f32 / 1000.0;
|
||||||
let status = self.calculate_temperature_status(temp_celsius);
|
let status = self.calculate_temperature_status(temp_celsius);
|
||||||
|
|
||||||
return Ok(Some(Metric::new(
|
return Ok(Some(
|
||||||
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
Metric::new(
|
||||||
MetricValue::Float(temp_celsius),
|
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
||||||
status,
|
MetricValue::Float(temp_celsius),
|
||||||
).with_description("CPU package temperature".to_string())
|
status,
|
||||||
.with_unit("°C".to_string())));
|
)
|
||||||
|
.with_description("CPU package temperature".to_string())
|
||||||
|
.with_unit("°C".to_string()),
|
||||||
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback: try other thermal zones
|
// Fallback: try other thermal zones
|
||||||
for zone_id in 0..10 {
|
for zone_id in 0..10 {
|
||||||
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
||||||
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
||||||
let temp_celsius = temp as f32 / 1000.0;
|
let temp_celsius = temp as f32 / 1000.0;
|
||||||
let status = self.calculate_temperature_status(temp_celsius);
|
let status = self.calculate_temperature_status(temp_celsius);
|
||||||
|
|
||||||
return Ok(Some(Metric::new(
|
return Ok(Some(
|
||||||
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
Metric::new(
|
||||||
MetricValue::Float(temp_celsius),
|
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
||||||
status,
|
MetricValue::Float(temp_celsius),
|
||||||
).with_description(format!("CPU temperature from thermal_zone{}", zone_id))
|
status,
|
||||||
.with_unit("°C".to_string())));
|
)
|
||||||
|
.with_description(format!("CPU temperature from thermal_zone{}", zone_id))
|
||||||
|
.with_unit("°C".to_string()),
|
||||||
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("No CPU temperature sensors found");
|
debug!("No CPU temperature sensors found");
|
||||||
Ok(None)
|
Ok(None)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read temperature from thermal zone efficiently
|
/// Read temperature from thermal zone efficiently
|
||||||
async fn read_thermal_zone(&self, path: &str) -> Result<u64, CollectorError> {
|
async fn read_thermal_zone(&self, path: &str) -> Result<u64, CollectorError> {
|
||||||
let content = utils::read_proc_file(path)?;
|
let content = utils::read_proc_file(path)?;
|
||||||
utils::parse_u64(content.trim())
|
utils::parse_u64(content.trim())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Collect CPU frequency from /proc/cpuinfo or scaling governor
|
/// Collect CPU frequency from /proc/cpuinfo or scaling governor
|
||||||
async fn collect_frequency(&self) -> Result<Option<Metric>, CollectorError> {
|
async fn collect_frequency(&self) -> Result<Option<Metric>, CollectorError> {
|
||||||
// Try scaling frequency first (more accurate for current frequency)
|
// Try scaling frequency first (more accurate for current frequency)
|
||||||
if let Ok(freq) = utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq") {
|
if let Ok(freq) =
|
||||||
|
utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq")
|
||||||
|
{
|
||||||
if let Ok(freq_khz) = utils::parse_u64(freq.trim()) {
|
if let Ok(freq_khz) = utils::parse_u64(freq.trim()) {
|
||||||
let freq_mhz = freq_khz as f32 / 1000.0;
|
let freq_mhz = freq_khz as f32 / 1000.0;
|
||||||
|
|
||||||
return Ok(Some(Metric::new(
|
return Ok(Some(
|
||||||
registry::CPU_FREQUENCY_MHZ.to_string(),
|
Metric::new(
|
||||||
MetricValue::Float(freq_mhz),
|
registry::CPU_FREQUENCY_MHZ.to_string(),
|
||||||
Status::Ok, // Frequency doesn't have status thresholds
|
MetricValue::Float(freq_mhz),
|
||||||
).with_description("Current CPU frequency".to_string())
|
Status::Ok, // Frequency doesn't have status thresholds
|
||||||
.with_unit("MHz".to_string())));
|
)
|
||||||
|
.with_description("Current CPU frequency".to_string())
|
||||||
|
.with_unit("MHz".to_string()),
|
||||||
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback: parse /proc/cpuinfo for base frequency
|
// Fallback: parse /proc/cpuinfo for base frequency
|
||||||
if let Ok(content) = utils::read_proc_file("/proc/cpuinfo") {
|
if let Ok(content) = utils::read_proc_file("/proc/cpuinfo") {
|
||||||
for line in content.lines() {
|
for line in content.lines() {
|
||||||
if line.starts_with("cpu MHz") {
|
if line.starts_with("cpu MHz") {
|
||||||
if let Some(freq_str) = line.split(':').nth(1) {
|
if let Some(freq_str) = line.split(':').nth(1) {
|
||||||
if let Ok(freq_mhz) = utils::parse_f32(freq_str) {
|
if let Ok(freq_mhz) = utils::parse_f32(freq_str) {
|
||||||
return Ok(Some(Metric::new(
|
return Ok(Some(
|
||||||
registry::CPU_FREQUENCY_MHZ.to_string(),
|
Metric::new(
|
||||||
MetricValue::Float(freq_mhz),
|
registry::CPU_FREQUENCY_MHZ.to_string(),
|
||||||
Status::Ok,
|
MetricValue::Float(freq_mhz),
|
||||||
).with_description("CPU base frequency from /proc/cpuinfo".to_string())
|
Status::Ok,
|
||||||
.with_unit("MHz".to_string())));
|
)
|
||||||
|
.with_description(
|
||||||
|
"CPU base frequency from /proc/cpuinfo".to_string(),
|
||||||
|
)
|
||||||
|
.with_unit("MHz".to_string()),
|
||||||
|
));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break; // Only need first CPU entry
|
break; // Only need first CPU entry
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("CPU frequency not available");
|
debug!("CPU frequency not available");
|
||||||
Ok(None)
|
Ok(None)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
@ -181,43 +199,48 @@ impl Collector for CpuCollector {
|
|||||||
fn name(&self) -> &str {
|
fn name(&self) -> &str {
|
||||||
&self.name
|
&self.name
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||||
|
|
||||||
debug!("Collecting CPU metrics");
|
debug!("Collecting CPU metrics");
|
||||||
let start = std::time::Instant::now();
|
let start = std::time::Instant::now();
|
||||||
|
|
||||||
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
|
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
|
||||||
|
|
||||||
// Collect load averages (always available)
|
// Collect load averages (always available)
|
||||||
metrics.extend(self.collect_load_averages().await?);
|
metrics.extend(self.collect_load_averages().await?);
|
||||||
|
|
||||||
// Collect temperature (optional)
|
// Collect temperature (optional)
|
||||||
if let Some(temp_metric) = self.collect_temperature().await? {
|
if let Some(temp_metric) = self.collect_temperature().await? {
|
||||||
metrics.push(temp_metric);
|
metrics.push(temp_metric);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collect frequency (optional)
|
// Collect frequency (optional)
|
||||||
if let Some(freq_metric) = self.collect_frequency().await? {
|
if let Some(freq_metric) = self.collect_frequency().await? {
|
||||||
metrics.push(freq_metric);
|
metrics.push(freq_metric);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
let duration = start.elapsed();
|
let duration = start.elapsed();
|
||||||
debug!("CPU collection completed in {:?} with {} metrics", duration, metrics.len());
|
debug!(
|
||||||
|
"CPU collection completed in {:?} with {} metrics",
|
||||||
|
duration,
|
||||||
|
metrics.len()
|
||||||
|
);
|
||||||
|
|
||||||
// Efficiency check: warn if collection takes too long
|
// Efficiency check: warn if collection takes too long
|
||||||
if duration.as_millis() > 1 {
|
if duration.as_millis() > 1 {
|
||||||
debug!("CPU collection took {}ms - consider optimization", duration.as_millis());
|
debug!(
|
||||||
|
"CPU collection took {}ms - consider optimization",
|
||||||
|
duration.as_millis()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Store performance metrics
|
// Store performance metrics
|
||||||
// Performance tracking handled by cache system
|
// Performance tracking handled by cache system
|
||||||
|
|
||||||
Ok(metrics)
|
Ok(metrics)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
|
fn get_performance_metrics(&self) -> Option<super::PerformanceMetrics> {
|
||||||
None // Performance tracking handled by cache system
|
None // Performance tracking handled by cache system
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user