Implement hysteresis for metric status changes to prevent flapping
Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
@@ -17,41 +17,44 @@ use crate::config::CpuConfig;
|
||||
/// Collector state for CPU metrics: the CPU configuration, the collector's
/// name, and one set of hysteresis thresholds each for load and temperature.
pub struct CpuCollector {
|
||||
// Configuration this collector was constructed with.
config: CpuConfig,
|
||||
// Collector name; set to "cpu" by `new`.
name: String,
|
||||
// Thresholds passed to the status tracker when evaluating CPU load.
load_thresholds: HysteresisThresholds,
|
||||
// Thresholds passed to the status tracker when evaluating CPU temperature.
temperature_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
impl CpuCollector {
|
||||
/// Build a `CpuCollector` from `config`.
///
/// Derives the load and temperature `HysteresisThresholds` from the
/// warning/critical values in `config`, stores `config` itself, and names
/// the collector "cpu".
pub fn new(config: CpuConfig) -> Self {
|
||||
// Load thresholds are built from the configured warning/critical limits.
// NOTE(review): no recovery gap is passed here; the 10% gap is presumably
// a default applied inside `HysteresisThresholds::new` — confirm.
|
||||
let load_thresholds = HysteresisThresholds::new(
|
||||
config.load_warning_threshold,
|
||||
config.load_critical_threshold,
|
||||
);
|
||||
|
||||
// Temperature thresholds are built the same way from the temperature limits.
let temperature_thresholds = HysteresisThresholds::new(
|
||||
config.temperature_warning_threshold,
|
||||
config.temperature_critical_threshold,
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
name: "cpu".to_string(),
|
||||
load_thresholds,
|
||||
|
||||
temperature_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate CPU load status using configured thresholds
|
||||
fn calculate_load_status(&self, load: f32) -> Status {
|
||||
if load >= self.config.load_critical_threshold {
|
||||
Status::Critical
|
||||
} else if load >= self.config.load_warning_threshold {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
/// Calculate CPU load status using hysteresis thresholds.
///
/// Delegates to `calculate_with_hysteresis` on `status_tracker`, passing
/// `metric_name`, the `load` value, and this collector's `load_thresholds`,
/// and returns the resulting `Status` unchanged.
///
/// NOTE(review): `status_tracker` is taken mutably, presumably so it can
/// record the previous status per metric name — confirm against `StatusTracker`.
|
||||
fn calculate_load_status(&self, metric_name: &str, load: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, load, &self.load_thresholds)
|
||||
}
|
||||
|
||||
/// Calculate CPU temperature status using configured thresholds
|
||||
fn calculate_temperature_status(&self, temp: f32) -> Status {
|
||||
if temp >= self.config.temperature_critical_threshold {
|
||||
Status::Critical
|
||||
} else if temp >= self.config.temperature_warning_threshold {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
/// Calculate CPU temperature status using hysteresis thresholds.
///
/// Delegates to `calculate_with_hysteresis` on `status_tracker`, passing
/// `metric_name`, the `temp` value, and this collector's
/// `temperature_thresholds`, and returns the resulting `Status` unchanged.
///
/// NOTE(review): `temp` is presumably in degrees Celsius (callers divide the
/// raw thermal-zone reading by 1000.0) — confirm the configured thresholds
/// use the same unit.
|
||||
fn calculate_temperature_status(&self, metric_name: &str, temp: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, temp, &self.temperature_thresholds)
|
||||
}
|
||||
|
||||
/// Collect CPU load averages from /proc/loadavg
|
||||
/// Format: "0.52 0.58 0.59 1/257 12345"
|
||||
async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/loadavg")?;
|
||||
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
||||
|
||||
@@ -68,7 +71,7 @@ impl CpuCollector {
|
||||
|
||||
// Only apply thresholds to 5-minute load average
|
||||
let load_1min_status = Status::Ok; // No alerting on 1min
|
||||
let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts
|
||||
let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker); // Only 5min triggers alerts
|
||||
let load_15min_status = Status::Ok; // No alerting on 15min
|
||||
|
||||
Ok(vec![
|
||||
@@ -95,14 +98,14 @@ impl CpuCollector {
|
||||
|
||||
/// Collect CPU temperature from thermal zones
|
||||
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
|
||||
async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
|
||||
async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
|
||||
// Try x86_pkg_temp first (Intel CPU package temperature)
|
||||
if let Ok(temp) = self
|
||||
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
|
||||
.await
|
||||
{
|
||||
let temp_celsius = temp as f32 / 1000.0;
|
||||
let status = self.calculate_temperature_status(temp_celsius);
|
||||
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
@@ -120,7 +123,7 @@ impl CpuCollector {
|
||||
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
||||
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
||||
let temp_celsius = temp as f32 / 1000.0;
|
||||
let status = self.calculate_temperature_status(temp_celsius);
|
||||
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
@@ -200,17 +203,17 @@ impl Collector for CpuCollector {
|
||||
&self.name
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
debug!("Collecting CPU metrics");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
|
||||
|
||||
// Collect load averages (always available)
|
||||
metrics.extend(self.collect_load_averages().await?);
|
||||
metrics.extend(self.collect_load_averages(status_tracker).await?);
|
||||
|
||||
// Collect temperature (optional)
|
||||
if let Some(temp_metric) = self.collect_temperature().await? {
|
||||
if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
|
||||
metrics.push(temp_metric);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user