Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near
threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to accept a mutable StatusTracker in collect()

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions

View File

@@ -1,5 +1,5 @@
use async_trait::async_trait;
use cm_dashboard_shared::{registry, Metric, MetricValue, Status};
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
use tracing::debug;
@@ -17,41 +17,44 @@ use crate::config::CpuConfig;
/// Collector for CPU metrics.
///
/// Holds its configuration together with hysteresis thresholds for load and
/// temperature that are built once in `new` from the configured
/// warning/critical values.
pub struct CpuCollector {
/// Configuration supplying the warning/critical threshold values.
config: CpuConfig,
/// Collector name; `new` sets it to "cpu".
name: String,
/// Hysteresis thresholds built from the configured load warning/critical values.
load_thresholds: HysteresisThresholds,
/// Hysteresis thresholds built from the configured temperature warning/critical values.
temperature_thresholds: HysteresisThresholds,
}
impl CpuCollector {
/// Build a CPU collector from `config`.
///
/// The load and temperature hysteresis thresholds are derived once here from
/// the configured warning/critical values so later status calculations can
/// reuse them. Any recovery gap is whatever `HysteresisThresholds::new`
/// applies by default (NOTE(review): the previous comment said 10% — confirm
/// that also holds for temperature).
pub fn new(config: CpuConfig) -> Self {
    Self {
        // The threshold fields come first: they only read values out of
        // `config`, which is moved into the struct by the last field.
        load_thresholds: HysteresisThresholds::new(
            config.load_warning_threshold,
            config.load_critical_threshold,
        ),
        temperature_thresholds: HysteresisThresholds::new(
            config.temperature_warning_threshold,
            config.temperature_critical_threshold,
        ),
        name: String::from("cpu"),
        config,
    }
}
/// Calculate CPU load status using configured thresholds
fn calculate_load_status(&self, load: f32) -> Status {
if load >= self.config.load_critical_threshold {
Status::Critical
} else if load >= self.config.load_warning_threshold {
Status::Warning
} else {
Status::Ok
}
/// Resolve the status for a CPU load reading.
///
/// Delegates to `status_tracker`, passing it the metric name, the reading and
/// this collector's load thresholds; the tracker's result is returned as-is.
fn calculate_load_status(
    &self,
    metric_name: &str,
    load: f32,
    status_tracker: &mut StatusTracker,
) -> Status {
    let thresholds = &self.load_thresholds;
    status_tracker.calculate_with_hysteresis(metric_name, load, thresholds)
}
/// Calculate CPU temperature status using configured thresholds
fn calculate_temperature_status(&self, temp: f32) -> Status {
if temp >= self.config.temperature_critical_threshold {
Status::Critical
} else if temp >= self.config.temperature_warning_threshold {
Status::Warning
} else {
Status::Ok
}
/// Resolve the status for a CPU temperature reading.
///
/// Delegates to `status_tracker`, passing it the metric name, the reading and
/// this collector's temperature thresholds; the tracker's result is returned
/// as-is.
fn calculate_temperature_status(
    &self,
    metric_name: &str,
    temp: f32,
    status_tracker: &mut StatusTracker,
) -> Status {
    let thresholds = &self.temperature_thresholds;
    status_tracker.calculate_with_hysteresis(metric_name, temp, thresholds)
}
/// Collect CPU load averages from /proc/loadavg
/// Format: "0.52 0.58 0.59 1/257 12345"
async fn collect_load_averages(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let content = utils::read_proc_file("/proc/loadavg")?;
let parts: Vec<&str> = content.trim().split_whitespace().collect();
@@ -68,7 +71,7 @@ impl CpuCollector {
// Only apply thresholds to 5-minute load average
let load_1min_status = Status::Ok; // No alerting on 1min
let load_5min_status = self.calculate_load_status(load_5min); // Only 5min triggers alerts
let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker); // Only 5min triggers alerts
let load_15min_status = Status::Ok; // No alerting on 15min
Ok(vec![
@@ -95,14 +98,14 @@ impl CpuCollector {
/// Collect CPU temperature from thermal zones
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
async fn collect_temperature(&self) -> Result<Option<Metric>, CollectorError> {
async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
// Try x86_pkg_temp first (Intel CPU package temperature)
if let Ok(temp) = self
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
.await
{
let temp_celsius = temp as f32 / 1000.0;
let status = self.calculate_temperature_status(temp_celsius);
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
return Ok(Some(
Metric::new(
@@ -120,7 +123,7 @@ impl CpuCollector {
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
if let Ok(temp) = self.read_thermal_zone(&path).await {
let temp_celsius = temp as f32 / 1000.0;
let status = self.calculate_temperature_status(temp_celsius);
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
return Ok(Some(
Metric::new(
@@ -200,17 +203,17 @@ impl Collector for CpuCollector {
&self.name
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
debug!("Collecting CPU metrics");
let start = std::time::Instant::now();
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
// Collect load averages (always available)
metrics.extend(self.collect_load_averages().await?);
metrics.extend(self.collect_load_averages(status_tracker).await?);
// Collect temperature (optional)
if let Some(temp_metric) = self.collect_temperature().await? {
if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
metrics.push(temp_metric);
}