Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near
threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions

View File

@@ -1,6 +1,6 @@
use anyhow::Result;
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status};
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
use crate::config::DiskConfig;
use std::fs;
@@ -28,11 +28,28 @@ struct MountedDisk {
/// Disk usage collector for monitoring filesystem sizes
///
/// Also holds the hysteresis thresholds used when deriving disk temperature status.
pub struct DiskCollector {
/// Collector configuration, stored as passed to `DiskCollector::new`.
config: DiskConfig,
/// Thresholds handed to `StatusTracker::calculate_with_hysteresis` when
/// computing the status of a disk temperature reading.
temperature_thresholds: HysteresisThresholds,
}
impl DiskCollector {
/// Create a disk collector from `config`.
///
/// Builds the disk temperature hysteresis thresholds with
/// `HysteresisThresholds::with_custom_gaps`: warning at 60°C and critical at
/// 70°C, each with a 5°C recovery gap.
pub fn new(config: DiskConfig) -> Self {
// NOTE(review): the bare `Self { config }` line below appears to be the
// pre-change body left in by the diff rendering — confirm it is not part of
// the final function, which returns the two-field struct literal further down.
Self { config }
// Create hysteresis thresholds for disk temperature
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
60.0, // warning at 60°C
5.0, // 5°C gap for recovery
70.0, // critical at 70°C
5.0, // 5°C gap for recovery
);
// Store the configuration together with the thresholds built above.
Self {
config,
temperature_thresholds,
}
}
/// Derive the status of a disk temperature reading, applying hysteresis.
///
/// Delegates to `StatusTracker::calculate_with_hysteresis`, passing
/// `metric_name`, the `temperature` reading and this collector's
/// `temperature_thresholds`; the tracker's result is returned unchanged.
fn calculate_temperature_status(
    &self,
    metric_name: &str,
    temperature: f32,
    status_tracker: &mut StatusTracker,
) -> Status {
    let thresholds = &self.temperature_thresholds;
    status_tracker.calculate_with_hysteresis(metric_name, temperature, thresholds)
}
/// Resolve UUID to actual device path
@@ -203,12 +220,6 @@ impl DiskCollector {
Ok((total_bytes, used_bytes))
}
/// Get root filesystem disk usage
///
/// Returns `(total_bytes, used_bytes, usage_percent)` for the filesystem
/// mounted at `/`, where `usage_percent` is `used_bytes / total_bytes * 100`.
///
/// # Errors
///
/// Propagates any error returned by `get_filesystem_info("/")`.
fn get_root_filesystem_usage(&self) -> Result<(u64, u64, f32)> {
    let (total_bytes, used_bytes) = self.get_filesystem_info("/")?;
    // A reported size of zero would make the division yield NaN or infinity;
    // report 0% usage instead of leaking a non-finite value to callers.
    let usage_percent = if total_bytes == 0 {
        0.0
    } else {
        (used_bytes as f64 / total_bytes as f64) * 100.0
    };
    Ok((total_bytes, used_bytes, usage_percent as f32))
}
/// Get the physical device for a given device (resolves symlinks, gets parent device)
@@ -339,7 +350,7 @@ impl Collector for DiskCollector {
"disk"
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let start_time = Instant::now();
debug!("Collecting multi-disk metrics");
@@ -497,13 +508,8 @@ impl Collector for DiskCollector {
});
if temperature > 0.0 {
let temp_status = if temperature >= 70.0 {
Status::Critical
} else if temperature >= 60.0 {
Status::Warning
} else {
Status::Ok
};
let metric_name = format!("disk_smart_{}_temperature", device_name);
let temp_status = self.calculate_temperature_status(&metric_name, temperature, status_tracker);
metrics.push(Metric {
name: format!("disk_smart_{}_temperature", device_name),