Implement hysteresis for metric status changes to prevent flapping
Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status};
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use crate::config::DiskConfig;
|
||||
use std::fs;
|
||||
@@ -28,11 +28,28 @@ struct MountedDisk {
|
||||
/// Disk usage collector for monitoring filesystem sizes
///
/// Also carries the hysteresis thresholds that are applied when a status is
/// derived for a disk temperature reading.
|
||||
pub struct DiskCollector {
|
||||
/// Disk collector configuration handed to `new`.
config: DiskConfig,
|
||||
/// Hysteresis thresholds passed to the status tracker for disk temperature readings.
temperature_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
impl DiskCollector {
|
||||
pub fn new(config: DiskConfig) -> Self {
|
||||
Self { config }
|
||||
// Create hysteresis thresholds for disk temperature
|
||||
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||
60.0, // warning at 60°C
|
||||
5.0, // 5°C gap for recovery
|
||||
70.0, // critical at 70°C
|
||||
5.0, // 5°C gap for recovery
|
||||
);
|
||||
|
||||
Self {
|
||||
config,
|
||||
temperature_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate disk temperature status using hysteresis thresholds
///
/// Thin wrapper that forwards `metric_name` and `temperature` to
/// `StatusTracker::calculate_with_hysteresis` together with this collector's
/// `temperature_thresholds`, and returns whatever status the tracker reports.
///
/// `status_tracker` is taken mutably because the call goes through `&mut StatusTracker`;
/// NOTE(review): presumably the tracker records the per-metric status under
/// `metric_name` so later calls can apply the recovery gap — confirm against
/// the `StatusTracker` implementation.
|
||||
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
|
||||
}
|
||||
|
||||
/// Resolve UUID to actual device path
|
||||
@@ -203,12 +220,6 @@ impl DiskCollector {
|
||||
Ok((total_bytes, used_bytes))
|
||||
}
|
||||
|
||||
/// Get root filesystem disk usage
///
/// Calls `get_filesystem_info("/")` and returns
/// `(total_bytes, used_bytes, usage_percent)`, where `usage_percent` is
/// `used_bytes / total_bytes * 100` computed in `f64` and then narrowed to `f32`.
///
/// # Errors
/// Propagates any error returned by `get_filesystem_info`.
///
/// NOTE(review): a total of 0 bytes is not guarded against; the floating-point
/// division would then yield NaN or infinity instead of an error.
|
||||
fn get_root_filesystem_usage(&self) -> Result<(u64, u64, f32)> {
|
||||
let (total_bytes, used_bytes) = self.get_filesystem_info("/")?;
|
||||
let usage_percent = (used_bytes as f64 / total_bytes as f64) * 100.0;
|
||||
Ok((total_bytes, used_bytes, usage_percent as f32))
|
||||
}
|
||||
|
||||
|
||||
/// Get the physical device for a given device (resolves symlinks, gets parent device)
|
||||
@@ -339,7 +350,7 @@ impl Collector for DiskCollector {
|
||||
"disk"
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Collecting multi-disk metrics");
|
||||
|
||||
@@ -497,13 +508,8 @@ impl Collector for DiskCollector {
|
||||
});
|
||||
|
||||
if temperature > 0.0 {
|
||||
let temp_status = if temperature >= 70.0 {
|
||||
Status::Critical
|
||||
} else if temperature >= 60.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
let metric_name = format!("disk_smart_{}_temperature", device_name);
|
||||
let temp_status = self.calculate_temperature_status(&metric_name, temperature, status_tracker);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_smart_{}_temperature", device_name),
|
||||
|
||||
Reference in New Issue
Block a user