Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near
threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions

View File

@@ -1,5 +1,6 @@
use serde::{Deserialize, Serialize};
use chrono::Utc;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Individual metric with value, status, and metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -23,12 +24,12 @@ impl Metric {
unit: None,
}
}
pub fn with_description(mut self, description: String) -> Self {
self.description = Some(description);
self
}
pub fn with_unit(mut self, unit: String) -> Self {
self.unit = Some(unit);
self
@@ -52,7 +53,7 @@ impl MetricValue {
_ => None,
}
}
pub fn as_i64(&self) -> Option<i64> {
match self {
MetricValue::Integer(i) => Some(*i),
@@ -60,7 +61,7 @@ impl MetricValue {
_ => None,
}
}
pub fn as_string(&self) -> String {
match self {
MetricValue::String(s) => s.clone(),
@@ -69,7 +70,7 @@ impl MetricValue {
MetricValue::Boolean(b) => b.to_string(),
}
}
pub fn as_bool(&self) -> Option<bool> {
match self {
MetricValue::Boolean(b) => Some(*b),
@@ -100,6 +101,118 @@ impl Default for Status {
}
}
/// Hysteresis thresholds for preventing status flapping.
///
/// Each severity level has a trigger point (`*_high`) and a separate recovery
/// point (`*_low`). `calculate_status` raises the status when a value reaches a
/// trigger point and only lowers it again once the value drops below the
/// matching recovery point, so values hovering around a trigger point do not
/// toggle the status on every sample.
///
/// NOTE(review): the fields are plain `f32` values; the unit (percent, degrees,
/// load average, ...) is presumably whatever the calling collector measures —
/// confirm against the callers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HysteresisThresholds {
/// Warning trigger: a metric that is not yet `Warning` becomes `Warning`
/// when value >= this (and value is below `critical_high`).
pub warning_high: f32,
/// Warning recovery: a `Warning` (or recovering `Critical`) metric returns
/// to `Ok` only when value < this.
pub warning_low: f32,
/// Critical trigger: a metric becomes `Critical` when value >= this.
pub critical_high: f32,
/// Critical recovery: a `Critical` metric leaves `Critical` only when
/// value < this.
pub critical_low: f32,
}
impl HysteresisThresholds {
    /// Builds thresholds whose recovery points sit 10% below each trigger point.
    ///
    /// `warning_low` is `warning_high - warning_high * 0.1` and `critical_low`
    /// is `critical_high - critical_high * 0.1`.
    pub fn new(warning_high: f32, critical_high: f32) -> Self {
        Self::with_custom_gaps(
            warning_high,
            warning_high * 0.1,
            critical_high,
            critical_high * 0.1,
        )
    }

    /// Builds thresholds with an explicit gap between each trigger point and
    /// its recovery point (`*_low = *_high - *_gap`).
    pub fn with_custom_gaps(warning_high: f32, warning_gap: f32, critical_high: f32, critical_gap: f32) -> Self {
        let warning_low = warning_high - warning_gap;
        let critical_low = critical_high - critical_gap;
        Self {
            warning_high,
            warning_low,
            critical_high,
            critical_low,
        }
    }

    /// Calculate status with hysteresis based on current value and previous status.
    ///
    /// * From `Ok` or `Unknown` (first measurement) the plain trigger points
    ///   apply: `>= critical_high` is `Critical`, `>= warning_high` is
    ///   `Warning`, anything else is `Ok`.
    /// * From `Warning` the status escalates at `critical_high` and only
    ///   clears to `Ok` once the value is below `warning_low`.
    /// * From `Critical` the status is held until the value is below
    ///   `critical_low`; it then drops to `Ok` if the value is also below
    ///   `warning_low`, otherwise to `Warning`.
    pub fn calculate_status(&self, value: f32, previous_status: Status) -> Status {
        match previous_status {
            // No elevated state to hold on to: evaluate against the trigger points.
            Status::Ok | Status::Unknown if value >= self.critical_high => Status::Critical,
            Status::Ok | Status::Unknown if value >= self.warning_high => Status::Warning,
            Status::Ok | Status::Unknown => Status::Ok,

            // Already warning: escalate normally, recover only below the low mark.
            Status::Warning if value >= self.critical_high => Status::Critical,
            Status::Warning if value < self.warning_low => Status::Ok,
            Status::Warning => Status::Warning,

            // Already critical: stay critical until below the critical recovery point.
            Status::Critical if value < self.critical_low && value < self.warning_low => Status::Ok,
            Status::Critical if value < self.critical_low => Status::Warning,
            Status::Critical => Status::Critical,
        }
    }
}
/// Status tracker for hysteresis - tracks previous status per metric.
///
/// Holds the most recently recorded `Status` for each metric name so that the
/// next evaluation can take the previous state into account. A metric with no
/// entry is reported as `Status::Unknown` by `get_previous_status`.
#[derive(Debug, Default)]
pub struct StatusTracker {
// Last recorded status, keyed by metric name.
previous_statuses: HashMap<String, Status>,
}
impl StatusTracker {
    /// Creates an empty tracker with no recorded metrics.
    pub fn new() -> Self {
        Self::default()
    }

    /// Get previous status for a metric.
    ///
    /// Returns `Status::Unknown` when nothing has been recorded for
    /// `metric_name` yet.
    pub fn get_previous_status(&self, metric_name: &str) -> Status {
        self.previous_statuses
            .get(metric_name)
            .copied()
            .unwrap_or(Status::Unknown)
    }

    /// Update status for a metric, replacing any previously recorded value.
    pub fn update_status(&mut self, metric_name: String, status: Status) {
        self.previous_statuses.insert(metric_name, status);
    }

    /// Calculate status with hysteresis.
    ///
    /// Looks up the status previously recorded for `metric_name` (or
    /// `Status::Unknown` on first sight), lets `thresholds` derive the new
    /// status for `value`, records that new status, and returns it.
    pub fn calculate_with_hysteresis(&mut self, metric_name: &str, value: f32, thresholds: &HysteresisThresholds) -> Status {
        // Already-tracked metric: update the stored status in place so no
        // owned key has to be allocated on every collection cycle.
        if let Some(tracked) = self.previous_statuses.get_mut(metric_name) {
            let next = thresholds.calculate_status(value, *tracked);
            *tracked = next;
            return next;
        }

        // First measurement for this metric: only now is an owned key needed.
        let first = thresholds.calculate_status(value, Status::Unknown);
        self.previous_statuses.insert(metric_name.to_owned(), first);
        first
    }
}
/// Metric name registry - constants for all metric names
pub mod registry {
// CPU metrics
@@ -109,7 +222,7 @@ pub mod registry {
pub const CPU_TEMPERATURE_CELSIUS: &str = "cpu_temperature_celsius";
pub const CPU_FREQUENCY_MHZ: &str = "cpu_frequency_mhz";
pub const CPU_USAGE_PERCENT: &str = "cpu_usage_percent";
// Memory metrics
pub const MEMORY_USAGE_PERCENT: &str = "memory_usage_percent";
pub const MEMORY_TOTAL_GB: &str = "memory_total_gb";
@@ -117,7 +230,7 @@ pub mod registry {
pub const MEMORY_AVAILABLE_GB: &str = "memory_available_gb";
pub const MEMORY_SWAP_TOTAL_GB: &str = "memory_swap_total_gb";
pub const MEMORY_SWAP_USED_GB: &str = "memory_swap_used_gb";
// Disk metrics (template - actual names include device)
pub const DISK_USAGE_PERCENT_TEMPLATE: &str = "disk_{device}_usage_percent";
pub const DISK_TEMPERATURE_CELSIUS_TEMPLATE: &str = "disk_{device}_temperature_celsius";
@@ -125,37 +238,37 @@ pub mod registry {
pub const DISK_SPARE_PERCENT_TEMPLATE: &str = "disk_{device}_spare_percent";
pub const DISK_HOURS_TEMPLATE: &str = "disk_{device}_hours";
pub const DISK_CAPACITY_GB_TEMPLATE: &str = "disk_{device}_capacity_gb";
// Service metrics (template - actual names include service)
pub const SERVICE_STATUS_TEMPLATE: &str = "service_{name}_status";
pub const SERVICE_MEMORY_MB_TEMPLATE: &str = "service_{name}_memory_mb";
pub const SERVICE_CPU_PERCENT_TEMPLATE: &str = "service_{name}_cpu_percent";
// Backup metrics
pub const BACKUP_STATUS: &str = "backup_status";
pub const BACKUP_LAST_RUN_TIMESTAMP: &str = "backup_last_run_timestamp";
pub const BACKUP_SIZE_GB: &str = "backup_size_gb";
pub const BACKUP_DURATION_MINUTES: &str = "backup_duration_minutes";
pub const BACKUP_NEXT_SCHEDULED_TIMESTAMP: &str = "backup_next_scheduled_timestamp";
// Network metrics (template - actual names include interface)
pub const NETWORK_RX_BYTES_TEMPLATE: &str = "network_{interface}_rx_bytes";
pub const NETWORK_TX_BYTES_TEMPLATE: &str = "network_{interface}_tx_bytes";
pub const NETWORK_RX_PACKETS_TEMPLATE: &str = "network_{interface}_rx_packets";
pub const NETWORK_TX_PACKETS_TEMPLATE: &str = "network_{interface}_tx_packets";
/// Build a concrete disk metric name by substituting `device` for every
/// `{device}` placeholder in `template`.
pub fn disk_metric(template: &str, device: &str) -> String {
    const PLACEHOLDER: &str = "{device}";
    str::replace(template, PLACEHOLDER, device)
}
/// Build a concrete service metric name by substituting `name` for every
/// `{name}` placeholder in `template`.
pub fn service_metric(template: &str, name: &str) -> String {
    const PLACEHOLDER: &str = "{name}";
    str::replace(template, PLACEHOLDER, name)
}
/// Build a concrete network metric name by substituting `interface` for every
/// `{interface}` placeholder in `template`.
pub fn network_metric(template: &str, interface: &str) -> String {
    const PLACEHOLDER: &str = "{interface}";
    str::replace(template, PLACEHOLDER, interface)
}
}
}