Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near
threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: 900s → 600s (moved from the disk_heavy tier to the static tier, whose interval drops from 3600s to 600s)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions

View File

@@ -24,29 +24,47 @@ pub struct CacheConfig {
impl Default for CacheConfig {
fn default() -> Self {
let mut tiers = HashMap::new();
tiers.insert("realtime".to_string(), CacheTier {
interval_seconds: 2,
description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)".to_string(),
});
tiers.insert("disk_light".to_string(), CacheTier {
interval_seconds: 60,
description: "Light disk operations - 1 minute (service status checks)".to_string(),
});
tiers.insert("disk_medium".to_string(), CacheTier {
interval_seconds: 300,
description: "Medium disk operations - 5 minutes (disk usage, service disk)".to_string(),
});
tiers.insert("disk_heavy".to_string(), CacheTier {
interval_seconds: 900,
description: "Heavy disk operations - 15 minutes (SMART data, backup status)".to_string(),
});
tiers.insert("static".to_string(), CacheTier {
interval_seconds: 3600,
description: "Hardware info that rarely changes - 1 hour".to_string(),
});
tiers.insert(
"realtime".to_string(),
CacheTier {
interval_seconds: 2,
description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)"
.to_string(),
},
);
tiers.insert(
"disk_light".to_string(),
CacheTier {
interval_seconds: 10,
description: "Light disk operations - 10 seconds (service status checks)".to_string(),
},
);
tiers.insert(
"disk_medium".to_string(),
CacheTier {
interval_seconds: 60,
description: "Medium disk operations - 1 minute (disk usage, service disk)"
.to_string(),
},
);
tiers.insert(
"disk_heavy".to_string(),
CacheTier {
interval_seconds: 60,
description: "Heavy disk operations - 1 minute (backup status)"
.to_string(),
},
);
tiers.insert(
"static".to_string(),
CacheTier {
interval_seconds: 600,
description: "SMART data operations - 10 minutes".to_string(),
},
);
let mut metric_assignments = HashMap::new();
// REALTIME (2s) - Memory/CPU operations, no disk I/O
metric_assignments.insert("cpu_load_*".to_string(), "realtime".to_string());
metric_assignments.insert("cpu_temperature_*".to_string(), "realtime".to_string());
@@ -55,22 +73,24 @@ impl Default for CacheConfig {
metric_assignments.insert("service_*_cpu_percent".to_string(), "realtime".to_string());
metric_assignments.insert("service_*_memory_mb".to_string(), "realtime".to_string());
metric_assignments.insert("network_*".to_string(), "realtime".to_string());
// DISK_LIGHT (10s) - Light disk operations: service status checks
metric_assignments.insert("service_*_status".to_string(), "disk_light".to_string());
// DISK_MEDIUM (1min) - Medium disk operations: du commands, disk usage
metric_assignments.insert("service_*_disk_gb".to_string(), "disk_medium".to_string());
metric_assignments.insert("disk_tmp_*".to_string(), "disk_medium".to_string());
metric_assignments.insert("disk_*_usage_*".to_string(), "disk_medium".to_string());
metric_assignments.insert("disk_*_size_*".to_string(), "disk_medium".to_string());
// DISK_HEAVY (15min) - Heavy disk operations: SMART data, backup status
metric_assignments.insert("disk_*_temperature".to_string(), "disk_heavy".to_string());
metric_assignments.insert("disk_*_wear_percent".to_string(), "disk_heavy".to_string());
metric_assignments.insert("smart_*".to_string(), "disk_heavy".to_string());
// DISK_HEAVY (1min) - Heavy disk operations: backup status
metric_assignments.insert("backup_*".to_string(), "disk_heavy".to_string());
// STATIC (10min) - SMART data operations
metric_assignments.insert("disk_*_temperature".to_string(), "static".to_string());
metric_assignments.insert("disk_*_wear_percent".to_string(), "static".to_string());
metric_assignments.insert("smart_*".to_string(), "static".to_string());
Self {
enabled: true,
default_ttl_seconds: 30,
@@ -101,11 +121,11 @@ impl CacheConfig {
if pattern.contains('*') {
// Convert pattern to regex-like matching
let pattern_parts: Vec<&str> = pattern.split('*').collect();
if pattern_parts.len() == 2 {
let prefix = pattern_parts[0];
let suffix = pattern_parts[1];
if suffix.is_empty() {
// Pattern like "cpu_*" - just check prefix
metric_name.starts_with(prefix)
@@ -118,9 +138,9 @@ impl CacheConfig {
}
} else {
// More complex patterns - for now, just check if all parts are present
pattern_parts.iter().all(|part| {
part.is_empty() || metric_name.contains(part)
})
pattern_parts
.iter()
.all(|part| part.is_empty() || metric_name.contains(part))
}
} else {
metric_name == pattern
@@ -142,7 +162,7 @@ mod tests {
#[test]
fn test_pattern_matching() {
let config = CacheConfig::default();
assert!(config.matches_pattern("cpu_load_1min", "cpu_load_*"));
assert!(config.matches_pattern("service_nginx_disk_gb", "service_*_disk_gb"));
assert!(!config.matches_pattern("memory_usage_percent", "cpu_load_*"));
@@ -151,21 +171,21 @@ mod tests {
#[test]
fn test_tier_assignment() {
let config = CacheConfig::default();
// Realtime (2s) - CPU/Memory operations
assert_eq!(config.get_cache_interval("cpu_load_1min"), 2);
assert_eq!(config.get_cache_interval("memory_usage_percent"), 2);
assert_eq!(config.get_cache_interval("service_nginx_cpu_percent"), 2);
// Disk light (60s) - Service status
assert_eq!(config.get_cache_interval("service_nginx_status"), 60);
// Disk medium (300s) - Disk usage
assert_eq!(config.get_cache_interval("service_nginx_disk_gb"), 300);
assert_eq!(config.get_cache_interval("disk_tmp_usage_percent"), 300);
// Disk heavy (900s) - SMART data
assert_eq!(config.get_cache_interval("disk_nvme0_temperature"), 900);
assert_eq!(config.get_cache_interval("smart_nvme0_wear_percent"), 900);
// Disk light (10s) - Service status
assert_eq!(config.get_cache_interval("service_nginx_status"), 10);
// Disk medium (60s) - Disk usage
assert_eq!(config.get_cache_interval("service_nginx_disk_gb"), 60);
assert_eq!(config.get_cache_interval("disk_tmp_usage_percent"), 60);
// Static (600s) - SMART data
assert_eq!(config.get_cache_interval("disk_nvme0_temperature"), 600);
assert_eq!(config.get_cache_interval("smart_nvme0_wear_percent"), 600);
}
}
}