Implement hysteresis for metric status changes to prevent flapping
Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
@@ -24,29 +24,47 @@ pub struct CacheConfig {
|
||||
impl Default for CacheConfig {
|
||||
fn default() -> Self {
|
||||
let mut tiers = HashMap::new();
|
||||
tiers.insert("realtime".to_string(), CacheTier {
|
||||
interval_seconds: 2,
|
||||
description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)".to_string(),
|
||||
});
|
||||
tiers.insert("disk_light".to_string(), CacheTier {
|
||||
interval_seconds: 60,
|
||||
description: "Light disk operations - 1 minute (service status checks)".to_string(),
|
||||
});
|
||||
tiers.insert("disk_medium".to_string(), CacheTier {
|
||||
interval_seconds: 300,
|
||||
description: "Medium disk operations - 5 minutes (disk usage, service disk)".to_string(),
|
||||
});
|
||||
tiers.insert("disk_heavy".to_string(), CacheTier {
|
||||
interval_seconds: 900,
|
||||
description: "Heavy disk operations - 15 minutes (SMART data, backup status)".to_string(),
|
||||
});
|
||||
tiers.insert("static".to_string(), CacheTier {
|
||||
interval_seconds: 3600,
|
||||
description: "Hardware info that rarely changes - 1 hour".to_string(),
|
||||
});
|
||||
tiers.insert(
|
||||
"realtime".to_string(),
|
||||
CacheTier {
|
||||
interval_seconds: 2,
|
||||
description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)"
|
||||
.to_string(),
|
||||
},
|
||||
);
|
||||
tiers.insert(
|
||||
"disk_light".to_string(),
|
||||
CacheTier {
|
||||
interval_seconds: 10,
|
||||
description: "Light disk operations - 10 seconds (service status checks)".to_string(),
|
||||
},
|
||||
);
|
||||
tiers.insert(
|
||||
"disk_medium".to_string(),
|
||||
CacheTier {
|
||||
interval_seconds: 60,
|
||||
description: "Medium disk operations - 1 minute (disk usage, service disk)"
|
||||
.to_string(),
|
||||
},
|
||||
);
|
||||
tiers.insert(
|
||||
"disk_heavy".to_string(),
|
||||
CacheTier {
|
||||
interval_seconds: 60,
|
||||
description: "Heavy disk operations - 1 minute (backup status)"
|
||||
.to_string(),
|
||||
},
|
||||
);
|
||||
tiers.insert(
|
||||
"static".to_string(),
|
||||
CacheTier {
|
||||
interval_seconds: 600,
|
||||
description: "SMART data operations - 10 minutes".to_string(),
|
||||
},
|
||||
);
|
||||
|
||||
let mut metric_assignments = HashMap::new();
|
||||
|
||||
|
||||
// REALTIME (2s) - Memory/CPU operations, no disk I/O
|
||||
metric_assignments.insert("cpu_load_*".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("cpu_temperature_*".to_string(), "realtime".to_string());
|
||||
@@ -55,22 +73,24 @@ impl Default for CacheConfig {
|
||||
metric_assignments.insert("service_*_cpu_percent".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("service_*_memory_mb".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("network_*".to_string(), "realtime".to_string());
|
||||
|
||||
|
||||
// DISK_LIGHT (10s) - Light disk operations: service status checks
|
||||
metric_assignments.insert("service_*_status".to_string(), "disk_light".to_string());
|
||||
|
||||
|
||||
// DISK_MEDIUM (1min) - Medium disk operations: du commands, disk usage
|
||||
metric_assignments.insert("service_*_disk_gb".to_string(), "disk_medium".to_string());
|
||||
metric_assignments.insert("disk_tmp_*".to_string(), "disk_medium".to_string());
|
||||
metric_assignments.insert("disk_*_usage_*".to_string(), "disk_medium".to_string());
|
||||
metric_assignments.insert("disk_*_size_*".to_string(), "disk_medium".to_string());
|
||||
|
||||
// DISK_HEAVY (15min) - Heavy disk operations: SMART data, backup status
|
||||
metric_assignments.insert("disk_*_temperature".to_string(), "disk_heavy".to_string());
|
||||
metric_assignments.insert("disk_*_wear_percent".to_string(), "disk_heavy".to_string());
|
||||
metric_assignments.insert("smart_*".to_string(), "disk_heavy".to_string());
|
||||
|
||||
// DISK_HEAVY (1min) - Heavy disk operations: backup status
|
||||
metric_assignments.insert("backup_*".to_string(), "disk_heavy".to_string());
|
||||
|
||||
// STATIC (10min) - SMART data operations
|
||||
metric_assignments.insert("disk_*_temperature".to_string(), "static".to_string());
|
||||
metric_assignments.insert("disk_*_wear_percent".to_string(), "static".to_string());
|
||||
metric_assignments.insert("smart_*".to_string(), "static".to_string());
|
||||
|
||||
Self {
|
||||
enabled: true,
|
||||
default_ttl_seconds: 30,
|
||||
@@ -101,11 +121,11 @@ impl CacheConfig {
|
||||
if pattern.contains('*') {
|
||||
// Convert pattern to regex-like matching
|
||||
let pattern_parts: Vec<&str> = pattern.split('*').collect();
|
||||
|
||||
|
||||
if pattern_parts.len() == 2 {
|
||||
let prefix = pattern_parts[0];
|
||||
let suffix = pattern_parts[1];
|
||||
|
||||
|
||||
if suffix.is_empty() {
|
||||
// Pattern like "cpu_*" - just check prefix
|
||||
metric_name.starts_with(prefix)
|
||||
@@ -118,9 +138,9 @@ impl CacheConfig {
|
||||
}
|
||||
} else {
|
||||
// More complex patterns - for now, just check if all parts are present
|
||||
pattern_parts.iter().all(|part| {
|
||||
part.is_empty() || metric_name.contains(part)
|
||||
})
|
||||
pattern_parts
|
||||
.iter()
|
||||
.all(|part| part.is_empty() || metric_name.contains(part))
|
||||
}
|
||||
} else {
|
||||
metric_name == pattern
|
||||
@@ -142,7 +162,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_pattern_matching() {
|
||||
let config = CacheConfig::default();
|
||||
|
||||
|
||||
assert!(config.matches_pattern("cpu_load_1min", "cpu_load_*"));
|
||||
assert!(config.matches_pattern("service_nginx_disk_gb", "service_*_disk_gb"));
|
||||
assert!(!config.matches_pattern("memory_usage_percent", "cpu_load_*"));
|
||||
@@ -151,21 +171,21 @@ mod tests {
|
||||
#[test]
|
||||
fn test_tier_assignment() {
|
||||
let config = CacheConfig::default();
|
||||
|
||||
|
||||
// Realtime (2s) - CPU/Memory operations
|
||||
assert_eq!(config.get_cache_interval("cpu_load_1min"), 2);
|
||||
assert_eq!(config.get_cache_interval("memory_usage_percent"), 2);
|
||||
assert_eq!(config.get_cache_interval("service_nginx_cpu_percent"), 2);
|
||||
|
||||
// Disk light (60s) - Service status
|
||||
assert_eq!(config.get_cache_interval("service_nginx_status"), 60);
|
||||
|
||||
// Disk medium (300s) - Disk usage
|
||||
assert_eq!(config.get_cache_interval("service_nginx_disk_gb"), 300);
|
||||
assert_eq!(config.get_cache_interval("disk_tmp_usage_percent"), 300);
|
||||
|
||||
// Disk heavy (900s) - SMART data
|
||||
assert_eq!(config.get_cache_interval("disk_nvme0_temperature"), 900);
|
||||
assert_eq!(config.get_cache_interval("smart_nvme0_wear_percent"), 900);
|
||||
|
||||
// Disk light (10s) - Service status
|
||||
assert_eq!(config.get_cache_interval("service_nginx_status"), 10);
|
||||
|
||||
// Disk medium (60s) - Disk usage
|
||||
assert_eq!(config.get_cache_interval("service_nginx_disk_gb"), 60);
|
||||
assert_eq!(config.get_cache_interval("disk_tmp_usage_percent"), 60);
|
||||
|
||||
// Static (600s) - SMART data
|
||||
assert_eq!(config.get_cache_interval("disk_nvme0_temperature"), 600);
|
||||
assert_eq!(config.get_cache_interval("smart_nvme0_wear_percent"), 600);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user