Implement hysteresis for metric status changes to prevent flapping
Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
This commit is contained in:
@@ -1,18 +1,19 @@
|
||||
use anyhow::{Context, Result};
|
||||
use std::path::Path;
|
||||
use std::fs;
|
||||
use crate::config::AgentConfig;
|
||||
use anyhow::{Context, Result};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
pub fn load_config<P: AsRef<Path>>(path: P) -> Result<AgentConfig> {
|
||||
let path = path.as_ref();
|
||||
let content = fs::read_to_string(path)
|
||||
.with_context(|| format!("Failed to read config file: {}", path.display()))?;
|
||||
|
||||
|
||||
let config: AgentConfig = toml::from_str(&content)
|
||||
.with_context(|| format!("Failed to parse config file: {}", path.display()))?;
|
||||
|
||||
config.validate()
|
||||
|
||||
config
|
||||
.validate()
|
||||
.with_context(|| format!("Invalid configuration in file: {}", path.display()))?;
|
||||
|
||||
|
||||
Ok(config)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::CacheConfig;
|
||||
use gethostname::gethostname;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::Path;
|
||||
|
||||
|
||||
@@ -1,114 +1,126 @@
|
||||
use anyhow::{bail, Result};
|
||||
use crate::config::AgentConfig;
|
||||
use anyhow::{bail, Result};
|
||||
|
||||
pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
||||
// Validate ZMQ configuration
|
||||
if config.zmq.publisher_port == 0 {
|
||||
bail!("ZMQ publisher port cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.command_port == 0 {
|
||||
bail!("ZMQ command port cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.publisher_port == config.zmq.command_port {
|
||||
bail!("ZMQ publisher and command ports cannot be the same");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.bind_address.is_empty() {
|
||||
bail!("ZMQ bind address cannot be empty");
|
||||
}
|
||||
|
||||
|
||||
if config.zmq.timeout_ms == 0 {
|
||||
bail!("ZMQ timeout cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
// Validate collection interval
|
||||
if config.collection_interval_seconds == 0 {
|
||||
bail!("Collection interval cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
// Validate CPU thresholds
|
||||
if config.collectors.cpu.enabled {
|
||||
if config.collectors.cpu.load_warning_threshold <= 0.0 {
|
||||
bail!("CPU load warning threshold must be positive");
|
||||
}
|
||||
|
||||
if config.collectors.cpu.load_critical_threshold <= config.collectors.cpu.load_warning_threshold {
|
||||
|
||||
if config.collectors.cpu.load_critical_threshold
|
||||
<= config.collectors.cpu.load_warning_threshold
|
||||
{
|
||||
bail!("CPU load critical threshold must be greater than warning threshold");
|
||||
}
|
||||
|
||||
|
||||
if config.collectors.cpu.temperature_warning_threshold <= 0.0 {
|
||||
bail!("CPU temperature warning threshold must be positive");
|
||||
}
|
||||
|
||||
if config.collectors.cpu.temperature_critical_threshold <= config.collectors.cpu.temperature_warning_threshold {
|
||||
|
||||
if config.collectors.cpu.temperature_critical_threshold
|
||||
<= config.collectors.cpu.temperature_warning_threshold
|
||||
{
|
||||
bail!("CPU temperature critical threshold must be greater than warning threshold");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate memory thresholds
|
||||
if config.collectors.memory.enabled {
|
||||
if config.collectors.memory.usage_warning_percent <= 0.0 || config.collectors.memory.usage_warning_percent > 100.0 {
|
||||
if config.collectors.memory.usage_warning_percent <= 0.0
|
||||
|| config.collectors.memory.usage_warning_percent > 100.0
|
||||
{
|
||||
bail!("Memory usage warning threshold must be between 0 and 100");
|
||||
}
|
||||
|
||||
if config.collectors.memory.usage_critical_percent <= config.collectors.memory.usage_warning_percent
|
||||
|| config.collectors.memory.usage_critical_percent > 100.0 {
|
||||
|
||||
if config.collectors.memory.usage_critical_percent
|
||||
<= config.collectors.memory.usage_warning_percent
|
||||
|| config.collectors.memory.usage_critical_percent > 100.0
|
||||
{
|
||||
bail!("Memory usage critical threshold must be between warning threshold and 100");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate disk thresholds
|
||||
if config.collectors.disk.enabled {
|
||||
if config.collectors.disk.usage_warning_percent <= 0.0 || config.collectors.disk.usage_warning_percent > 100.0 {
|
||||
if config.collectors.disk.usage_warning_percent <= 0.0
|
||||
|| config.collectors.disk.usage_warning_percent > 100.0
|
||||
{
|
||||
bail!("Disk usage warning threshold must be between 0 and 100");
|
||||
}
|
||||
|
||||
if config.collectors.disk.usage_critical_percent <= config.collectors.disk.usage_warning_percent
|
||||
|| config.collectors.disk.usage_critical_percent > 100.0 {
|
||||
|
||||
if config.collectors.disk.usage_critical_percent
|
||||
<= config.collectors.disk.usage_warning_percent
|
||||
|| config.collectors.disk.usage_critical_percent > 100.0
|
||||
{
|
||||
bail!("Disk usage critical threshold must be between warning threshold and 100");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate SMTP configuration
|
||||
if config.notifications.enabled {
|
||||
if config.notifications.smtp_host.is_empty() {
|
||||
bail!("SMTP host cannot be empty when notifications are enabled");
|
||||
}
|
||||
|
||||
|
||||
if config.notifications.smtp_port == 0 {
|
||||
bail!("SMTP port cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.notifications.from_email.is_empty() {
|
||||
bail!("From email cannot be empty when notifications are enabled");
|
||||
}
|
||||
|
||||
|
||||
if config.notifications.to_email.is_empty() {
|
||||
bail!("To email cannot be empty when notifications are enabled");
|
||||
}
|
||||
|
||||
|
||||
// Basic email validation
|
||||
if !config.notifications.from_email.contains('@') {
|
||||
bail!("From email must contain @ symbol");
|
||||
}
|
||||
|
||||
|
||||
if !config.notifications.to_email.contains('@') {
|
||||
bail!("To email must contain @ symbol");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Validate cache configuration
|
||||
if config.cache.enabled {
|
||||
if config.cache.default_ttl_seconds == 0 {
|
||||
bail!("Cache TTL cannot be 0");
|
||||
}
|
||||
|
||||
|
||||
if config.cache.max_entries == 0 {
|
||||
bail!("Cache max entries cannot be 0");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user