Implement hysteresis for metric status changes to prevent flapping
Add comprehensive hysteresis support to prevent status oscillation near
threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support the StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
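To illustrate the hysteresis idea: a status only de-escalates once the value clears the threshold by the configured gap, so a metric hovering at a boundary cannot flap. A minimal sketch, assuming the Status enum from cm_dashboard_shared; the field and method names here are illustrative, since the actual HysteresisThresholds definition is not part of this diff:

    // Sketch only: names and semantics are assumptions, not the real API.
    use cm_dashboard_shared::Status;

    pub struct HysteresisThresholds {
        pub warning: f64,  // upper limit that escalates to Warning
        pub critical: f64, // upper limit that escalates to Critical
        pub gap: f64,      // e.g. 10.0 for CPU load, 5.0 for memory / disk temp
    }

    impl HysteresisThresholds {
        /// Compute the next status from the current value and previous status.
        pub fn evaluate(&self, value: f64, previous: Status) -> Status {
            match previous {
                // De-escalate only after the value drops below threshold - gap.
                Status::Critical if value > self.critical - self.gap => Status::Critical,
                Status::Warning | Status::Critical if value > self.warning - self.gap => {
                    Status::Warning
                }
                // Plain escalation path for Ok/Unknown and fully recovered values.
                _ if value >= self.critical => Status::Critical,
                _ if value >= self.warning => Status::Warning,
                _ => Status::Ok,
            }
        }
    }

With the default CPU gap of 10%, a load that crosses the critical limit stays Critical until it falls a full 10 percentage points below that limit, rather than toggling on every sample near the boundary.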
@@ -1,6 +1,6 @@
 use async_trait::async_trait;
-use cm_dashboard_shared::{Metric, MetricValue, Status};
 use chrono::Utc;
+use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use tokio::fs;
@@ -18,7 +18,8 @@ pub struct BackupCollector {
 impl BackupCollector {
     pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
         Self {
-            backup_status_file: backup_status_file.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
+            backup_status_file: backup_status_file
+                .unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
             max_age_hours,
         }
     }
@@ -43,10 +44,16 @@ impl BackupCollector {
             Ok(dt) => dt.with_timezone(&Utc),
             Err(_) => {
                 // Try parsing as naive datetime and assume UTC
-                match chrono::NaiveDateTime::parse_from_str(&backup_status.start_time, "%Y-%m-%dT%H:%M:%S%.f") {
+                match chrono::NaiveDateTime::parse_from_str(
+                    &backup_status.start_time,
+                    "%Y-%m-%dT%H:%M:%S%.f",
+                ) {
                     Ok(naive_dt) => naive_dt.and_utc(),
                     Err(_) => {
-                        error!("Failed to parse backup timestamp: {}", backup_status.start_time);
+                        error!(
+                            "Failed to parse backup timestamp: {}",
+                            backup_status.start_time
+                        );
                         return Status::Unknown;
                     }
                 }
@@ -63,7 +70,7 @@ impl BackupCollector {
                 } else {
                     Status::Ok
                 }
-            },
+            }
             "failed" => Status::Critical,
             "running" => Status::Ok, // Currently running is OK
             _ => Status::Unknown,
@@ -78,7 +85,7 @@ impl BackupCollector {
                 } else {
                     Status::Critical
                 }
-            },
+            }
             "failed" => Status::Critical,
             "disabled" => Status::Warning, // Service intentionally disabled
             "running" => Status::Ok,
@@ -97,7 +104,7 @@ impl Collector for BackupCollector {
         "backup"
     }
 
-    async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
+    async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
         let backup_status = self.read_backup_status().await?;
         let mut metrics = Vec::new();
         let timestamp = chrono::Utc::now().timestamp() as u64;
@@ -114,7 +121,10 @@ impl Collector for BackupCollector {
             }),
             status: overall_status,
             timestamp,
-            description: Some(format!("Backup: {} at {}", backup_status.status, backup_status.start_time)),
+            description: Some(format!(
+                "Backup: {} at {}",
+                backup_status.status, backup_status.start_time
+            )),
             unit: None,
         });
 
@@ -129,14 +139,18 @@ impl Collector for BackupCollector {
         });
 
         // Last backup timestamp - use last_updated (when backup finished) instead of start_time
-        let last_updated_dt_result = chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
-            .map(|dt| dt.with_timezone(&Utc))
-            .or_else(|_| {
-                // Try parsing as naive datetime and assume UTC
-                chrono::NaiveDateTime::parse_from_str(&backup_status.last_updated, "%Y-%m-%dT%H:%M:%S%.f")
-                    .map(|naive_dt| naive_dt.and_utc())
-            });
+        let last_updated_dt_result =
+            chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
+                .map(|dt| dt.with_timezone(&Utc))
+                .or_else(|_| {
+                    // Try parsing as naive datetime and assume UTC
+                    chrono::NaiveDateTime::parse_from_str(
+                        &backup_status.last_updated,
+                        "%Y-%m-%dT%H:%M:%S%.f",
+                    )
+                    .map(|naive_dt| naive_dt.and_utc())
+                });
 
         if let Ok(last_updated_dt) = last_updated_dt_result {
             metrics.push(Metric {
                 name: "backup_last_run_timestamp".to_string(),
@@ -147,13 +161,16 @@ impl Collector for BackupCollector {
                 unit: Some("unix_timestamp".to_string()),
             });
         } else {
-            error!("Failed to parse backup timestamp for last_run_timestamp: {}", backup_status.last_updated);
+            error!(
+                "Failed to parse backup timestamp for last_run_timestamp: {}",
+                backup_status.last_updated
+            );
         }
 
         // Individual service metrics
         for (service_name, service) in &backup_status.services {
             let service_status = self.calculate_service_status(service);
 
             // Service status
             metrics.push(Metric {
                 name: format!("backup_service_{}_status", service_name),
@@ -165,7 +182,10 @@ impl Collector for BackupCollector {
                 }),
                 status: service_status,
                 timestamp,
-                description: Some(format!("Backup service {} status: {}", service_name, service.status)),
+                description: Some(format!(
+                    "Backup service {} status: {}",
+                    service_name, service.status
+                )),
                 unit: None,
             });
 
@@ -173,7 +193,11 @@ impl Collector for BackupCollector {
             metrics.push(Metric {
                 name: format!("backup_service_{}_exit_code", service_name),
                 value: MetricValue::Integer(service.exit_code),
-                status: if service.exit_code == 0 { Status::Ok } else { Status::Critical },
+                status: if service.exit_code == 0 {
+                    Status::Ok
+                } else {
+                    Status::Critical
+                },
                 timestamp,
                 description: Some(format!("Exit code for backup service {}", service_name)),
                 unit: None,
@@ -222,7 +246,9 @@ impl Collector for BackupCollector {
         });
 
         // Calculate total repository size
-        let total_size_bytes: u64 = backup_status.services.values()
+        let total_size_bytes: u64 = backup_status
+            .services
+            .values()
             .map(|s| s.repo_size_bytes)
             .sum();
         let total_size_gb = Self::bytes_to_gb(total_size_bytes);
@@ -301,7 +327,6 @@ impl Collector for BackupCollector {
                 unit: None,
             });
         }
-
     }
 
     // Add standalone disk identification metrics from TOML fields
@@ -372,7 +397,7 @@ pub struct DiskSpace {
     pub used_gb: f64,
     pub available_gb: f64,
     pub usage_percent: f64,
-    // Optional disk identification fields
+    // Optional disk identification fields
     pub product_name: Option<String>,
     pub serial_number: Option<String>,
 }
@@ -384,4 +409,4 @@ pub struct ServiceStatus {
     pub repo_path: String,
     pub archive_count: i64,
     pub repo_size_bytes: u64,
-}
+}
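For context on the collect() signature change above: every collector now receives the shared StatusTracker, including collectors like BackupCollector that ignore it for now (hence the _status_tracker binding). A sketch of the updated trait as implied by this diff; the exact trait definition is not shown here, so treat the method set as an assumption:

    // Sketch inferred from this diff, not the verbatim trait definition.
    #[async_trait]
    pub trait Collector {
        fn name(&self) -> &str;

        // New in this commit: the manager passes its centralized StatusTracker
        // into every collector so per-metric status history lives in one place.
        async fn collect(
            &self,
            status_tracker: &mut StatusTracker,
        ) -> Result<Vec<Metric>, CollectorError>;
    }

Passing the tracker by &mut reference keeps status history centralized in MetricCollectionManager while still letting each collector compute its own metric statuses, matching the architecture notes in the commit message.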