Implement hysteresis for metric status changes to prevent flapping

Add comprehensive hysteresis support to prevent status oscillation near
threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits (sketched after this list)
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C
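
A minimal sketch of the threshold pair, assuming hypothetical names (only
Status is confirmed by the diff below; the real API lives in
cm_dashboard_shared and may differ):

    use cm_dashboard_shared::Status;

    /// Illustrative only: an upper limit for escalating and a lower
    /// limit for clearing, separated by the configured gap.
    pub struct HysteresisThresholds {
        pub upper: f64,
        pub lower: f64,
    }

    impl HysteresisThresholds {
        /// Derive the lower limit from the gap, e.g. with_gap(80.0, 10.0)
        /// for the CPU load default: escalate at 80%, clear below 70%.
        pub fn with_gap(upper: f64, gap: f64) -> Self {
            Self { upper, lower: upper - gap }
        }

        /// Inside the gap the previous status is kept, so a value hovering
        /// near the boundary cannot flap between Ok and Warning.
        pub fn evaluate(&self, value: f64, previous: Status) -> Status {
            if value >= self.upper {
                Status::Warning
            } else if value < self.lower {
                Status::Ok
            } else {
                previous
            }
        }
    }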

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to the StatusTracker-aware Collector interface (trait sketch after this list)
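
The new collect() signature is taken from the backup collector diff below;
the surrounding trait is a sketch and the rest of its shape is assumed
(CollectorError is the error type already used in the diff):

    use async_trait::async_trait;
    use cm_dashboard_shared::{Metric, StatusTracker};

    #[async_trait]
    pub trait Collector {
        fn name(&self) -> &str;

        // Collectors now receive the shared tracker so per-metric status
        // history is recorded while each collector computes its statuses.
        async fn collect(
            &self,
            status_tracker: &mut StatusTracker,
        ) -> Result<Vec<Metric>, CollectorError>;
    }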

Cache Interval Adjustments (constants sketch after this list):
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)
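
For reference, the new tiers as constants (names are hypothetical; only the
values come from this commit):

    const SERVICE_STATUS_INTERVAL_SECS: u64 = 10; // was 60
    const DISK_USAGE_INTERVAL_SECS: u64 = 60; // was 300
    const BACKUP_STATUS_INTERVAL_SECS: u64 = 60; // was 900
    const SMART_DATA_INTERVAL_SECS: u64 = 600; // new 10-minute tier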

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager (flow sketched after this list)
- Status aggregation preserved in dashboard widgets
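
A hypothetical sketch of that flow (only Collector, StatusTracker, Metric,
and MetricCollectionManager are named in this commit; the rest is assumed):

    pub struct MetricCollectionManager {
        collectors: Vec<Box<dyn Collector>>,
        status_tracker: StatusTracker,
    }

    impl MetricCollectionManager {
        pub async fn collect_all(&mut self) -> Vec<Metric> {
            let mut all = Vec::new();
            for collector in &self.collectors {
                // Each collector calculates its own metric statuses and
                // records transitions in the one shared tracker; widgets
                // downstream still aggregate statuses for display.
                if let Ok(metrics) = collector.collect(&mut self.status_tracker).await {
                    all.extend(metrics);
                }
            }
            all
        }
    }
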
2025-10-20 18:45:41 +02:00
parent e998679901
commit 00a8ed3da2
34 changed files with 1037 additions and 770 deletions


@@ -1,6 +1,6 @@
 use async_trait::async_trait;
-use cm_dashboard_shared::{Metric, MetricValue, Status};
 use chrono::Utc;
+use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use tokio::fs;
@@ -18,7 +18,8 @@ pub struct BackupCollector {
 impl BackupCollector {
     pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
         Self {
-            backup_status_file: backup_status_file.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
+            backup_status_file: backup_status_file
+                .unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
             max_age_hours,
         }
     }
@@ -43,10 +44,16 @@ impl BackupCollector {
             Ok(dt) => dt.with_timezone(&Utc),
             Err(_) => {
                 // Try parsing as naive datetime and assume UTC
-                match chrono::NaiveDateTime::parse_from_str(&backup_status.start_time, "%Y-%m-%dT%H:%M:%S%.f") {
+                match chrono::NaiveDateTime::parse_from_str(
+                    &backup_status.start_time,
+                    "%Y-%m-%dT%H:%M:%S%.f",
+                ) {
                     Ok(naive_dt) => naive_dt.and_utc(),
                     Err(_) => {
-                        error!("Failed to parse backup timestamp: {}", backup_status.start_time);
+                        error!(
+                            "Failed to parse backup timestamp: {}",
+                            backup_status.start_time
+                        );
                         return Status::Unknown;
                     }
                 }
@@ -63,7 +70,7 @@ impl BackupCollector {
                 } else {
                     Status::Ok
                 }
-            },
+            }
             "failed" => Status::Critical,
             "running" => Status::Ok, // Currently running is OK
             _ => Status::Unknown,
@@ -78,7 +85,7 @@ impl BackupCollector {
                 } else {
                     Status::Critical
                 }
-            },
+            }
             "failed" => Status::Critical,
             "disabled" => Status::Warning, // Service intentionally disabled
             "running" => Status::Ok,
@@ -97,7 +104,7 @@ impl Collector for BackupCollector {
"backup"
}
async fn collect(&self) -> Result<Vec<Metric>, CollectorError> {
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
let backup_status = self.read_backup_status().await?;
let mut metrics = Vec::new();
let timestamp = chrono::Utc::now().timestamp() as u64;
@@ -114,7 +121,10 @@ impl Collector for BackupCollector {
             }),
             status: overall_status,
             timestamp,
-            description: Some(format!("Backup: {} at {}", backup_status.status, backup_status.start_time)),
+            description: Some(format!(
+                "Backup: {} at {}",
+                backup_status.status, backup_status.start_time
+            )),
             unit: None,
         });
@@ -129,14 +139,18 @@ impl Collector for BackupCollector {
         });
 
         // Last backup timestamp - use last_updated (when backup finished) instead of start_time
-        let last_updated_dt_result = chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
-            .map(|dt| dt.with_timezone(&Utc))
-            .or_else(|_| {
-                // Try parsing as naive datetime and assume UTC
-                chrono::NaiveDateTime::parse_from_str(&backup_status.last_updated, "%Y-%m-%dT%H:%M:%S%.f")
+        let last_updated_dt_result =
+            chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
+                .map(|dt| dt.with_timezone(&Utc))
+                .or_else(|_| {
+                    // Try parsing as naive datetime and assume UTC
+                    chrono::NaiveDateTime::parse_from_str(
+                        &backup_status.last_updated,
+                        "%Y-%m-%dT%H:%M:%S%.f",
+                    )
                     .map(|naive_dt| naive_dt.and_utc())
-            });
+                });
 
         if let Ok(last_updated_dt) = last_updated_dt_result {
             metrics.push(Metric {
                 name: "backup_last_run_timestamp".to_string(),
@@ -147,13 +161,16 @@ impl Collector for BackupCollector {
                 unit: Some("unix_timestamp".to_string()),
             });
         } else {
-            error!("Failed to parse backup timestamp for last_run_timestamp: {}", backup_status.last_updated);
+            error!(
+                "Failed to parse backup timestamp for last_run_timestamp: {}",
+                backup_status.last_updated
+            );
         }
 
         // Individual service metrics
         for (service_name, service) in &backup_status.services {
             let service_status = self.calculate_service_status(service);
 
             // Service status
             metrics.push(Metric {
                 name: format!("backup_service_{}_status", service_name),
@@ -165,7 +182,10 @@ impl Collector for BackupCollector {
                 }),
                 status: service_status,
                 timestamp,
-                description: Some(format!("Backup service {} status: {}", service_name, service.status)),
+                description: Some(format!(
+                    "Backup service {} status: {}",
+                    service_name, service.status
+                )),
                 unit: None,
             });
@@ -173,7 +193,11 @@ impl Collector for BackupCollector {
             metrics.push(Metric {
                 name: format!("backup_service_{}_exit_code", service_name),
                 value: MetricValue::Integer(service.exit_code),
-                status: if service.exit_code == 0 { Status::Ok } else { Status::Critical },
+                status: if service.exit_code == 0 {
+                    Status::Ok
+                } else {
+                    Status::Critical
+                },
                 timestamp,
                 description: Some(format!("Exit code for backup service {}", service_name)),
                 unit: None,
@@ -222,7 +246,9 @@ impl Collector for BackupCollector {
             });
 
         // Calculate total repository size
-        let total_size_bytes: u64 = backup_status.services.values()
+        let total_size_bytes: u64 = backup_status
+            .services
+            .values()
             .map(|s| s.repo_size_bytes)
             .sum();
         let total_size_gb = Self::bytes_to_gb(total_size_bytes);
@@ -301,7 +327,6 @@ impl Collector for BackupCollector {
                 unit: None,
             });
         }
-        }
 
         // Add standalone disk identification metrics from TOML fields
@@ -372,7 +397,7 @@ pub struct DiskSpace {
     pub used_gb: f64,
     pub available_gb: f64,
     pub usage_percent: f64,
-    // Optional disk identification fields
+    // Optional disk identification fields
     pub product_name: Option<String>,
     pub serial_number: Option<String>,
 }
@@ -384,4 +409,4 @@ pub struct ServiceStatus {
     pub repo_path: String,
     pub archive_count: i64,
     pub repo_size_bytes: u64,
-}
+}