Fix notification system with proper rate limiting and aggregation
All checks were successful
Build and Release / build-and-release (push) Successful in 1m51s
- Add rate limiting using rate_limit_minutes config (was ignored)
- Add aggregation using aggregation_interval_seconds config (was ignored)
- Use smtp_host and smtp_port from config (was hardcoded localhost:25)
- Add trigger_on_warnings and trigger_on_failures config options
- Add recovery_requires_all_ok and suppress_individual_recoveries
- Use check_interval_seconds from config (was hardcoded 30s)
- Expand status tracking to all components (drives, pools, services, backup)
- Move notification checks from every collection to dedicated interval
- Separate alert and recovery notifications with proper email formatting
- Only notify on failed services (Critical), not inactive
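For reference, a notification section exercising the new options might look like this — a sketch only: the key names come from the diff below, while the file format (TOML assumed), section name, and values are illustrative:

    [notifications]
    enabled = true
    smtp_host = "localhost"           # was hardcoded before this commit
    smtp_port = 25
    from_email = "agent@{hostname}"   # {hostname} is substituted at send time
    to_email = "admin@example.com"
    rate_limit_minutes = 15           # per-component cooldown; 0 disables
    aggregation_interval_seconds = 60
    check_interval_seconds = 30
    trigger_on_warnings = true
    trigger_on_failures = true
    recovery_requires_all_ok = true
    suppress_individual_recoveries = true
    exclude_email_metrics = []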
@@ -1,5 +1,6 @@
 use anyhow::Result;
 use gethostname::gethostname;
+use std::collections::HashMap;
 use std::time::{Duration, Instant};
 use tokio::time::interval;
 use tracing::{debug, error, info};
@@ -28,7 +29,6 @@ struct TimedCollector {
 }
 
 pub struct Agent {
     hostname: String,
     config: AgentConfig,
     zmq_handler: ZmqHandler,
     collectors: Vec<TimedCollector>,
@@ -38,12 +38,40 @@ pub struct Agent {
 }
 
 /// Track system component status for change detection
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
 struct SystemStatus {
     // CPU
     cpu_load_status: cm_dashboard_shared::Status,
     cpu_temperature_status: cm_dashboard_shared::Status,
     // Memory
     memory_usage_status: cm_dashboard_shared::Status,
-    // Add more as needed
+    // Storage - keyed by drive name or pool name
+    drive_statuses: HashMap<String, DriveStatus>,
+    pool_statuses: HashMap<String, PoolStatus>,
+    // Services - keyed by service name
+    service_statuses: HashMap<String, cm_dashboard_shared::Status>,
+    // Backup
+    backup_status: cm_dashboard_shared::Status,
 }
 
+#[derive(Debug, Clone, Default)]
+struct DriveStatus {
+    temperature_status: cm_dashboard_shared::Status,
+    health_status: cm_dashboard_shared::Status,
+    filesystem_statuses: HashMap<String, cm_dashboard_shared::Status>,
+}
+
+#[derive(Debug, Clone, Default)]
+struct PoolStatus {
+    health_status: cm_dashboard_shared::Status,
+    usage_status: cm_dashboard_shared::Status,
+    drive_statuses: HashMap<String, PoolDriveStatus>,
+}
+
+#[derive(Debug, Clone, Default)]
+struct PoolDriveStatus {
+    health_status: cm_dashboard_shared::Status,
+    temperature_status: cm_dashboard_shared::Status,
+}
+
 impl Agent {
@@ -148,7 +176,6 @@ impl Agent {
         let cached_agent_data = AgentData::new(hostname.clone(), env!("CARGO_PKG_VERSION").to_string());
 
         Ok(Self {
             hostname,
             config,
             zmq_handler,
             collectors,
@@ -171,7 +198,9 @@ impl Agent {
         let mut transmission_interval = interval(Duration::from_secs(
             self.config.zmq.transmission_interval_seconds,
         ));
-        let mut notification_interval = interval(Duration::from_secs(30)); // Check notifications every 30s
+        let mut notification_interval = interval(Duration::from_secs(
+            self.config.notifications.check_interval_seconds,
+        ));
 
         // Skip initial ticks to avoid immediate execution
         transmission_interval.tick().await;
@@ -185,9 +214,21 @@ impl Agent {
                 }
             }
             _ = notification_interval.tick() => {
-                // Process any pending notifications
-                // NOTE: With structured data, we might need to implement status tracking differently
-                // For now, we skip this until status evaluation is migrated
+                // Check for status changes and queue notifications
+                let agent_data_snapshot = self.cached_agent_data.clone();
+                if let Err(e) = self.check_status_changes_and_notify(&agent_data_snapshot).await {
+                    error!("Failed to check status changes: {}", e);
+                }
+
+                // Check if all components recovered and flush pending recoveries
+                self.notification_manager.flush_recoveries_if_all_ok();
+
+                // Flush any pending aggregated notifications
+                if self.notification_manager.should_flush() {
+                    if let Err(e) = self.notification_manager.flush_notifications().await {
+                        error!("Failed to flush notifications: {}", e);
+                    }
+                }
             }
             _ = &mut shutdown_rx => {
                 info!("Shutdown signal received, stopping agent loop");
@@ -235,16 +276,8 @@ impl Agent {
             .unwrap()
             .as_secs();
 
-        // Clone for notification check (to avoid borrow issues)
-        let agent_data_snapshot = self.cached_agent_data.clone();
-
-        // Check for status changes and send notifications
-        if let Err(e) = self.check_status_changes_and_notify(&agent_data_snapshot).await {
-            error!("Failed to check status changes: {}", e);
-        }
-
         // Broadcast the cached structured data via ZMQ
-        if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data_snapshot).await {
+        if let Err(e) = self.zmq_handler.publish_agent_data(&self.cached_agent_data).await {
             error!("Failed to broadcast agent data: {}", e);
         } else {
             debug!("Successfully broadcast structured agent data");
@@ -253,38 +286,182 @@ impl Agent {
         Ok(())
     }
 
-    /// Check for status changes and send notifications
+    /// Check for status changes and queue notifications
     async fn check_status_changes_and_notify(&mut self, agent_data: &AgentData) -> Result<()> {
-        // Extract current status
-        let current_status = SystemStatus {
-            cpu_load_status: agent_data.system.cpu.load_status.clone(),
-            cpu_temperature_status: agent_data.system.cpu.temperature_status.clone(),
-            memory_usage_status: agent_data.system.memory.usage_status.clone(),
-        };
+        // Build current status from agent data
+        let mut current_status = SystemStatus {
+            cpu_load_status: agent_data.system.cpu.load_status,
+            cpu_temperature_status: agent_data.system.cpu.temperature_status,
+            memory_usage_status: agent_data.system.memory.usage_status,
+            backup_status: agent_data.backup.backup_status,
+            ..Default::default()
+        };
 
-        // Check for status changes
-        if let Some(previous) = self.previous_status.clone() {
-            self.check_and_notify_status_change(
-                "CPU Load",
-                &previous.cpu_load_status,
-                &current_status.cpu_load_status,
-                format!("CPU load: {:.1}", agent_data.system.cpu.load_1min)
-            ).await?;
-
-            self.check_and_notify_status_change(
-                "CPU Temperature",
-                &previous.cpu_temperature_status,
-                &current_status.cpu_temperature_status,
-                format!("CPU temperature: {}°C",
-                    agent_data.system.cpu.temperature_celsius.unwrap_or(0.0) as i32)
-            ).await?;
-
-            self.check_and_notify_status_change(
-                "Memory Usage",
-                &previous.memory_usage_status,
-                &current_status.memory_usage_status,
-                format!("Memory usage: {:.1}%", agent_data.system.memory.usage_percent)
-            ).await?;
+        // Collect drive statuses
+        for drive in &agent_data.system.storage.drives {
+            let mut fs_statuses = HashMap::new();
+            for fs in &drive.filesystems {
+                fs_statuses.insert(fs.mount.clone(), fs.usage_status);
+            }
+            current_status.drive_statuses.insert(
+                drive.name.clone(),
+                DriveStatus {
+                    temperature_status: drive.temperature_status,
+                    health_status: drive.health_status,
+                    filesystem_statuses: fs_statuses,
+                },
+            );
+        }
+
+        // Collect pool statuses
+        for pool in &agent_data.system.storage.pools {
+            let mut pool_drive_statuses = HashMap::new();
+            for drive in pool.data_drives.iter().chain(pool.parity_drives.iter()) {
+                pool_drive_statuses.insert(
+                    drive.name.clone(),
+                    PoolDriveStatus {
+                        health_status: drive.health_status,
+                        temperature_status: drive.temperature_status,
+                    },
+                );
+            }
+            current_status.pool_statuses.insert(
+                pool.name.clone(),
+                PoolStatus {
+                    health_status: pool.health_status,
+                    usage_status: pool.usage_status,
+                    drive_statuses: pool_drive_statuses,
+                },
+            );
+        }
+
+        // Collect service statuses (only for non-user-stopped services)
+        for service in &agent_data.services {
+            if !service.user_stopped {
+                current_status
+                    .service_statuses
+                    .insert(service.name.clone(), service.service_status);
+            }
+        }
+
+        // Clone previous status to avoid borrow issues
+        let previous = self.previous_status.clone();
+
+        // Compare with previous status and queue notifications
+        if let Some(previous) = previous {
+            // CPU
+            self.queue_status_notification(
+                "CPU Load",
+                &previous.cpu_load_status,
+                &current_status.cpu_load_status,
+                &format!("Load: {:.2}", agent_data.system.cpu.load_1min),
+            );
+            self.queue_status_notification(
+                "CPU Temperature",
+                &previous.cpu_temperature_status,
+                &current_status.cpu_temperature_status,
+                &format!(
+                    "Temperature: {}°C",
+                    agent_data.system.cpu.temperature_celsius.unwrap_or(0.0) as i32
+                ),
+            );
+
+            // Memory
+            self.queue_status_notification(
+                "Memory",
+                &previous.memory_usage_status,
+                &current_status.memory_usage_status,
+                &format!("Usage: {:.1}%", agent_data.system.memory.usage_percent),
+            );
+
+            // Backup
+            self.queue_status_notification(
+                "Backup",
+                &previous.backup_status,
+                &current_status.backup_status,
+                &format!(
+                    "Last backup: {}",
+                    agent_data.backup.last_backup_time.as_deref().unwrap_or("unknown")
+                ),
+            );
+
+            // Drives
+            for (name, current_drive) in &current_status.drive_statuses {
+                if let Some(prev_drive) = previous.drive_statuses.get(name) {
+                    self.queue_status_notification(
+                        &format!("Drive {} Health", name),
+                        &prev_drive.health_status,
+                        &current_drive.health_status,
+                        "Health check failed",
+                    );
+                    self.queue_status_notification(
+                        &format!("Drive {} Temperature", name),
+                        &prev_drive.temperature_status,
+                        &current_drive.temperature_status,
+                        "Temperature threshold exceeded",
+                    );
+
+                    // Filesystem usage
+                    for (mount, current_fs_status) in &current_drive.filesystem_statuses {
+                        if let Some(prev_fs_status) = prev_drive.filesystem_statuses.get(mount) {
+                            self.queue_status_notification(
+                                &format!("Filesystem {}", mount),
+                                prev_fs_status,
+                                current_fs_status,
+                                "Disk usage threshold exceeded",
+                            );
+                        }
+                    }
+                }
+            }
+
+            // Pools
+            for (name, current_pool) in &current_status.pool_statuses {
+                if let Some(prev_pool) = previous.pool_statuses.get(name) {
+                    self.queue_status_notification(
+                        &format!("Pool {} Health", name),
+                        &prev_pool.health_status,
+                        &current_pool.health_status,
+                        "Pool health degraded",
+                    );
+                    self.queue_status_notification(
+                        &format!("Pool {} Usage", name),
+                        &prev_pool.usage_status,
+                        &current_pool.usage_status,
+                        "Pool usage threshold exceeded",
+                    );
+
+                    // Pool drives
+                    for (drive_name, current_pd) in &current_pool.drive_statuses {
+                        if let Some(prev_pd) = prev_pool.drive_statuses.get(drive_name) {
+                            self.queue_status_notification(
+                                &format!("Pool {} Drive {} Health", name, drive_name),
+                                &prev_pd.health_status,
+                                &current_pd.health_status,
+                                "Pool drive health degraded",
+                            );
+                            self.queue_status_notification(
+                                &format!("Pool {} Drive {} Temperature", name, drive_name),
+                                &prev_pd.temperature_status,
+                                &current_pd.temperature_status,
+                                "Pool drive temperature exceeded",
+                            );
+                        }
+                    }
+                }
+            }
+
+            // Services
+            for (name, current_svc_status) in &current_status.service_statuses {
+                if let Some(prev_svc_status) = previous.service_statuses.get(name) {
+                    self.queue_status_notification(
+                        &format!("Service {}", name),
+                        prev_svc_status,
+                        current_svc_status,
+                        "Service status changed",
+                    );
+                }
+            }
+        }
 
         // Store current status for next comparison
@@ -292,43 +469,44 @@ impl Agent {
         Ok(())
     }
 
-    /// Check individual status change and send notification if degraded
-    async fn check_and_notify_status_change(
+    /// Queue a notification based on status change
+    fn queue_status_notification(
         &mut self,
         component: &str,
         previous: &cm_dashboard_shared::Status,
         current: &cm_dashboard_shared::Status,
-        details: String
-    ) -> Result<()> {
+        details: &str,
+    ) {
        use cm_dashboard_shared::Status;
 
-        // Only notify on status degradation (OK → Warning/Critical, Warning → Critical)
-        let should_notify = match (previous, current) {
-            (Status::Ok, Status::Warning) => true,
-            (Status::Ok, Status::Critical) => true,
-            (Status::Warning, Status::Critical) => true,
-            _ => false,
-        };
+        // Check for degradation (alert)
+        let is_alert = matches!(
+            (previous, current),
+            (Status::Ok, Status::Warning)
+                | (Status::Ok, Status::Critical)
+                | (Status::Warning, Status::Critical)
+        );
 
-        if should_notify {
-            let subject = format!("{} {} Alert", self.hostname, component);
-            let body = format!(
-                "Alert: {} status changed from {:?} to {:?}\n\nDetails: {}\n\nTime: {}",
-                component,
-                previous,
-                current,
-                details,
-                chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
-            );
-
-            info!("Sending notification: {} - {:?} → {:?}", component, previous, current);
-
-            if let Err(e) = self.notification_manager.send_direct_email(&subject, &body).await {
-                error!("Failed to send notification for {}: {}", component, e);
-            }
-        }
-
-        Ok(())
+        // Check for recovery
+        let is_recovery = matches!(
+            (previous, current),
+            (Status::Warning, Status::Ok)
+                | (Status::Critical, Status::Ok)
+                | (Status::Critical, Status::Warning)
+        );
+
+        if is_alert {
+            info!(
+                "Alert: {} - {:?} → {:?}",
+                component, previous, current
+            );
+            self.notification_manager.queue_alert(component, previous, current, details);
+        } else if is_recovery {
+            info!(
+                "Recovery: {} - {:?} → {:?}",
+                component, previous, current
+            );
+            self.notification_manager.queue_recovery(component, previous, current, details);
+        }
     }
 
 }
@@ -953,15 +953,21 @@ impl SystemdCollector {
             "-s",
             "--max-time",
             "4",
-            "https://ifconfig.me"
+            "https://1.1.1.1/cdn-cgi/trace"
         ])
         .output()
         .ok()?;
 
     if output.status.success() {
-        let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
-        if !ip.is_empty() && ip.contains('.') {
-            return Some(ip);
-        }
+        let response = String::from_utf8_lossy(&output.stdout);
+        // Parse "ip=x.x.x.x" from the response
+        for line in response.lines() {
+            if let Some(ip) = line.strip_prefix("ip=") {
+                let ip = ip.trim().to_string();
+                if !ip.is_empty() {
+                    return Some(ip);
+                }
+            }
+        }
     }
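For context, the Cloudflare trace endpoint returns one key=value pair per line, roughly like this (illustrative values; only the ip field is consumed here):

    h=1.1.1.1
    ip=203.0.113.7
    ts=1733000000.000
    visit_scheme=https

hence the strip_prefix("ip=") parsing above, whereas the old ifconfig.me response was the bare address.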
@@ -141,8 +141,23 @@ pub struct NotificationConfig {
     pub from_email: String,
     pub to_email: String,
     pub rate_limit_minutes: u64,
+    /// Whether to send notifications on warning status
+    #[serde(default = "default_true")]
+    pub trigger_on_warnings: bool,
+    /// Whether to send notifications on failure/critical status
+    #[serde(default = "default_true")]
+    pub trigger_on_failures: bool,
+    /// Only send recovery notification when all components are OK
+    #[serde(default)]
+    pub recovery_requires_all_ok: bool,
+    /// Suppress individual recovery notifications (only notify on full recovery)
+    #[serde(default)]
+    pub suppress_individual_recoveries: bool,
     /// Email notification batching interval in seconds (default: 60)
     pub aggregation_interval_seconds: u64,
+    /// How often to check for status changes in seconds (default: 30)
+    #[serde(default = "default_check_interval_seconds")]
+    pub check_interval_seconds: u64,
     /// List of metric names to exclude from email notifications
     #[serde(default)]
     pub exclude_email_metrics: Vec<String>,
@@ -151,6 +166,14 @@ pub struct NotificationConfig {
     pub maintenance_mode_file: String,
 }
 
+fn default_true() -> bool {
+    true
+}
+
+fn default_check_interval_seconds() -> u64 {
+    30
+}
+
 fn default_heartbeat_interval_seconds() -> u64 {
     5
@@ -1,60 +1,314 @@
 use crate::config::NotificationConfig;
 use anyhow::Result;
+use chrono::Utc;
+use cm_dashboard_shared::Status;
 use lettre::transport::smtp::SmtpTransport;
 use lettre::{Message, Transport};
+use std::collections::HashMap;
+use std::time::{Duration, Instant};
 use tracing::{debug, error, info};
 
-/// Manages notifications
+/// Manages notifications with rate limiting and aggregation
 pub struct NotificationManager {
     config: NotificationConfig,
+    /// Last notification time per component for rate limiting
+    last_notification: HashMap<String, Instant>,
+    /// Pending notifications for aggregation
+    pending_notifications: Vec<PendingNotification>,
+    /// Pending recovery notifications (held until all OK if configured)
+    pending_recoveries: Vec<PendingNotification>,
+    /// Last aggregation flush time
+    last_aggregation_flush: Option<Instant>,
+    /// Track components currently in alert state
+    components_in_alert: HashMap<String, Status>,
 }
 
+/// A pending notification waiting to be aggregated
+#[derive(Debug, Clone)]
+struct PendingNotification {
+    component: String,
+    previous_status: String,
+    current_status: String,
+    details: String,
+    timestamp: chrono::DateTime<Utc>,
+    is_recovery: bool,
+}
+
 impl NotificationManager {
     pub fn new(config: &NotificationConfig, _hostname: &str) -> Result<Self> {
         Ok(Self {
             config: config.clone(),
+            last_notification: HashMap::new(),
+            pending_notifications: Vec::new(),
+            pending_recoveries: Vec::new(),
+            last_aggregation_flush: None,
+            components_in_alert: HashMap::new(),
         })
     }
 
-    pub async fn send_direct_email(&mut self, subject: &str, body: &str) -> Result<()> {
+    /// Check if a component is rate limited
+    fn is_rate_limited(&self, component: &str) -> bool {
+        if self.config.rate_limit_minutes == 0 {
+            return false;
+        }
+        if let Some(last_time) = self.last_notification.get(component) {
+            let rate_limit = Duration::from_secs(self.config.rate_limit_minutes * 60);
+            last_time.elapsed() < rate_limit
+        } else {
+            false
+        }
+    }
+
+    /// Queue a degradation notification (Ok→Warning, Ok→Critical, Warning→Critical)
+    pub fn queue_alert(
+        &mut self,
+        component: &str,
+        previous: &Status,
+        current: &Status,
+        details: &str,
+    ) {
+        // Check if this status type should trigger notifications
+        // Only Warning and Critical trigger notifications (not Inactive)
+        let should_notify = match current {
+            Status::Warning => self.config.trigger_on_warnings,
+            Status::Critical => self.config.trigger_on_failures,
+            _ => false,
+        };
+
+        if !should_notify {
+            debug!(
+                "Notification for {} suppressed (trigger_on_warnings={}, trigger_on_failures={})",
+                component, self.config.trigger_on_warnings, self.config.trigger_on_failures
+            );
+            return;
+        }
+
+        // Check rate limit
+        if self.is_rate_limited(component) {
+            debug!(
+                "Notification for {} rate limited (limit: {} min)",
+                component, self.config.rate_limit_minutes
+            );
+            return;
+        }
+
+        // Check exclusions
+        if self.config.exclude_email_metrics.iter().any(|e| component.contains(e)) {
+            debug!("Notification for {} excluded by config", component);
+            return;
+        }
+
+        // Track this component as in alert state
+        self.components_in_alert.insert(component.to_string(), *current);
+
+        self.pending_notifications.push(PendingNotification {
+            component: component.to_string(),
+            previous_status: format!("{:?}", previous),
+            current_status: format!("{:?}", current),
+            details: details.to_string(),
+            timestamp: Utc::now(),
+            is_recovery: false,
+        });
+
+        // Update rate limit tracker
+        self.last_notification.insert(component.to_string(), Instant::now());
+
+        debug!(
+            "Queued alert for {}: {:?} -> {:?}",
+            component, previous, current
+        );
+    }
+
+    /// Queue a recovery notification (Warning→Ok, Critical→Ok, Critical→Warning)
+    pub fn queue_recovery(
+        &mut self,
+        component: &str,
+        previous: &Status,
+        current: &Status,
+        details: &str,
+    ) {
+        // Remove from alert tracking
+        self.components_in_alert.remove(component);
+
+        // Check if individual recoveries are suppressed
+        if self.config.suppress_individual_recoveries {
+            debug!(
+                "Individual recovery for {} suppressed by config",
+                component
+            );
+
+            // Store recovery for potential batch notification
+            self.pending_recoveries.push(PendingNotification {
+                component: component.to_string(),
+                previous_status: format!("{:?}", previous),
+                current_status: format!("{:?}", current),
+                details: details.to_string(),
+                timestamp: Utc::now(),
+                is_recovery: true,
+            });
+            return;
+        }
+
+        // Check exclusions
+        if self.config.exclude_email_metrics.iter().any(|e| component.contains(e)) {
+            debug!("Recovery notification for {} excluded by config", component);
+            return;
+        }
+
+        self.pending_notifications.push(PendingNotification {
+            component: component.to_string(),
+            previous_status: format!("{:?}", previous),
+            current_status: format!("{:?}", current),
+            details: details.to_string(),
+            timestamp: Utc::now(),
+            is_recovery: true,
+        });
+
+        debug!(
+            "Queued recovery for {}: {:?} -> {:?}",
+            component, previous, current
+        );
+    }
+
+    /// Check if all components have recovered (no components in alert state)
+    pub fn all_components_ok(&self) -> bool {
+        self.components_in_alert.is_empty()
+    }
+
+    /// Flush suppressed recovery notifications when all components are OK
+    pub fn flush_recoveries_if_all_ok(&mut self) {
+        if !self.config.recovery_requires_all_ok || self.all_components_ok() {
+            if !self.pending_recoveries.is_empty() {
+                info!("All components recovered, sending batch recovery notification");
+                self.pending_notifications.append(&mut self.pending_recoveries);
+            }
+        }
+    }
+
+    /// Check if it's time to flush aggregated notifications
+    pub fn should_flush(&self) -> bool {
+        if self.pending_notifications.is_empty() {
+            return false;
+        }
+
+        match self.last_aggregation_flush {
+            None => true, // First flush
+            Some(last_flush) => {
+                let aggregation_interval =
+                    Duration::from_secs(self.config.aggregation_interval_seconds);
+                last_flush.elapsed() >= aggregation_interval
+            }
+        }
+    }
+
+    /// Flush pending notifications as a single aggregated email
+    pub async fn flush_notifications(&mut self) -> Result<()> {
+        if self.pending_notifications.is_empty() {
+            return Ok(());
+        }
+
         if !self.config.enabled {
+            self.pending_notifications.clear();
+            self.last_aggregation_flush = Some(Instant::now());
             return Ok(());
         }
 
         if self.is_maintenance_mode() {
-            debug!("Maintenance mode active, suppressing email notification");
+            debug!("Maintenance mode active, suppressing aggregated notifications");
+            self.pending_notifications.clear();
+            self.last_aggregation_flush = Some(Instant::now());
             return Ok(());
         }
 
-        let hostname = gethostname::gethostname()
-            .to_string_lossy()
-            .to_string();
+        let hostname = gethostname::gethostname().to_string_lossy().to_string();
+
+        // Build aggregated email
+        let notification_count = self.pending_notifications.len();
+        let alert_count = self.pending_notifications.iter().filter(|n| !n.is_recovery).count();
+        let recovery_count = self.pending_notifications.iter().filter(|n| n.is_recovery).count();
+
+        let subject = if notification_count == 1 {
+            let n = &self.pending_notifications[0];
+            if n.is_recovery {
+                format!("[{}] {} Recovered: {}", hostname, n.component, n.current_status)
+            } else {
+                format!("[{}] {} Alert: {}", hostname, n.component, n.current_status)
+            }
+        } else if recovery_count > 0 && alert_count == 0 {
+            format!("[{}] {} Components Recovered", hostname, recovery_count)
+        } else if alert_count > 0 && recovery_count == 0 {
+            format!("[{}] {} Status Alerts", hostname, alert_count)
+        } else {
+            format!("[{}] {} Alerts, {} Recoveries", hostname, alert_count, recovery_count)
+        };
+
+        let mut body = String::new();
+        body.push_str(&format!("Status notifications for host: {}\n", hostname));
+        body.push_str(&format!("Time: {}\n\n", Utc::now().format("%Y-%m-%d %H:%M:%S UTC")));
+
+        // Group alerts and recoveries
+        let alerts: Vec<_> = self.pending_notifications.iter().filter(|n| !n.is_recovery).collect();
+        let recoveries: Vec<_> = self.pending_notifications.iter().filter(|n| n.is_recovery).collect();
+
+        if !alerts.is_empty() {
+            body.push_str("=== ALERTS ===\n\n");
+            for notification in &alerts {
+                body.push_str(&format!(
+                    "• {} : {} → {}\n  {}\n  ({})\n\n",
+                    notification.component,
+                    notification.previous_status,
+                    notification.current_status,
+                    notification.details,
+                    notification.timestamp.format("%H:%M:%S UTC")
+                ));
+            }
+        }
+
+        if !recoveries.is_empty() {
+            body.push_str("=== RECOVERIES ===\n\n");
+            for notification in &recoveries {
+                body.push_str(&format!(
+                    "• {} : {} → {}\n  {}\n  ({})\n\n",
+                    notification.component,
+                    notification.previous_status,
+                    notification.current_status,
+                    notification.details,
+                    notification.timestamp.format("%H:%M:%S UTC")
+                ));
+            }
+        }
+
+        body.push_str("--\nCM Dashboard Agent");
+
+        // Send the aggregated email
         let from_email = self.config.from_email.replace("{hostname}", &hostname);
 
-        let email_body = format!(
-            "{}\n\n--\nCM Dashboard Agent\nGenerated at {}",
-            body,
-            Utc::now().format("%Y-%m-%d %H:%M:%S %Z")
-        );
-
         let email = Message::builder()
             .from(from_email.parse()?)
             .to(self.config.to_email.parse()?)
-            .subject(subject)
-            .body(email_body)?;
+            .subject(&subject)
+            .body(body)?;
 
-        let mailer = SmtpTransport::unencrypted_localhost();
+        let mailer = SmtpTransport::builder_dangerous(&self.config.smtp_host)
+            .port(self.config.smtp_port)
+            .build();
 
         match mailer.send(&email) {
-            Ok(_) => info!("Direct email sent successfully: {}", subject),
+            Ok(_) => {
+                info!(
+                    "Sent aggregated notification email with {} alerts",
+                    notification_count
+                );
+            }
             Err(e) => {
-                error!("Failed to send email: {}", e);
+                error!("Failed to send aggregated email: {}", e);
                 return Err(e.into());
             }
         }
 
+        self.pending_notifications.clear();
+        self.last_aggregation_flush = Some(Instant::now());
+
         Ok(())
     }
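Assembled from the format strings above, a flushed aggregation email would look roughly like this (values are illustrative):

    Subject: [myhost] 1 Alerts, 1 Recoveries

    Status notifications for host: myhost
    Time: 2025-01-01 12:00:00 UTC

    === ALERTS ===

    • CPU Load : Ok → Warning
      Load: 7.52
      (11:59:30 UTC)

    === RECOVERIES ===

    • Memory : Warning → Ok
      Usage: 41.3%
      (11:58:02 UTC)

    --
    CM Dashboard Agent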