Fix notification system with proper rate limiting and aggregation
All checks were successful
Build and Release / build-and-release (push) Successful in 1m51s
All checks were successful
Build and Release / build-and-release (push) Successful in 1m51s
- Add rate limiting using rate_limit_minutes config (was ignored) - Add aggregation using aggregation_interval_seconds config (was ignored) - Use smtp_host and smtp_port from config (was hardcoded localhost:25) - Add trigger_on_warnings and trigger_on_failures config options - Add recovery_requires_all_ok and suppress_individual_recoveries - Use check_interval_seconds from config (was hardcoded 30s) - Expand status tracking to all components (drives, pools, services, backup) - Move notification checks from every collection to dedicated interval - Separate alert and recovery notifications with proper email formatting - Only notify on failed services (Critical), not inactive
This commit is contained in:
6
Cargo.lock
generated
6
Cargo.lock
generated
@@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.276"
|
||||
version = "0.1.277"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@@ -301,7 +301,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.275"
|
||||
version = "0.1.277"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -325,7 +325,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.275"
|
||||
version = "0.1.277"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"serde",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.275"
|
||||
version = "0.1.277"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use gethostname::gethostname;
|
||||
use std::collections::HashMap;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::time::interval;
|
||||
use tracing::{debug, error, info};
|
||||
@@ -28,7 +29,6 @@ struct TimedCollector {
|
||||
}
|
||||
|
||||
pub struct Agent {
|
||||
hostname: String,
|
||||
config: AgentConfig,
|
||||
zmq_handler: ZmqHandler,
|
||||
collectors: Vec<TimedCollector>,
|
||||
@@ -38,12 +38,40 @@ pub struct Agent {
|
||||
}
|
||||
|
||||
/// Track system component status for change detection
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, Default)]
|
||||
struct SystemStatus {
|
||||
// CPU
|
||||
cpu_load_status: cm_dashboard_shared::Status,
|
||||
cpu_temperature_status: cm_dashboard_shared::Status,
|
||||
// Memory
|
||||
memory_usage_status: cm_dashboard_shared::Status,
|
||||
// Add more as needed
|
||||
// Storage - keyed by drive name or pool name
|
||||
drive_statuses: HashMap<String, DriveStatus>,
|
||||
pool_statuses: HashMap<String, PoolStatus>,
|
||||
// Services - keyed by service name
|
||||
service_statuses: HashMap<String, cm_dashboard_shared::Status>,
|
||||
// Backup
|
||||
backup_status: cm_dashboard_shared::Status,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
struct DriveStatus {
|
||||
temperature_status: cm_dashboard_shared::Status,
|
||||
health_status: cm_dashboard_shared::Status,
|
||||
filesystem_statuses: HashMap<String, cm_dashboard_shared::Status>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
struct PoolStatus {
|
||||
health_status: cm_dashboard_shared::Status,
|
||||
usage_status: cm_dashboard_shared::Status,
|
||||
drive_statuses: HashMap<String, PoolDriveStatus>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
struct PoolDriveStatus {
|
||||
health_status: cm_dashboard_shared::Status,
|
||||
temperature_status: cm_dashboard_shared::Status,
|
||||
}
|
||||
|
||||
impl Agent {
|
||||
@@ -148,7 +176,6 @@ impl Agent {
|
||||
let cached_agent_data = AgentData::new(hostname.clone(), env!("CARGO_PKG_VERSION").to_string());
|
||||
|
||||
Ok(Self {
|
||||
hostname,
|
||||
config,
|
||||
zmq_handler,
|
||||
collectors,
|
||||
@@ -171,7 +198,9 @@ impl Agent {
|
||||
let mut transmission_interval = interval(Duration::from_secs(
|
||||
self.config.zmq.transmission_interval_seconds,
|
||||
));
|
||||
let mut notification_interval = interval(Duration::from_secs(30)); // Check notifications every 30s
|
||||
let mut notification_interval = interval(Duration::from_secs(
|
||||
self.config.notifications.check_interval_seconds,
|
||||
));
|
||||
|
||||
// Skip initial ticks to avoid immediate execution
|
||||
transmission_interval.tick().await;
|
||||
@@ -185,9 +214,21 @@ impl Agent {
|
||||
}
|
||||
}
|
||||
_ = notification_interval.tick() => {
|
||||
// Process any pending notifications
|
||||
// NOTE: With structured data, we might need to implement status tracking differently
|
||||
// For now, we skip this until status evaluation is migrated
|
||||
// Check for status changes and queue notifications
|
||||
let agent_data_snapshot = self.cached_agent_data.clone();
|
||||
if let Err(e) = self.check_status_changes_and_notify(&agent_data_snapshot).await {
|
||||
error!("Failed to check status changes: {}", e);
|
||||
}
|
||||
|
||||
// Check if all components recovered and flush pending recoveries
|
||||
self.notification_manager.flush_recoveries_if_all_ok();
|
||||
|
||||
// Flush any pending aggregated notifications
|
||||
if self.notification_manager.should_flush() {
|
||||
if let Err(e) = self.notification_manager.flush_notifications().await {
|
||||
error!("Failed to flush notifications: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ = &mut shutdown_rx => {
|
||||
info!("Shutdown signal received, stopping agent loop");
|
||||
@@ -235,16 +276,8 @@ impl Agent {
|
||||
.unwrap()
|
||||
.as_secs();
|
||||
|
||||
// Clone for notification check (to avoid borrow issues)
|
||||
let agent_data_snapshot = self.cached_agent_data.clone();
|
||||
|
||||
// Check for status changes and send notifications
|
||||
if let Err(e) = self.check_status_changes_and_notify(&agent_data_snapshot).await {
|
||||
error!("Failed to check status changes: {}", e);
|
||||
}
|
||||
|
||||
// Broadcast the cached structured data via ZMQ
|
||||
if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data_snapshot).await {
|
||||
if let Err(e) = self.zmq_handler.publish_agent_data(&self.cached_agent_data).await {
|
||||
error!("Failed to broadcast agent data: {}", e);
|
||||
} else {
|
||||
debug!("Successfully broadcast structured agent data");
|
||||
@@ -253,38 +286,182 @@ impl Agent {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check for status changes and send notifications
|
||||
/// Check for status changes and queue notifications
|
||||
async fn check_status_changes_and_notify(&mut self, agent_data: &AgentData) -> Result<()> {
|
||||
// Extract current status
|
||||
let current_status = SystemStatus {
|
||||
cpu_load_status: agent_data.system.cpu.load_status.clone(),
|
||||
cpu_temperature_status: agent_data.system.cpu.temperature_status.clone(),
|
||||
memory_usage_status: agent_data.system.memory.usage_status.clone(),
|
||||
// Build current status from agent data
|
||||
let mut current_status = SystemStatus {
|
||||
cpu_load_status: agent_data.system.cpu.load_status,
|
||||
cpu_temperature_status: agent_data.system.cpu.temperature_status,
|
||||
memory_usage_status: agent_data.system.memory.usage_status,
|
||||
backup_status: agent_data.backup.backup_status,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// Check for status changes
|
||||
if let Some(previous) = self.previous_status.clone() {
|
||||
self.check_and_notify_status_change(
|
||||
// Collect drive statuses
|
||||
for drive in &agent_data.system.storage.drives {
|
||||
let mut fs_statuses = HashMap::new();
|
||||
for fs in &drive.filesystems {
|
||||
fs_statuses.insert(fs.mount.clone(), fs.usage_status);
|
||||
}
|
||||
current_status.drive_statuses.insert(
|
||||
drive.name.clone(),
|
||||
DriveStatus {
|
||||
temperature_status: drive.temperature_status,
|
||||
health_status: drive.health_status,
|
||||
filesystem_statuses: fs_statuses,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Collect pool statuses
|
||||
for pool in &agent_data.system.storage.pools {
|
||||
let mut pool_drive_statuses = HashMap::new();
|
||||
for drive in pool.data_drives.iter().chain(pool.parity_drives.iter()) {
|
||||
pool_drive_statuses.insert(
|
||||
drive.name.clone(),
|
||||
PoolDriveStatus {
|
||||
health_status: drive.health_status,
|
||||
temperature_status: drive.temperature_status,
|
||||
},
|
||||
);
|
||||
}
|
||||
current_status.pool_statuses.insert(
|
||||
pool.name.clone(),
|
||||
PoolStatus {
|
||||
health_status: pool.health_status,
|
||||
usage_status: pool.usage_status,
|
||||
drive_statuses: pool_drive_statuses,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Collect service statuses (only for non-user-stopped services)
|
||||
for service in &agent_data.services {
|
||||
if !service.user_stopped {
|
||||
current_status
|
||||
.service_statuses
|
||||
.insert(service.name.clone(), service.service_status);
|
||||
}
|
||||
}
|
||||
|
||||
// Clone previous status to avoid borrow issues
|
||||
let previous = self.previous_status.clone();
|
||||
|
||||
// Compare with previous status and queue notifications
|
||||
if let Some(previous) = previous {
|
||||
// CPU
|
||||
self.queue_status_notification(
|
||||
"CPU Load",
|
||||
&previous.cpu_load_status,
|
||||
¤t_status.cpu_load_status,
|
||||
format!("CPU load: {:.1}", agent_data.system.cpu.load_1min)
|
||||
).await?;
|
||||
|
||||
self.check_and_notify_status_change(
|
||||
"CPU Temperature",
|
||||
&format!("Load: {:.2}", agent_data.system.cpu.load_1min),
|
||||
);
|
||||
self.queue_status_notification(
|
||||
"CPU Temperature",
|
||||
&previous.cpu_temperature_status,
|
||||
¤t_status.cpu_temperature_status,
|
||||
format!("CPU temperature: {}°C",
|
||||
agent_data.system.cpu.temperature_celsius.unwrap_or(0.0) as i32)
|
||||
).await?;
|
||||
&format!(
|
||||
"Temperature: {}°C",
|
||||
agent_data.system.cpu.temperature_celsius.unwrap_or(0.0) as i32
|
||||
),
|
||||
);
|
||||
|
||||
self.check_and_notify_status_change(
|
||||
"Memory Usage",
|
||||
&previous.memory_usage_status,
|
||||
// Memory
|
||||
self.queue_status_notification(
|
||||
"Memory",
|
||||
&previous.memory_usage_status,
|
||||
¤t_status.memory_usage_status,
|
||||
format!("Memory usage: {:.1}%", agent_data.system.memory.usage_percent)
|
||||
).await?;
|
||||
&format!("Usage: {:.1}%", agent_data.system.memory.usage_percent),
|
||||
);
|
||||
|
||||
// Backup
|
||||
self.queue_status_notification(
|
||||
"Backup",
|
||||
&previous.backup_status,
|
||||
¤t_status.backup_status,
|
||||
&format!(
|
||||
"Last backup: {}",
|
||||
agent_data.backup.last_backup_time.as_deref().unwrap_or("unknown")
|
||||
),
|
||||
);
|
||||
|
||||
// Drives
|
||||
for (name, current_drive) in ¤t_status.drive_statuses {
|
||||
if let Some(prev_drive) = previous.drive_statuses.get(name) {
|
||||
self.queue_status_notification(
|
||||
&format!("Drive {} Health", name),
|
||||
&prev_drive.health_status,
|
||||
¤t_drive.health_status,
|
||||
"Health check failed",
|
||||
);
|
||||
self.queue_status_notification(
|
||||
&format!("Drive {} Temperature", name),
|
||||
&prev_drive.temperature_status,
|
||||
¤t_drive.temperature_status,
|
||||
"Temperature threshold exceeded",
|
||||
);
|
||||
|
||||
// Filesystem usage
|
||||
for (mount, current_fs_status) in ¤t_drive.filesystem_statuses {
|
||||
if let Some(prev_fs_status) = prev_drive.filesystem_statuses.get(mount) {
|
||||
self.queue_status_notification(
|
||||
&format!("Filesystem {}", mount),
|
||||
prev_fs_status,
|
||||
current_fs_status,
|
||||
"Disk usage threshold exceeded",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Pools
|
||||
for (name, current_pool) in ¤t_status.pool_statuses {
|
||||
if let Some(prev_pool) = previous.pool_statuses.get(name) {
|
||||
self.queue_status_notification(
|
||||
&format!("Pool {} Health", name),
|
||||
&prev_pool.health_status,
|
||||
¤t_pool.health_status,
|
||||
"Pool health degraded",
|
||||
);
|
||||
self.queue_status_notification(
|
||||
&format!("Pool {} Usage", name),
|
||||
&prev_pool.usage_status,
|
||||
¤t_pool.usage_status,
|
||||
"Pool usage threshold exceeded",
|
||||
);
|
||||
|
||||
// Pool drives
|
||||
for (drive_name, current_pd) in ¤t_pool.drive_statuses {
|
||||
if let Some(prev_pd) = prev_pool.drive_statuses.get(drive_name) {
|
||||
self.queue_status_notification(
|
||||
&format!("Pool {} Drive {} Health", name, drive_name),
|
||||
&prev_pd.health_status,
|
||||
¤t_pd.health_status,
|
||||
"Pool drive health degraded",
|
||||
);
|
||||
self.queue_status_notification(
|
||||
&format!("Pool {} Drive {} Temperature", name, drive_name),
|
||||
&prev_pd.temperature_status,
|
||||
¤t_pd.temperature_status,
|
||||
"Pool drive temperature exceeded",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Services
|
||||
for (name, current_svc_status) in ¤t_status.service_statuses {
|
||||
if let Some(prev_svc_status) = previous.service_statuses.get(name) {
|
||||
self.queue_status_notification(
|
||||
&format!("Service {}", name),
|
||||
prev_svc_status,
|
||||
current_svc_status,
|
||||
"Service status changed",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Store current status for next comparison
|
||||
@@ -292,43 +469,44 @@ impl Agent {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check individual status change and send notification if degraded
|
||||
async fn check_and_notify_status_change(
|
||||
/// Queue a notification based on status change
|
||||
fn queue_status_notification(
|
||||
&mut self,
|
||||
component: &str,
|
||||
previous: &cm_dashboard_shared::Status,
|
||||
current: &cm_dashboard_shared::Status,
|
||||
details: String
|
||||
) -> Result<()> {
|
||||
details: &str,
|
||||
) {
|
||||
use cm_dashboard_shared::Status;
|
||||
|
||||
// Only notify on status degradation (OK → Warning/Critical, Warning → Critical)
|
||||
let should_notify = match (previous, current) {
|
||||
(Status::Ok, Status::Warning) => true,
|
||||
(Status::Ok, Status::Critical) => true,
|
||||
(Status::Warning, Status::Critical) => true,
|
||||
_ => false,
|
||||
};
|
||||
// Check for degradation (alert)
|
||||
let is_alert = matches!(
|
||||
(previous, current),
|
||||
(Status::Ok, Status::Warning)
|
||||
| (Status::Ok, Status::Critical)
|
||||
| (Status::Warning, Status::Critical)
|
||||
);
|
||||
|
||||
if should_notify {
|
||||
let subject = format!("{} {} Alert", self.hostname, component);
|
||||
let body = format!(
|
||||
"Alert: {} status changed from {:?} to {:?}\n\nDetails: {}\n\nTime: {}",
|
||||
component,
|
||||
previous,
|
||||
current,
|
||||
details,
|
||||
chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
|
||||
// Check for recovery
|
||||
let is_recovery = matches!(
|
||||
(previous, current),
|
||||
(Status::Warning, Status::Ok)
|
||||
| (Status::Critical, Status::Ok)
|
||||
| (Status::Critical, Status::Warning)
|
||||
);
|
||||
|
||||
if is_alert {
|
||||
info!(
|
||||
"Alert: {} - {:?} → {:?}",
|
||||
component, previous, current
|
||||
);
|
||||
|
||||
info!("Sending notification: {} - {:?} → {:?}", component, previous, current);
|
||||
|
||||
if let Err(e) = self.notification_manager.send_direct_email(&subject, &body).await {
|
||||
error!("Failed to send notification for {}: {}", component, e);
|
||||
}
|
||||
self.notification_manager.queue_alert(component, previous, current, details);
|
||||
} else if is_recovery {
|
||||
info!(
|
||||
"Recovery: {} - {:?} → {:?}",
|
||||
component, previous, current
|
||||
);
|
||||
self.notification_manager.queue_recovery(component, previous, current, details);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
@@ -953,15 +953,21 @@ impl SystemdCollector {
|
||||
"-s",
|
||||
"--max-time",
|
||||
"4",
|
||||
"https://ifconfig.me"
|
||||
"https://1.1.1.1/cdn-cgi/trace"
|
||||
])
|
||||
.output()
|
||||
.ok()?;
|
||||
|
||||
if output.status.success() {
|
||||
let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
|
||||
if !ip.is_empty() && ip.contains('.') {
|
||||
return Some(ip);
|
||||
let response = String::from_utf8_lossy(&output.stdout);
|
||||
// Parse "ip=x.x.x.x" from the response
|
||||
for line in response.lines() {
|
||||
if let Some(ip) = line.strip_prefix("ip=") {
|
||||
let ip = ip.trim().to_string();
|
||||
if !ip.is_empty() {
|
||||
return Some(ip);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -141,8 +141,23 @@ pub struct NotificationConfig {
|
||||
pub from_email: String,
|
||||
pub to_email: String,
|
||||
pub rate_limit_minutes: u64,
|
||||
/// Whether to send notifications on warning status
|
||||
#[serde(default = "default_true")]
|
||||
pub trigger_on_warnings: bool,
|
||||
/// Whether to send notifications on failure/critical status
|
||||
#[serde(default = "default_true")]
|
||||
pub trigger_on_failures: bool,
|
||||
/// Only send recovery notification when all components are OK
|
||||
#[serde(default)]
|
||||
pub recovery_requires_all_ok: bool,
|
||||
/// Suppress individual recovery notifications (only notify on full recovery)
|
||||
#[serde(default)]
|
||||
pub suppress_individual_recoveries: bool,
|
||||
/// Email notification batching interval in seconds (default: 60)
|
||||
pub aggregation_interval_seconds: u64,
|
||||
/// How often to check for status changes in seconds (default: 30)
|
||||
#[serde(default = "default_check_interval_seconds")]
|
||||
pub check_interval_seconds: u64,
|
||||
/// List of metric names to exclude from email notifications
|
||||
#[serde(default)]
|
||||
pub exclude_email_metrics: Vec<String>,
|
||||
@@ -151,6 +166,14 @@ pub struct NotificationConfig {
|
||||
pub maintenance_mode_file: String,
|
||||
}
|
||||
|
||||
fn default_true() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn default_check_interval_seconds() -> u64 {
|
||||
30
|
||||
}
|
||||
|
||||
|
||||
fn default_heartbeat_interval_seconds() -> u64 {
|
||||
5
|
||||
|
||||
@@ -1,60 +1,314 @@
|
||||
use crate::config::NotificationConfig;
|
||||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use cm_dashboard_shared::Status;
|
||||
use lettre::transport::smtp::SmtpTransport;
|
||||
use lettre::{Message, Transport};
|
||||
use std::collections::HashMap;
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
/// Manages notifications
|
||||
/// Manages notifications with rate limiting and aggregation
|
||||
pub struct NotificationManager {
|
||||
config: NotificationConfig,
|
||||
/// Last notification time per component for rate limiting
|
||||
last_notification: HashMap<String, Instant>,
|
||||
/// Pending notifications for aggregation
|
||||
pending_notifications: Vec<PendingNotification>,
|
||||
/// Pending recovery notifications (held until all OK if configured)
|
||||
pending_recoveries: Vec<PendingNotification>,
|
||||
/// Last aggregation flush time
|
||||
last_aggregation_flush: Option<Instant>,
|
||||
/// Track components currently in alert state
|
||||
components_in_alert: HashMap<String, Status>,
|
||||
}
|
||||
|
||||
/// A pending notification waiting to be aggregated
|
||||
#[derive(Debug, Clone)]
|
||||
struct PendingNotification {
|
||||
component: String,
|
||||
previous_status: String,
|
||||
current_status: String,
|
||||
details: String,
|
||||
timestamp: chrono::DateTime<Utc>,
|
||||
is_recovery: bool,
|
||||
}
|
||||
|
||||
impl NotificationManager {
|
||||
pub fn new(config: &NotificationConfig, _hostname: &str) -> Result<Self> {
|
||||
Ok(Self {
|
||||
config: config.clone(),
|
||||
last_notification: HashMap::new(),
|
||||
pending_notifications: Vec::new(),
|
||||
pending_recoveries: Vec::new(),
|
||||
last_aggregation_flush: None,
|
||||
components_in_alert: HashMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn send_direct_email(&mut self, subject: &str, body: &str) -> Result<()> {
|
||||
/// Check if a component is rate limited
|
||||
fn is_rate_limited(&self, component: &str) -> bool {
|
||||
if self.config.rate_limit_minutes == 0 {
|
||||
return false;
|
||||
}
|
||||
if let Some(last_time) = self.last_notification.get(component) {
|
||||
let rate_limit = Duration::from_secs(self.config.rate_limit_minutes * 60);
|
||||
last_time.elapsed() < rate_limit
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Queue a degradation notification (Ok→Warning, Ok→Critical, Warning→Critical)
|
||||
pub fn queue_alert(
|
||||
&mut self,
|
||||
component: &str,
|
||||
previous: &Status,
|
||||
current: &Status,
|
||||
details: &str,
|
||||
) {
|
||||
// Check if this status type should trigger notifications
|
||||
// Only Warning and Critical trigger notifications (not Inactive)
|
||||
let should_notify = match current {
|
||||
Status::Warning => self.config.trigger_on_warnings,
|
||||
Status::Critical => self.config.trigger_on_failures,
|
||||
_ => false,
|
||||
};
|
||||
|
||||
if !should_notify {
|
||||
debug!(
|
||||
"Notification for {} suppressed (trigger_on_warnings={}, trigger_on_failures={})",
|
||||
component, self.config.trigger_on_warnings, self.config.trigger_on_failures
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Check rate limit
|
||||
if self.is_rate_limited(component) {
|
||||
debug!(
|
||||
"Notification for {} rate limited (limit: {} min)",
|
||||
component, self.config.rate_limit_minutes
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Check exclusions
|
||||
if self.config.exclude_email_metrics.iter().any(|e| component.contains(e)) {
|
||||
debug!("Notification for {} excluded by config", component);
|
||||
return;
|
||||
}
|
||||
|
||||
// Track this component as in alert state
|
||||
self.components_in_alert.insert(component.to_string(), *current);
|
||||
|
||||
self.pending_notifications.push(PendingNotification {
|
||||
component: component.to_string(),
|
||||
previous_status: format!("{:?}", previous),
|
||||
current_status: format!("{:?}", current),
|
||||
details: details.to_string(),
|
||||
timestamp: Utc::now(),
|
||||
is_recovery: false,
|
||||
});
|
||||
|
||||
// Update rate limit tracker
|
||||
self.last_notification.insert(component.to_string(), Instant::now());
|
||||
|
||||
debug!(
|
||||
"Queued alert for {}: {:?} -> {:?}",
|
||||
component, previous, current
|
||||
);
|
||||
}
|
||||
|
||||
/// Queue a recovery notification (Warning→Ok, Critical→Ok, Critical→Warning)
|
||||
pub fn queue_recovery(
|
||||
&mut self,
|
||||
component: &str,
|
||||
previous: &Status,
|
||||
current: &Status,
|
||||
details: &str,
|
||||
) {
|
||||
// Remove from alert tracking
|
||||
self.components_in_alert.remove(component);
|
||||
|
||||
// Check if individual recoveries are suppressed
|
||||
if self.config.suppress_individual_recoveries {
|
||||
debug!(
|
||||
"Individual recovery for {} suppressed by config",
|
||||
component
|
||||
);
|
||||
|
||||
// Store recovery for potential batch notification
|
||||
self.pending_recoveries.push(PendingNotification {
|
||||
component: component.to_string(),
|
||||
previous_status: format!("{:?}", previous),
|
||||
current_status: format!("{:?}", current),
|
||||
details: details.to_string(),
|
||||
timestamp: Utc::now(),
|
||||
is_recovery: true,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
// Check exclusions
|
||||
if self.config.exclude_email_metrics.iter().any(|e| component.contains(e)) {
|
||||
debug!("Recovery notification for {} excluded by config", component);
|
||||
return;
|
||||
}
|
||||
|
||||
self.pending_notifications.push(PendingNotification {
|
||||
component: component.to_string(),
|
||||
previous_status: format!("{:?}", previous),
|
||||
current_status: format!("{:?}", current),
|
||||
details: details.to_string(),
|
||||
timestamp: Utc::now(),
|
||||
is_recovery: true,
|
||||
});
|
||||
|
||||
debug!(
|
||||
"Queued recovery for {}: {:?} -> {:?}",
|
||||
component, previous, current
|
||||
);
|
||||
}
|
||||
|
||||
/// Check if all components have recovered (no components in alert state)
|
||||
pub fn all_components_ok(&self) -> bool {
|
||||
self.components_in_alert.is_empty()
|
||||
}
|
||||
|
||||
/// Flush suppressed recovery notifications when all components are OK
|
||||
pub fn flush_recoveries_if_all_ok(&mut self) {
|
||||
if !self.config.recovery_requires_all_ok || self.all_components_ok() {
|
||||
if !self.pending_recoveries.is_empty() {
|
||||
info!("All components recovered, sending batch recovery notification");
|
||||
self.pending_notifications.append(&mut self.pending_recoveries);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if it's time to flush aggregated notifications
|
||||
pub fn should_flush(&self) -> bool {
|
||||
if self.pending_notifications.is_empty() {
|
||||
return false;
|
||||
}
|
||||
|
||||
match self.last_aggregation_flush {
|
||||
None => true, // First flush
|
||||
Some(last_flush) => {
|
||||
let aggregation_interval =
|
||||
Duration::from_secs(self.config.aggregation_interval_seconds);
|
||||
last_flush.elapsed() >= aggregation_interval
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Flush pending notifications as a single aggregated email
|
||||
pub async fn flush_notifications(&mut self) -> Result<()> {
|
||||
if self.pending_notifications.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if !self.config.enabled {
|
||||
self.pending_notifications.clear();
|
||||
self.last_aggregation_flush = Some(Instant::now());
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if self.is_maintenance_mode() {
|
||||
debug!("Maintenance mode active, suppressing email notification");
|
||||
debug!("Maintenance mode active, suppressing aggregated notifications");
|
||||
self.pending_notifications.clear();
|
||||
self.last_aggregation_flush = Some(Instant::now());
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let hostname = gethostname::gethostname()
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
let hostname = gethostname::gethostname().to_string_lossy().to_string();
|
||||
|
||||
// Build aggregated email
|
||||
let notification_count = self.pending_notifications.len();
|
||||
let alert_count = self.pending_notifications.iter().filter(|n| !n.is_recovery).count();
|
||||
let recovery_count = self.pending_notifications.iter().filter(|n| n.is_recovery).count();
|
||||
|
||||
let subject = if notification_count == 1 {
|
||||
let n = &self.pending_notifications[0];
|
||||
if n.is_recovery {
|
||||
format!("[{}] {} Recovered: {}", hostname, n.component, n.current_status)
|
||||
} else {
|
||||
format!("[{}] {} Alert: {}", hostname, n.component, n.current_status)
|
||||
}
|
||||
} else if recovery_count > 0 && alert_count == 0 {
|
||||
format!("[{}] {} Components Recovered", hostname, recovery_count)
|
||||
} else if alert_count > 0 && recovery_count == 0 {
|
||||
format!("[{}] {} Status Alerts", hostname, alert_count)
|
||||
} else {
|
||||
format!("[{}] {} Alerts, {} Recoveries", hostname, alert_count, recovery_count)
|
||||
};
|
||||
|
||||
let mut body = String::new();
|
||||
body.push_str(&format!("Status notifications for host: {}\n", hostname));
|
||||
body.push_str(&format!("Time: {}\n\n", Utc::now().format("%Y-%m-%d %H:%M:%S UTC")));
|
||||
|
||||
// Group alerts and recoveries
|
||||
let alerts: Vec<_> = self.pending_notifications.iter().filter(|n| !n.is_recovery).collect();
|
||||
let recoveries: Vec<_> = self.pending_notifications.iter().filter(|n| n.is_recovery).collect();
|
||||
|
||||
if !alerts.is_empty() {
|
||||
body.push_str("=== ALERTS ===\n\n");
|
||||
for notification in &alerts {
|
||||
body.push_str(&format!(
|
||||
"• {} : {} → {}\n {}\n ({})\n\n",
|
||||
notification.component,
|
||||
notification.previous_status,
|
||||
notification.current_status,
|
||||
notification.details,
|
||||
notification.timestamp.format("%H:%M:%S UTC")
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
if !recoveries.is_empty() {
|
||||
body.push_str("=== RECOVERIES ===\n\n");
|
||||
for notification in &recoveries {
|
||||
body.push_str(&format!(
|
||||
"• {} : {} → {}\n {}\n ({})\n\n",
|
||||
notification.component,
|
||||
notification.previous_status,
|
||||
notification.current_status,
|
||||
notification.details,
|
||||
notification.timestamp.format("%H:%M:%S UTC")
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
body.push_str("--\nCM Dashboard Agent");
|
||||
|
||||
// Send the aggregated email
|
||||
let from_email = self.config.from_email.replace("{hostname}", &hostname);
|
||||
|
||||
let email_body = format!(
|
||||
"{}\n\n--\nCM Dashboard Agent\nGenerated at {}",
|
||||
body,
|
||||
Utc::now().format("%Y-%m-%d %H:%M:%S %Z")
|
||||
);
|
||||
|
||||
let email = Message::builder()
|
||||
.from(from_email.parse()?)
|
||||
.to(self.config.to_email.parse()?)
|
||||
.subject(subject)
|
||||
.body(email_body)?;
|
||||
.subject(&subject)
|
||||
.body(body)?;
|
||||
|
||||
let mailer = SmtpTransport::unencrypted_localhost();
|
||||
let mailer = SmtpTransport::builder_dangerous(&self.config.smtp_host)
|
||||
.port(self.config.smtp_port)
|
||||
.build();
|
||||
|
||||
match mailer.send(&email) {
|
||||
Ok(_) => info!("Direct email sent successfully: {}", subject),
|
||||
Ok(_) => {
|
||||
info!(
|
||||
"Sent aggregated notification email with {} alerts",
|
||||
notification_count
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to send email: {}", e);
|
||||
error!("Failed to send aggregated email: {}", e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
self.pending_notifications.clear();
|
||||
self.last_aggregation_flush = Some(Instant::now());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.276"
|
||||
version = "0.1.277"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -514,14 +514,6 @@ impl TuiApp {
|
||||
true // No host selected is considered offline
|
||||
};
|
||||
|
||||
// If host is offline, render wake-up message instead of panels
|
||||
if current_host_offline {
|
||||
self.render_offline_host_message(frame, main_chunks[1]);
|
||||
self.render_btop_title(frame, main_chunks[0], metric_store);
|
||||
self.render_statusbar(frame, main_chunks[2], metric_store);
|
||||
return (main_chunks[0], Rect::default(), Rect::default()); // Return title area and empty areas when offline
|
||||
}
|
||||
|
||||
// Left side: system panel only (full height)
|
||||
let left_chunks = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
@@ -531,9 +523,13 @@ impl TuiApp {
|
||||
// Render title bar
|
||||
self.render_btop_title(frame, main_chunks[0], metric_store);
|
||||
|
||||
// Render system panel
|
||||
// Render system panel or offline message in system panel area
|
||||
let system_area = left_chunks[0];
|
||||
self.render_system_panel(frame, system_area, metric_store);
|
||||
if current_host_offline {
|
||||
self.render_offline_host_message(frame, system_area);
|
||||
} else {
|
||||
self.render_system_panel(frame, system_area, metric_store);
|
||||
}
|
||||
|
||||
// Render right panel with tabs (Services | Hosts)
|
||||
let services_area = content_chunks[1];
|
||||
@@ -829,9 +825,8 @@ impl TuiApp {
|
||||
}
|
||||
|
||||
|
||||
/// Render offline host message with wake-up option
|
||||
/// Render offline host message in system panel area
|
||||
fn render_offline_host_message(&self, frame: &mut Frame, area: Rect) {
|
||||
use ratatui::layout::Alignment;
|
||||
use ratatui::style::Modifier;
|
||||
use ratatui::text::{Line, Span};
|
||||
use ratatui::widgets::{Block, Borders, Paragraph};
|
||||
@@ -849,8 +844,9 @@ impl TuiApp {
|
||||
|
||||
// Create message content
|
||||
let mut lines = vec![
|
||||
Line::from(""),
|
||||
Line::from(Span::styled(
|
||||
format!("Host '{}' is offline", hostname),
|
||||
format!(" Host '{}' is offline", hostname),
|
||||
Style::default().fg(Theme::muted_text()).add_modifier(Modifier::BOLD),
|
||||
)),
|
||||
Line::from(""),
|
||||
@@ -858,46 +854,26 @@ impl TuiApp {
|
||||
|
||||
if has_mac {
|
||||
lines.push(Line::from(Span::styled(
|
||||
"Press 'w' to wake up host",
|
||||
Style::default().fg(Theme::primary_text()).add_modifier(Modifier::BOLD),
|
||||
" Press 'w' to wake up host",
|
||||
Style::default().fg(Theme::primary_text()),
|
||||
)));
|
||||
} else {
|
||||
lines.push(Line::from(Span::styled(
|
||||
"No MAC address configured - cannot wake up",
|
||||
" No MAC address configured",
|
||||
Style::default().fg(Theme::muted_text()),
|
||||
)));
|
||||
}
|
||||
|
||||
// Create centered message
|
||||
// Render message in system panel with border
|
||||
let message = Paragraph::new(lines)
|
||||
.block(Block::default()
|
||||
.borders(Borders::ALL)
|
||||
.border_style(Style::default().fg(Theme::muted_text()))
|
||||
.title(" Offline Host ")
|
||||
.title(" Offline ")
|
||||
.title_style(Style::default().fg(Theme::muted_text()).add_modifier(Modifier::BOLD)))
|
||||
.style(Style::default().bg(Theme::background()).fg(Theme::primary_text()))
|
||||
.alignment(Alignment::Center);
|
||||
.style(Style::default().bg(Theme::background()).fg(Theme::primary_text()));
|
||||
|
||||
// Center the message in the available area
|
||||
let popup_area = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([
|
||||
Constraint::Percentage(40),
|
||||
Constraint::Length(6),
|
||||
Constraint::Percentage(40),
|
||||
])
|
||||
.split(area)[1];
|
||||
|
||||
let popup_area = ratatui::layout::Layout::default()
|
||||
.direction(Direction::Horizontal)
|
||||
.constraints([
|
||||
Constraint::Percentage(25),
|
||||
Constraint::Percentage(50),
|
||||
Constraint::Percentage(25),
|
||||
])
|
||||
.split(popup_area)[1];
|
||||
|
||||
frame.render_widget(message, popup_area);
|
||||
frame.render_widget(message, area);
|
||||
}
|
||||
|
||||
/// Parse MAC address string (e.g., "AA:BB:CC:DD:EE:FF") to [u8; 6]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.275"
|
||||
version = "0.1.277"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
Reference in New Issue
Block a user