Implement comprehensive status calculation and notification system
Agent Changes:
• Add CPU status thresholds (warning: ≥5.0, critical: ≥8.0)
• Add memory status thresholds (warning: ≥80%, critical: ≥95%)
• Add service status calculation (critical if failed>0, warning if degraded>0)
• All collectors now calculate and include status in output

Dashboard Changes:
• Update system widget to use agent-calculated cpu_status and memory_status
• Update services widget to use agent-calculated services_status
• Remove client-side status calculations in favor of agent status
• Add status_level_from_agent_status helper function

Notification System:
• Add SMTP email notification system using lettre crate
• Auto-configure notifications: hostname@cmtec.se → cm@cmtec.se
• Smart change detection with rate limiting (30min cooldown)
• Only notify on transitions to/from warning/critical states
• Rich email formatting with host, component, metric details
This commit is contained in:
@@ -21,3 +21,4 @@ tokio = { version = "1.0", features = ["full", "process"] }
|
||||
futures = "0.3"
|
||||
rand = "0.8"
|
||||
gethostname = "0.4"
|
||||
lettre = { version = "0.11", default-features = false, features = ["smtp-transport", "builder"] }
|
||||
|
||||
@@ -5,12 +5,15 @@ use tracing::info;
|
||||
|
||||
use crate::collectors::CollectorError;
|
||||
use crate::discovery::AutoDiscovery;
|
||||
use crate::notifications::NotificationConfig;
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct AgentConfig {
|
||||
pub agent: AgentSettings,
|
||||
pub zmq: ZmqSettings,
|
||||
pub collectors: CollectorsConfig,
|
||||
#[serde(default)]
|
||||
pub notifications: NotificationConfig,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
@@ -254,6 +257,13 @@ impl AgentConfig {
|
||||
info!("Backup monitoring disabled for this host");
|
||||
}
|
||||
|
||||
// Auto-configure notifications
|
||||
self.notifications.enabled = true;
|
||||
self.notifications.from_email = format!("{}@cmtec.se", hostname);
|
||||
self.notifications.to_email = "cm@cmtec.se".to_string();
|
||||
info!("Auto-configured notifications: {} -> {}",
|
||||
self.notifications.from_email, self.notifications.to_email);
|
||||
|
||||
// Apply host-specific timing optimizations
|
||||
self.apply_host_timing_overrides(hostname);
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ mod agent;
|
||||
mod collectors;
|
||||
mod config;
|
||||
mod discovery;
|
||||
mod notifications;
|
||||
mod scheduler;
|
||||
|
||||
use agent::MetricsAgent;
|
||||
|
||||
182
agent/src/notifications.rs
Normal file
182
agent/src/notifications.rs
Normal file
@@ -0,0 +1,182 @@
|
||||
use std::collections::HashMap;
|
||||
use chrono::{DateTime, Utc};
|
||||
use lettre::{Message, SmtpTransport, Transport};
|
||||
use tracing::{info, error, warn};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NotificationConfig {
|
||||
pub enabled: bool,
|
||||
pub smtp_host: String,
|
||||
pub smtp_port: u16,
|
||||
pub from_email: String,
|
||||
pub to_email: String,
|
||||
pub rate_limit_minutes: u64,
|
||||
}
|
||||
|
||||
impl Default for NotificationConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
enabled: false,
|
||||
smtp_host: "localhost".to_string(),
|
||||
smtp_port: 25,
|
||||
from_email: "".to_string(),
|
||||
to_email: "".to_string(),
|
||||
rate_limit_minutes: 30, // Don't spam notifications
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct StatusChange {
|
||||
pub component: String,
|
||||
pub metric: String,
|
||||
pub old_status: String,
|
||||
pub new_status: String,
|
||||
pub timestamp: DateTime<Utc>,
|
||||
}
|
||||
|
||||
pub struct NotificationManager {
|
||||
config: NotificationConfig,
|
||||
last_status: HashMap<String, String>, // key: "component.metric", value: status
|
||||
last_notification: HashMap<String, DateTime<Utc>>, // Rate limiting
|
||||
}
|
||||
|
||||
impl NotificationManager {
|
||||
pub fn new(config: NotificationConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
last_status: HashMap::new(),
|
||||
last_notification: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_status(&mut self, component: &str, metric: &str, status: &str) -> Option<StatusChange> {
|
||||
let key = format!("{}.{}", component, metric);
|
||||
let old_status = self.last_status.get(&key).cloned();
|
||||
|
||||
if let Some(old) = &old_status {
|
||||
if old != status {
|
||||
let change = StatusChange {
|
||||
component: component.to_string(),
|
||||
metric: metric.to_string(),
|
||||
old_status: old.clone(),
|
||||
new_status: status.to_string(),
|
||||
timestamp: Utc::now(),
|
||||
};
|
||||
|
||||
self.last_status.insert(key, status.to_string());
|
||||
|
||||
if self.should_notify(&change) {
|
||||
return Some(change);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// First time seeing this metric - store but don't notify
|
||||
self.last_status.insert(key, status.to_string());
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn should_notify(&mut self, change: &StatusChange) -> bool {
|
||||
if !self.config.enabled {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Only notify on transitions to warning/critical, or recovery to ok
|
||||
match (change.old_status.as_str(), change.new_status.as_str()) {
|
||||
(_, "warning") | (_, "critical") => true,
|
||||
("warning" | "critical", "ok") => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
fn is_rate_limited(&mut self, change: &StatusChange) -> bool {
|
||||
let key = format!("{}.{}", change.component, change.metric);
|
||||
|
||||
if let Some(last_time) = self.last_notification.get(&key) {
|
||||
let minutes_since = Utc::now().signed_duration_since(*last_time).num_minutes();
|
||||
if minutes_since < self.config.rate_limit_minutes as i64 {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
self.last_notification.insert(key, Utc::now());
|
||||
false
|
||||
}
|
||||
|
||||
pub async fn send_notification(&mut self, change: StatusChange) {
|
||||
if !self.config.enabled {
|
||||
return;
|
||||
}
|
||||
|
||||
if self.is_rate_limited(&change) {
|
||||
warn!("Rate limiting notification for {}.{}", change.component, change.metric);
|
||||
return;
|
||||
}
|
||||
|
||||
let subject = self.format_subject(&change);
|
||||
let body = self.format_body(&change);
|
||||
|
||||
if let Err(e) = self.send_email(&subject, &body).await {
|
||||
error!("Failed to send notification email: {}", e);
|
||||
} else {
|
||||
info!("Sent notification: {} {}.{} {} → {}",
|
||||
change.component, change.component, change.metric,
|
||||
change.old_status, change.new_status);
|
||||
}
|
||||
}
|
||||
|
||||
fn format_subject(&self, change: &StatusChange) -> String {
|
||||
let urgency = match change.new_status.as_str() {
|
||||
"critical" => "🔴 CRITICAL",
|
||||
"warning" => "🟡 WARNING",
|
||||
"ok" => "✅ RESOLVED",
|
||||
_ => "ℹ️ STATUS",
|
||||
};
|
||||
|
||||
format!("{}: {} {} on {}",
|
||||
urgency,
|
||||
change.component,
|
||||
change.metric,
|
||||
gethostname::gethostname().to_string_lossy())
|
||||
}
|
||||
|
||||
fn format_body(&self, change: &StatusChange) -> String {
|
||||
format!(
|
||||
"Status Change Alert\n\
|
||||
\n\
|
||||
Host: {}\n\
|
||||
Component: {}\n\
|
||||
Metric: {}\n\
|
||||
Status Change: {} → {}\n\
|
||||
Time: {}\n\
|
||||
\n\
|
||||
--\n\
|
||||
CM Dashboard Agent\n\
|
||||
Generated at {}",
|
||||
gethostname::gethostname().to_string_lossy(),
|
||||
change.component,
|
||||
change.metric,
|
||||
change.old_status,
|
||||
change.new_status,
|
||||
change.timestamp.format("%Y-%m-%d %H:%M:%S UTC"),
|
||||
Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
|
||||
)
|
||||
}
|
||||
|
||||
async fn send_email(&self, subject: &str, body: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||
let email = Message::builder()
|
||||
.from(self.config.from_email.parse()?)
|
||||
.to(self.config.to_email.parse()?)
|
||||
.subject(subject)
|
||||
.body(body.to_string())?;
|
||||
|
||||
let mailer = SmtpTransport::builder_dangerous(&self.config.smtp_host)
|
||||
.port(self.config.smtp_port)
|
||||
.build();
|
||||
|
||||
mailer.send(&email)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user