Implement comprehensive status calculation and notification system

Agent Changes:
• Add CPU status thresholds (warning: ≥5.0, critical: ≥8.0)
• Add memory status thresholds (warning: ≥80%, critical: ≥95%)
• Add service status calculation (critical if failed>0, warning if degraded>0)
• All collectors now calculate and include status in output

Dashboard Changes:
• Update system widget to use agent-calculated cpu_status and memory_status
• Update services widget to use agent-calculated services_status
• Remove client-side status calculations in favor of agent status
• Add status_level_from_agent_status helper function

Notification System:
• Add SMTP email notification system using lettre crate
• Auto-configure notifications: hostname@cmtec.se → cm@cmtec.se
• Smart change detection with rate limiting (30min cooldown)
• Only notify on transitions to/from warning/critical states
• Rich email formatting with host, component, metric details
This commit is contained in:
2025-10-12 20:04:40 +02:00
parent 59bc3adad5
commit d08d8f306a
5 changed files with 609 additions and 0 deletions

View File

@@ -21,3 +21,4 @@ tokio = { version = "1.0", features = ["full", "process"] }
futures = "0.3"
rand = "0.8"
gethostname = "0.4"
lettre = { version = "0.11", default-features = false, features = ["smtp-transport", "builder"] }

View File

@@ -5,12 +5,15 @@ use tracing::info;
use crate::collectors::CollectorError;
use crate::discovery::AutoDiscovery;
use crate::notifications::NotificationConfig;
/// Top-level agent configuration, (de)serialized through serde.
///
/// Groups the agent, ZMQ, collector and notification settings into one value.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct AgentConfig {
/// General agent settings.
pub agent: AgentSettings,
/// ZMQ settings.
pub zmq: ZmqSettings,
/// Collector configuration.
pub collectors: CollectorsConfig,
/// Email notification settings. Because of `#[serde(default)]`, a config
/// without this section deserializes with `NotificationConfig::default()`.
#[serde(default)]
pub notifications: NotificationConfig,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
@@ -254,6 +257,13 @@ impl AgentConfig {
info!("Backup monitoring disabled for this host");
}
// Auto-configure notifications
self.notifications.enabled = true;
self.notifications.from_email = format!("{}@cmtec.se", hostname);
self.notifications.to_email = "cm@cmtec.se".to_string();
info!("Auto-configured notifications: {} -> {}",
self.notifications.from_email, self.notifications.to_email);
// Apply host-specific timing optimizations
self.apply_host_timing_overrides(hostname);

View File

@@ -9,6 +9,7 @@ mod agent;
mod collectors;
mod config;
mod discovery;
mod notifications;
mod scheduler;
use agent::MetricsAgent;

182
agent/src/notifications.rs Normal file
View File

@@ -0,0 +1,182 @@
use std::collections::HashMap;
use chrono::{DateTime, Utc};
use lettre::{Message, SmtpTransport, Transport};
use tracing::{info, error, warn};
/// Settings that control status-change email notifications.
///
/// NOTE(review): `AgentConfig` derives `Deserialize`/`Serialize` and embeds this
/// type behind `#[serde(default)]`, but this struct derives neither trait —
/// confirm that serde implementations are provided for it.
#[derive(Debug, Clone)]
pub struct NotificationConfig {
    /// Master switch; when `false` no notification is produced or sent.
    pub enabled: bool,
    /// Host name of the SMTP server used for delivery.
    pub smtp_host: String,
    /// TCP port of the SMTP server.
    pub smtp_port: u16,
    /// Sender address placed in the `From` header.
    pub from_email: String,
    /// Recipient address placed in the `To` header.
    pub to_email: String,
    /// Cooldown, in minutes, between two notifications for the same
    /// `component.metric` key.
    pub rate_limit_minutes: u64,
}

impl Default for NotificationConfig {
    /// Notifications are off by default and target a local SMTP server on
    /// port 25, with empty sender/recipient addresses.
    fn default() -> Self {
        NotificationConfig {
            // Keep the inbox quiet: at most one mail per key every 30 minutes.
            rate_limit_minutes: 30,
            smtp_port: 25,
            smtp_host: String::from("localhost"),
            from_email: String::new(),
            to_email: String::new(),
            enabled: false,
        }
    }
}
/// A single observed transition of one metric from one status string to another.
#[derive(Debug, Clone, PartialEq)]
pub struct StatusChange {
/// Component the metric belongs to (first half of the `component.metric` key).
pub component: String,
/// Metric name (second half of the `component.metric` key).
pub metric: String,
/// Status that was recorded before the change.
pub old_status: String,
/// Status that replaced `old_status`.
pub new_status: String,
/// UTC time at which the change was detected.
pub timestamp: DateTime<Utc>,
}
/// Tracks the last known status of every metric and when each one last
/// triggered a notification, and sends email alerts on status changes.
pub struct NotificationManager {
// Delivery settings (enable flag, SMTP endpoint, addresses, cooldown).
config: NotificationConfig,
last_status: HashMap<String, String>, // key: "component.metric", value: status
last_notification: HashMap<String, DateTime<Utc>>, // Rate limiting
}
impl NotificationManager {
pub fn new(config: NotificationConfig) -> Self {
Self {
config,
last_status: HashMap::new(),
last_notification: HashMap::new(),
}
}
pub fn update_status(&mut self, component: &str, metric: &str, status: &str) -> Option<StatusChange> {
let key = format!("{}.{}", component, metric);
let old_status = self.last_status.get(&key).cloned();
if let Some(old) = &old_status {
if old != status {
let change = StatusChange {
component: component.to_string(),
metric: metric.to_string(),
old_status: old.clone(),
new_status: status.to_string(),
timestamp: Utc::now(),
};
self.last_status.insert(key, status.to_string());
if self.should_notify(&change) {
return Some(change);
}
}
} else {
// First time seeing this metric - store but don't notify
self.last_status.insert(key, status.to_string());
}
None
}
fn should_notify(&mut self, change: &StatusChange) -> bool {
if !self.config.enabled {
return false;
}
// Only notify on transitions to warning/critical, or recovery to ok
match (change.old_status.as_str(), change.new_status.as_str()) {
(_, "warning") | (_, "critical") => true,
("warning" | "critical", "ok") => true,
_ => false,
}
}
fn is_rate_limited(&mut self, change: &StatusChange) -> bool {
let key = format!("{}.{}", change.component, change.metric);
if let Some(last_time) = self.last_notification.get(&key) {
let minutes_since = Utc::now().signed_duration_since(*last_time).num_minutes();
if minutes_since < self.config.rate_limit_minutes as i64 {
return true;
}
}
self.last_notification.insert(key, Utc::now());
false
}
pub async fn send_notification(&mut self, change: StatusChange) {
if !self.config.enabled {
return;
}
if self.is_rate_limited(&change) {
warn!("Rate limiting notification for {}.{}", change.component, change.metric);
return;
}
let subject = self.format_subject(&change);
let body = self.format_body(&change);
if let Err(e) = self.send_email(&subject, &body).await {
error!("Failed to send notification email: {}", e);
} else {
info!("Sent notification: {} {}.{} {} → {}",
change.component, change.component, change.metric,
change.old_status, change.new_status);
}
}
fn format_subject(&self, change: &StatusChange) -> String {
let urgency = match change.new_status.as_str() {
"critical" => "🔴 CRITICAL",
"warning" => "🟡 WARNING",
"ok" => "✅ RESOLVED",
_ => " STATUS",
};
format!("{}: {} {} on {}",
urgency,
change.component,
change.metric,
gethostname::gethostname().to_string_lossy())
}
fn format_body(&self, change: &StatusChange) -> String {
format!(
"Status Change Alert\n\
\n\
Host: {}\n\
Component: {}\n\
Metric: {}\n\
Status Change: {}{}\n\
Time: {}\n\
\n\
--\n\
CM Dashboard Agent\n\
Generated at {}",
gethostname::gethostname().to_string_lossy(),
change.component,
change.metric,
change.old_status,
change.new_status,
change.timestamp.format("%Y-%m-%d %H:%M:%S UTC"),
Utc::now().format("%Y-%m-%d %H:%M:%S UTC")
)
}
async fn send_email(&self, subject: &str, body: &str) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let email = Message::builder()
.from(self.config.from_email.parse()?)
.to(self.config.to_email.parse()?)
.subject(subject)
.body(body.to_string())?;
let mailer = SmtpTransport::builder_dangerous(&self.config.smtp_host)
.port(self.config.smtp_port)
.build();
mailer.send(&email)?;
Ok(())
}
}