From 66a79574e0f4380be907a0707dce65309eaacb37 Mon Sep 17 00:00:00 2001
From: Christoffer Martinsson
Date: Mon, 20 Oct 2025 14:32:44 +0200
Subject: [PATCH] Implement comprehensive monitoring improvements

- Add full email notifications with lettre and Stockholm timezone
- Add status persistence to prevent notification spam on restart
- Change nginx monitoring to check backend proxy_pass URLs instead of frontend domains
- Increase nginx site timeout to 10 seconds for backend health checks
- Fix cache intervals: disk (5min), backup (10min), systemd (30s), cpu/memory (5s)
- Remove rate limiting for immediate notifications on all status changes
- Store metric status in /var/lib/cm-dashboard/last-status.json
---
 agent/Cargo.toml                |   1 +
 agent/src/collectors/systemd.rs |  36 ++--
 agent/src/config/defaults.rs    |   5 +-
 agent/src/metrics/mod.rs        |  10 +-
 agent/src/notifications/mod.rs  | 303 ++++++++++++++++++++++++--------
 5 files changed, 260 insertions(+), 95 deletions(-)

diff --git a/agent/Cargo.toml b/agent/Cargo.toml
index e3138a1..4b99a72 100644
--- a/agent/Cargo.toml
+++ b/agent/Cargo.toml
@@ -17,6 +17,7 @@ tracing = { workspace = true }
 tracing-subscriber = { workspace = true }
 lettre = { workspace = true }
 gethostname = { workspace = true }
+chrono-tz = "0.8"
 toml = { workspace = true }
 async-trait = "0.1"
 reqwest = { version = "0.11", features = ["json", "blocking"] }
\ No newline at end of file
diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs
index f395568..8acf9c8 100644
--- a/agent/src/collectors/systemd.rs
+++ b/agent/src/collectors/systemd.rs
@@ -595,8 +595,8 @@ impl SystemdCollector {
 
         // Create HTTP client with timeouts (similar to legacy implementation)
         let client = reqwest::blocking::Client::builder()
-            .timeout(Duration::from_secs(5))
-            .connect_timeout(Duration::from_secs(2))
+            .timeout(Duration::from_secs(10))
+            .connect_timeout(Duration::from_secs(10))
             .redirect(reqwest::redirect::Policy::limited(10))
             .build()?;
 
@@ -739,12 +739,9 @@ impl SystemdCollector {
         while i < lines.len() {
             let line = lines[i].trim();
             if line.starts_with("server") && line.contains("{") {
-                debug!("Found server block at line {}", i);
-                if let Some(server_name) = self.parse_server_block(&lines, &mut i) {
-                    debug!("Extracted server name: {}", server_name);
-                    let url = format!("https://{}", server_name);
-                    // Use the full domain as the site name for clarity
-                    sites.push((server_name.clone(), url));
+                if let Some(proxy_url) = self.parse_server_block(&lines, &mut i) {
+                    let site_name = proxy_url.replace("http://", "").replace("https://", "");
+                    sites.push((site_name, proxy_url));
                 }
             }
             i += 1;
@@ -758,6 +755,7 @@ impl SystemdCollector {
     fn parse_server_block(&self, lines: &[&str], start_index: &mut usize) -> Option<String> {
         use tracing::debug;
         let mut server_names = Vec::new();
+        let mut proxy_pass_url = None;
         let mut has_redirect = false;
         let mut i = *start_index + 1;
         let mut brace_count = 1;
@@ -787,6 +785,17 @@ impl SystemdCollector {
                 }
             }
 
+            // Extract proxy_pass URL (backend IP:port)
+            if trimmed.starts_with("proxy_pass") {
+                if let Some(url_part) = trimmed.strip_prefix("proxy_pass") {
+                    let url_clean = url_part.trim().trim_end_matches(';');
+                    if !url_clean.is_empty() {
+                        proxy_pass_url = Some(url_clean.to_string());
+                        debug!("Found proxy_pass in block: {}", url_clean);
+                    }
+                }
+            }
+
             // Check for redirects (skip redirect-only servers)
             if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) {
                 has_redirect = true;
@@ -797,11 +806,12 @@ impl SystemdCollector {
 
         *start_index = i - 1;
 
-        // Only return hostnames that are not redirects and have actual content
-        if !server_names.is_empty() && !has_redirect {
-            Some(server_names[0].clone())
-        } else {
-            None
+        if let Some(proxy_url) = proxy_pass_url {
+            if !has_redirect {
+                return Some(proxy_url);
+            }
         }
+
+        None
     }
 }
diff --git a/agent/src/config/defaults.rs b/agent/src/config/defaults.rs
index 77c42ef..c0eab29 100644
--- a/agent/src/config/defaults.rs
+++ b/agent/src/config/defaults.rs
@@ -1,4 +1,4 @@
-// Collection intervals
+// Collection intervals
 pub const DEFAULT_COLLECTION_INTERVAL_SECONDS: u64 = 2;
 pub const DEFAULT_CPU_INTERVAL_SECONDS: u64 = 5;
 pub const DEFAULT_MEMORY_INTERVAL_SECONDS: u64 = 5;
@@ -46,10 +46,9 @@ pub const DEFAULT_SMART_WEAR_CRITICAL: f32 = 90.0;
 
 // Backup configuration
 pub const DEFAULT_BACKUP_MAX_AGE_HOURS: u64 = 48;
-
 // Notification configuration (from legacy)
 pub const DEFAULT_SMTP_HOST: &str = "localhost";
 pub const DEFAULT_SMTP_PORT: u16 = 25;
 pub const DEFAULT_FROM_EMAIL: &str = "{hostname}@cmtec.se";
 pub const DEFAULT_TO_EMAIL: &str = "cm@cmtec.se";
-pub const DEFAULT_NOTIFICATION_RATE_LIMIT_MINUTES: u64 = 30;
\ No newline at end of file
+pub const DEFAULT_NOTIFICATION_RATE_LIMIT_MINUTES: u64 = 0;
diff --git a/agent/src/metrics/mod.rs b/agent/src/metrics/mod.rs
index 6b78cce..8fb41a6 100644
--- a/agent/src/metrics/mod.rs
+++ b/agent/src/metrics/mod.rs
@@ -182,11 +182,13 @@ impl MetricCollectionManager {
         for collector in &self.collectors {
             let collector_name = collector.name();
 
-            // Determine cache interval for this collector type - ALL REALTIME FOR FAST UPDATES
+            // Determine cache interval for this collector type based on data volatility
             let cache_interval_secs = match collector_name {
-                "cpu" | "memory" | "disk" | "systemd" => 2, // All realtime for fast updates
-                "backup" => 10, // Backup metrics every 10 seconds for testing
-                _ => 2, // All realtime for fast updates
+                "cpu" | "memory" => 5, // Fast updates for volatile metrics
+                "systemd" => 30, // Service status changes less frequently
+                "disk" => 300, // SMART data changes very slowly (5 minutes)
+                "backup" => 600, // Backup status changes rarely (10 minutes)
+                _ => 30, // Default: moderate frequency
             };
 
             let should_collect =
diff --git a/agent/src/notifications/mod.rs b/agent/src/notifications/mod.rs
index 0eafed4..141dc1a 100644
--- a/agent/src/notifications/mod.rs
+++ b/agent/src/notifications/mod.rs
@@ -1,16 +1,29 @@
 use cm_dashboard_shared::Status;
 use std::collections::HashMap;
-use std::time::Instant;
-use tracing::{info, debug};
+use std::fs;
+use std::path::Path;
+use tracing::{debug, info, error, warn};
+use chrono::{DateTime, Utc};
+use chrono_tz::Europe::Stockholm;
+use lettre::{Message, SmtpTransport, Transport};
+use serde::{Serialize, Deserialize};
 
 use crate::config::NotificationConfig;
 
+/// Persisted status data
+#[derive(Debug, Clone, Serialize, Deserialize)]
+struct PersistedStatus {
+    metric_statuses: HashMap<String, Status>,
+    metric_details: HashMap<String, String>,
+}
+
 /// Manages status change tracking and notifications
 pub struct NotificationManager {
     config: NotificationConfig,
     hostname: String,
     metric_statuses: HashMap<String, Status>,
-    last_notification_times: HashMap<String, Instant>,
+    metric_details: HashMap<String, String>, // Store details for warning/critical states
+    status_file_path: String,
 }
 
 /// Status change information
@@ -19,129 +32,269 @@ pub struct StatusChange {
     pub metric_name: String,
     pub old_status: Status,
     pub new_status: Status,
-    pub timestamp: Instant,
+    pub timestamp: DateTime<Utc>,
+    pub details: Option<String>,
 }
 
 impl NotificationManager {
     pub fn new(config: &NotificationConfig, hostname: &str) -> Result<Self, anyhow::Error> {
         info!("Initializing notification manager for {}", hostname);
+
+        let status_file_path = "/var/lib/cm-dashboard/last-status.json".to_string();
+        // Create directory if it doesn't exist
+        if let Some(parent) = Path::new(&status_file_path).parent() {
+            if let Err(e) = fs::create_dir_all(parent) {
+                warn!("Failed to create status directory {}: {}", parent.display(), e);
+            }
+        }
+
+        // Load previous status from disk
+        let (metric_statuses, metric_details) = Self::load_status(&status_file_path);
+
         Ok(Self {
             config: config.clone(),
             hostname: hostname.to_string(),
-            metric_statuses: HashMap::new(),
-            last_notification_times: HashMap::new(),
+            metric_statuses,
+            metric_details,
+            status_file_path,
         })
     }
-
+
     /// Update metric status and return status change if any
-    pub fn update_metric_status(&mut self, metric_name: &str, new_status: Status) -> Option<StatusChange> {
-        let old_status = self.metric_statuses.get(metric_name).copied().unwrap_or(Status::Unknown);
-
-        // Update stored status
-        self.metric_statuses.insert(metric_name.to_string(), new_status);
-
+    pub fn update_metric_status(
+        &mut self,
+        metric_name: &str,
+        new_status: Status,
+    ) -> Option<StatusChange> {
+        let old_status = self
+            .metric_statuses
+            .get(metric_name)
+            .copied()
+            .unwrap_or(Status::Unknown);
+
         // Check if status actually changed
         if old_status != new_status {
-            debug!("Status change detected for {}: {:?} -> {:?}", metric_name, old_status, new_status);
-
+            // Update stored status only on change
+            self.metric_statuses
+                .insert(metric_name.to_string(), new_status);
+
+            // Save status to disk only when status changes
+            self.save_status();
+            debug!(
+                "Status change detected for {}: {:?} -> {:?}",
+                metric_name, old_status, new_status
+            );
+
             Some(StatusChange {
                 metric_name: metric_name.to_string(),
                 old_status,
                 new_status,
-                timestamp: Instant::now(),
+                timestamp: Utc::now(),
+                details: None, // Will be populated when needed
             })
         } else {
+            // No status change - update stored status but don't save to disk
+            self.metric_statuses
+                .insert(metric_name.to_string(), new_status);
             None
         }
     }
-
-    /// Send notification for status change (placeholder implementation)
+
+    /// Send notification for status change
    pub async fn send_status_change_notification(
         &mut self,
-        status_change: StatusChange,
+        mut status_change: StatusChange,
         metric: &cm_dashboard_shared::Metric,
     ) -> Result<(), anyhow::Error> {
         if !self.config.enabled {
             return Ok(());
         }
-
-        // Check rate limiting
-        if self.is_rate_limited(&status_change.metric_name) {
-            debug!("Notification rate limited for {}", status_change.metric_name);
+
+        // Only notify on transitions to warning/critical, or recovery to ok
+        let should_send = match (status_change.old_status, status_change.new_status) {
+            (_, Status::Warning) | (_, Status::Critical) => true,
+            (Status::Warning | Status::Critical, Status::Ok) => true,
+            _ => false,
+        };
+
+        if !should_send {
             return Ok(());
         }
-
+
         // Check maintenance mode
         if self.is_maintenance_mode() {
-            debug!("Maintenance mode active, suppressing notification for {}", status_change.metric_name);
+            debug!(
+                "Maintenance mode active, suppressing notification for {}",
+                status_change.metric_name
+            );
             return Ok(());
         }
-
-        info!("Would send notification for {}: {:?} -> {:?}",
-            status_change.metric_name, status_change.old_status, status_change.new_status);
-
-        // TODO: Implement actual email sending using lettre
-        // For now, just log the notification
-        self.log_notification(&status_change, metric);
-
-        // Update last notification time
-        self.last_notification_times.insert(
-            status_change.metric_name.clone(),
-            status_change.timestamp
-        );
-
+
+
+        // Add metric details to status change
+        status_change.details = Some(self.format_metric_details(metric));
+
+        // For recovery notifications, include original problem details
+        if status_change.new_status == Status::Ok &&
+           (status_change.old_status == Status::Warning || status_change.old_status == Status::Critical) {
+            if let Some(old_details) = self.metric_details.get(&status_change.metric_name) {
+                status_change.details = Some(format!(
+                    "Recovered from: {}\nCurrent status: {}",
+                    old_details,
+                    status_change.details.unwrap_or_default()
+                ));
+            }
+            // Clear stored details after recovery
+            self.metric_details.remove(&status_change.metric_name);
+        } else if status_change.new_status == Status::Warning || status_change.new_status == Status::Critical {
+            // Store details for warning/critical states
+            if let Some(ref details) = status_change.details {
+                self.metric_details.insert(status_change.metric_name.clone(), details.clone());
+            }
+        }
+
+        // Save status after updating details
+        self.save_status();
+
+        // Send the actual email
+        if let Err(e) = self.send_email(&status_change).await {
+            error!("Failed to send notification email: {}", e);
+        } else {
+            info!(
+                "Sent notification: {} {:?} → {:?}",
+                status_change.metric_name, status_change.old_status, status_change.new_status
+            );
+        }
+
+        Ok(())
     }
-
+
     /// Check if maintenance mode is active
     fn is_maintenance_mode(&self) -> bool {
         std::fs::metadata("/tmp/cm-maintenance").is_ok()
     }
-
-    /// Check if notification is rate limited
-    fn is_rate_limited(&self, metric_name: &str) -> bool {
-        if self.config.rate_limit_minutes == 0 {
-            return false; // No rate limiting
-        }
-
-        if let Some(last_time) = self.last_notification_times.get(metric_name) {
-            let elapsed = last_time.elapsed();
-            let rate_limit_duration = std::time::Duration::from_secs(self.config.rate_limit_minutes * 60);
-
-            elapsed < rate_limit_duration
-        } else {
-            false // No previous notification
-        }
+
+
+    /// Format metric details for notification
+    fn format_metric_details(&self, metric: &cm_dashboard_shared::Metric) -> String {
+        format!("Value: {}", metric.value.as_string())
     }
-
-    /// Log notification details
-    fn log_notification(&self, status_change: &StatusChange, metric: &cm_dashboard_shared::Metric) {
-        let status_description = match status_change.new_status {
-            Status::Ok => "recovered",
-            Status::Warning => "warning",
-            Status::Critical => "critical",
-            Status::Unknown => "unknown",
+
+    /// Format email subject
+    fn format_subject(&self, change: &StatusChange) -> String {
+        let urgency = match change.new_status {
+            Status::Critical => "🔴 CRITICAL",
+            Status::Warning => "🟡 WARNING",
+            Status::Ok => "✅ RESOLVED",
+            Status::Unknown => "ℹ️ STATUS",
         };
-
-        info!(
-            "NOTIFICATION: {} on {}: {} is {} (value: {})",
-            status_description,
-            self.hostname,
-            status_change.metric_name,
-            status_description,
-            metric.value.as_string()
-        );
+
+        format!("{}: {} on {}", urgency, change.metric_name, self.hostname)
     }
-
+
+    /// Format email body
+    fn format_body(&self, change: &StatusChange) -> String {
+        let mut body = format!(
+            "Status Change Alert\n\
+            \n\
+            Host: {}\n\
+            Metric: {}\n\
+            Status Change: {:?} → {:?}\n\
+            Time: {}",
+            self.hostname,
+            change.metric_name,
+            change.old_status,
+            change.new_status,
+            change.timestamp.with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
+        );
+
+        if let Some(details) = &change.details {
+            body.push_str(&format!("\n\nDetails:\n{}", details));
+        }
+
+        body.push_str(&format!(
+            "\n\n--\n\
+            CM Dashboard Agent\n\
+            Generated at {}",
+            Utc::now().with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
+        ));
+
+        body
+    }
+
+    /// Send email notification
+    async fn send_email(&self, change: &StatusChange) -> Result<(), Box<dyn std::error::Error>> {
+        let subject = self.format_subject(change);
+        let body = self.format_body(change);
+
+        // Replace {hostname} placeholder in from_email
+        let from_email = self.config.from_email.replace("{hostname}", &self.hostname);
+
+        let email = Message::builder()
+            .from(from_email.parse()?)
+            .to(self.config.to_email.parse()?)
+            .subject(subject)
+            .body(body)?;
+
+        let mailer = SmtpTransport::builder_dangerous(&self.config.smtp_host)
+            .port(self.config.smtp_port)
+            .build();
+
+        mailer.send(&email)?;
+        Ok(())
+    }
+
     /// Process any pending notifications (placeholder)
     pub async fn process_pending(&mut self) {
         // Placeholder for batch notification processing
         // Could be used for email queue processing, etc.
     }
-
+
+    /// Load status from disk
+    fn load_status(file_path: &str) -> (HashMap<String, Status>, HashMap<String, String>) {
+        match fs::read_to_string(file_path) {
+            Ok(content) => {
+                match serde_json::from_str::<PersistedStatus>(&content) {
+                    Ok(persisted) => {
+                        info!("Loaded {} metric statuses from {}", persisted.metric_statuses.len(), file_path);
+                        (persisted.metric_statuses, persisted.metric_details)
+                    }
+                    Err(e) => {
+                        warn!("Failed to parse status file {}: {}", file_path, e);
+                        (HashMap::new(), HashMap::new())
+                    }
+                }
+            }
+            Err(_) => {
+                info!("No previous status file found at {}, starting fresh", file_path);
+                (HashMap::new(), HashMap::new())
+            }
+        }
+    }
+
+    /// Save status to disk
+    fn save_status(&self) {
+        let persisted = PersistedStatus {
+            metric_statuses: self.metric_statuses.clone(),
+            metric_details: self.metric_details.clone(),
+        };
+
+        match serde_json::to_string_pretty(&persisted) {
+            Ok(content) => {
+                if let Err(e) = fs::write(&self.status_file_path, content) {
+                    warn!("Failed to save status to {}: {}", self.status_file_path, e);
+                }
+            }
+            Err(e) => {
+                warn!("Failed to serialize status: {}", e);
+            }
+        }
+    }
+
     /// Get current metric statuses
     pub fn get_metric_statuses(&self) -> &HashMap<String, Status> {
         &self.metric_statuses
     }
-}
\ No newline at end of file
+}
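
--
Illustrative note (not part of the patch to apply): a minimal sketch of the
on-disk shape that PersistedStatus serializes to at
/var/lib/cm-dashboard/last-status.json, under the assumption that the real
cm_dashboard_shared::Status serializes as a plain string via the default serde
derive. The metric name "disk_root_usage" and the detail text are hypothetical;
the sketch only needs serde (with the derive feature) and serde_json.

    use std::collections::HashMap;
    use serde::{Deserialize, Serialize};

    // Stand-in for cm_dashboard_shared::Status (assumed default serde representation).
    #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
    enum Status { Ok, Warning, Critical, Unknown }

    // Same shape as the PersistedStatus struct introduced in the patch.
    #[derive(Debug, Clone, Serialize, Deserialize)]
    struct PersistedStatus {
        metric_statuses: HashMap<String, Status>,
        metric_details: HashMap<String, String>,
    }

    fn main() -> Result<(), serde_json::Error> {
        // Mirror what save_status() would write: one status plus its stored details.
        let mut metric_statuses = HashMap::new();
        metric_statuses.insert("disk_root_usage".to_string(), Status::Warning);
        let mut metric_details = HashMap::new();
        metric_details.insert("disk_root_usage".to_string(), "Value: 91%".to_string());

        let persisted = PersistedStatus { metric_statuses, metric_details };
        let json = serde_json::to_string_pretty(&persisted)?;
        println!("{json}");

        // Mirror load_status(): parse the JSON back into the struct.
        let restored: PersistedStatus = serde_json::from_str(&json)?;
        assert_eq!(restored.metric_statuses["disk_root_usage"], Status::Warning);
        Ok(())
    }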