Implement comprehensive monitoring improvements

- Add full email notifications with lettre and Stockholm timezone
- Add status persistence to prevent notification spam on restart
- Change nginx monitoring to check backend proxy_pass URLs instead of frontend domains
- Increase nginx site timeout to 10 seconds for backend health checks
- Fix cache intervals: disk (5min), backup (10min), systemd (30s), cpu/memory (5s)
- Remove rate limiting for immediate notifications on all status changes
- Store metric status in /var/lib/cm-dashboard/last-status.json
Christoffer Martinsson 2025-10-20 14:32:44 +02:00
parent ecaf3aedb5
commit 66a79574e0
5 changed files with 260 additions and 95 deletions
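
For context on the notification changes listed above: with the rate-limit default dropped to zero and the `is_rate_limited` check removed, the only remaining gates are maintenance mode and the status transition itself. Below is a minimal, self-contained sketch of that transition filter; it uses a stand-in `Status` enum rather than the real `cm_dashboard_shared::Status`, and the function name `should_send` simply echoes the local variable introduced in the notifications diff.

```rust
/// Stand-in for cm_dashboard_shared::Status, for illustration only.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Status {
    Ok,
    Warning,
    Critical,
    Unknown,
}

/// Mirrors the `should_send` match added in this commit: notify on any
/// transition into Warning or Critical, and on recovery to Ok from a
/// previously alerted state; everything else stays silent.
fn should_send(old: Status, new: Status) -> bool {
    match (old, new) {
        (_, Status::Warning) | (_, Status::Critical) => true,
        (Status::Warning | Status::Critical, Status::Ok) => true,
        _ => false,
    }
}

fn main() {
    assert!(should_send(Status::Ok, Status::Critical)); // new problem: notify
    assert!(should_send(Status::Critical, Status::Ok)); // recovery: notify
    assert!(!should_send(Status::Unknown, Status::Ok)); // first healthy reading: silent
    println!("transition filter behaves as described");
}
```

Because `update_metric_status` only reports actual changes, equal old/new pairs never reach this filter, and the persisted status file keeps a restart from re-reporting an already-known Warning or Critical state.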

View File

@@ -17,6 +17,7 @@ tracing = { workspace = true }
 tracing-subscriber = { workspace = true }
 lettre = { workspace = true }
 gethostname = { workspace = true }
+chrono-tz = "0.8"
 toml = { workspace = true }
 async-trait = "0.1"
 reqwest = { version = "0.11", features = ["json", "blocking"] }

View File

@@ -595,8 +595,8 @@ impl SystemdCollector {
         // Create HTTP client with timeouts (similar to legacy implementation)
         let client = reqwest::blocking::Client::builder()
-            .timeout(Duration::from_secs(5))
-            .connect_timeout(Duration::from_secs(2))
+            .timeout(Duration::from_secs(10))
+            .connect_timeout(Duration::from_secs(10))
             .redirect(reqwest::redirect::Policy::limited(10))
             .build()?;
@@ -739,12 +739,9 @@ impl SystemdCollector {
         while i < lines.len() {
             let line = lines[i].trim();
             if line.starts_with("server") && line.contains("{") {
-                debug!("Found server block at line {}", i);
-                if let Some(server_name) = self.parse_server_block(&lines, &mut i) {
-                    debug!("Extracted server name: {}", server_name);
-                    let url = format!("https://{}", server_name);
-                    // Use the full domain as the site name for clarity
-                    sites.push((server_name.clone(), url));
+                if let Some(proxy_url) = self.parse_server_block(&lines, &mut i) {
+                    let site_name = proxy_url.replace("http://", "").replace("https://", "");
+                    sites.push((site_name, proxy_url));
                 }
             }
             i += 1;
@@ -758,6 +755,7 @@ impl SystemdCollector {
     fn parse_server_block(&self, lines: &[&str], start_index: &mut usize) -> Option<String> {
         use tracing::debug;
         let mut server_names = Vec::new();
+        let mut proxy_pass_url = None;
         let mut has_redirect = false;
         let mut i = *start_index + 1;
         let mut brace_count = 1;
@@ -787,6 +785,17 @@ impl SystemdCollector {
                 }
             }
+
+            // Extract proxy_pass URL (backend IP:port)
+            if trimmed.starts_with("proxy_pass") {
+                if let Some(url_part) = trimmed.strip_prefix("proxy_pass") {
+                    let url_clean = url_part.trim().trim_end_matches(';');
+                    if !url_clean.is_empty() {
+                        proxy_pass_url = Some(url_clean.to_string());
+                        debug!("Found proxy_pass in block: {}", url_clean);
+                    }
+                }
+            }
+
             // Check for redirects (skip redirect-only servers)
             if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) {
                 has_redirect = true;
@@ -797,11 +806,12 @@ impl SystemdCollector {
         *start_index = i - 1;

-        // Only return hostnames that are not redirects and have actual content
-        if !server_names.is_empty() && !has_redirect {
-            Some(server_names[0].clone())
-        } else {
-            None
-        }
+        if let Some(proxy_url) = proxy_pass_url {
+            if !has_redirect {
+                return Some(proxy_url);
+            }
+        }
+        None
     }
 }
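
To make the parsing change above concrete: a server block is now keyed by its `proxy_pass` backend rather than its `server_name`, so health checks hit the upstream service directly, and the raised 10-second timeouts give slower backends more time to answer before a check fails. A rough standalone sketch of that extraction follows (simplified: no brace counting or multi-location handling, and the nginx snippet is hypothetical):

```rust
/// Extract the proxy_pass backend URL from one nginx server block and derive
/// the site name by stripping the scheme, mirroring the collector change above.
/// Redirect-only blocks (return 301/302) are skipped.
fn backend_site(block: &str) -> Option<(String, String)> {
    let mut proxy_pass_url: Option<String> = None;
    let mut has_redirect = false;

    for line in block.lines() {
        let trimmed = line.trim();
        if let Some(rest) = trimmed.strip_prefix("proxy_pass") {
            let url = rest.trim().trim_end_matches(';');
            if !url.is_empty() {
                proxy_pass_url = Some(url.to_string());
            }
        }
        if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) {
            has_redirect = true;
        }
    }

    let proxy_url = proxy_pass_url?;
    if has_redirect {
        return None;
    }
    let site_name = proxy_url.replace("http://", "").replace("https://", "");
    Some((site_name, proxy_url))
}

fn main() {
    // Hypothetical server block; any real config will differ.
    let block = r#"
server {
    listen 443 ssl;
    server_name example.internal;
    location / {
        proxy_pass http://127.0.0.1:8080;
    }
}
"#;
    assert_eq!(
        backend_site(block),
        Some(("127.0.0.1:8080".to_string(), "http://127.0.0.1:8080".to_string()))
    );
    println!("health checks now target the backend, not the public domain");
}
```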

View File

@@ -46,10 +46,9 @@ pub const DEFAULT_SMART_WEAR_CRITICAL: f32 = 90.0;
 // Backup configuration
 pub const DEFAULT_BACKUP_MAX_AGE_HOURS: u64 = 48;

 // Notification configuration (from legacy)
 pub const DEFAULT_SMTP_HOST: &str = "localhost";
 pub const DEFAULT_SMTP_PORT: u16 = 25;
 pub const DEFAULT_FROM_EMAIL: &str = "{hostname}@cmtec.se";
 pub const DEFAULT_TO_EMAIL: &str = "cm@cmtec.se";
-pub const DEFAULT_NOTIFICATION_RATE_LIMIT_MINUTES: u64 = 30;
+pub const DEFAULT_NOTIFICATION_RATE_LIMIT_MINUTES: u64 = 0;

View File

@@ -182,11 +182,13 @@ impl MetricCollectionManager {
         for collector in &self.collectors {
             let collector_name = collector.name();

-            // Determine cache interval for this collector type - ALL REALTIME FOR FAST UPDATES
+            // Determine cache interval for this collector type based on data volatility
             let cache_interval_secs = match collector_name {
-                "cpu" | "memory" | "disk" | "systemd" => 2, // All realtime for fast updates
-                "backup" => 10, // Backup metrics every 10 seconds for testing
-                _ => 2, // All realtime for fast updates
+                "cpu" | "memory" => 5, // Fast updates for volatile metrics
+                "systemd" => 30,       // Service status changes less frequently
+                "disk" => 300,         // SMART data changes very slowly (5 minutes)
+                "backup" => 600,       // Backup status changes rarely (10 minutes)
+                _ => 30,               // Default: moderate frequency
             };

             let should_collect =
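
The `should_collect` expression itself is cut off in this hunk, so the following is an illustration only: a typical way such a per-collector interval gates collection is to compare the elapsed time since the last collection against `cache_interval_secs`. The map name `last_collected` below is an assumption, not taken from this codebase.

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

/// Hypothetical gate: collect only when the per-collector cache interval has
/// elapsed, or when the collector has never run. This just shows the effect
/// of the new interval values above.
fn should_collect(
    last_collected: &HashMap<String, Instant>,
    collector_name: &str,
    cache_interval_secs: u64,
) -> bool {
    match last_collected.get(collector_name) {
        Some(last) => last.elapsed() >= Duration::from_secs(cache_interval_secs),
        None => true, // never collected yet
    }
}

fn main() {
    let mut last_collected = HashMap::new();
    // "disk" was just collected, so with the new 300 s interval it is skipped.
    last_collected.insert("disk".to_string(), Instant::now());
    assert!(!should_collect(&last_collected, "disk", 300));
    // "cpu" has never been collected, so it runs immediately.
    assert!(should_collect(&last_collected, "cpu", 5));
    println!("cached collectors are skipped until their interval elapses");
}
```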

View File

@@ -1,16 +1,29 @@
 use cm_dashboard_shared::Status;
 use std::collections::HashMap;
-use std::time::Instant;
-use tracing::{info, debug};
+use std::fs;
+use std::path::Path;
+use tracing::{debug, info, error, warn};
+use chrono::{DateTime, Utc};
+use chrono_tz::Europe::Stockholm;
+use lettre::{Message, SmtpTransport, Transport};
+use serde::{Serialize, Deserialize};

 use crate::config::NotificationConfig;

+/// Persisted status data
+#[derive(Debug, Clone, Serialize, Deserialize)]
+struct PersistedStatus {
+    metric_statuses: HashMap<String, Status>,
+    metric_details: HashMap<String, String>,
+}
+
 /// Manages status change tracking and notifications
 pub struct NotificationManager {
     config: NotificationConfig,
     hostname: String,
     metric_statuses: HashMap<String, Status>,
-    last_notification_times: HashMap<String, Instant>,
+    metric_details: HashMap<String, String>, // Store details for warning/critical states
+    status_file_path: String,
 }

 /// Status change information
@@ -19,77 +32,141 @@ pub struct StatusChange {
     pub metric_name: String,
     pub old_status: Status,
     pub new_status: Status,
-    pub timestamp: Instant,
+    pub timestamp: DateTime<Utc>,
+    pub details: Option<String>,
 }

 impl NotificationManager {
     pub fn new(config: &NotificationConfig, hostname: &str) -> Result<Self, anyhow::Error> {
         info!("Initializing notification manager for {}", hostname);

+        let status_file_path = "/var/lib/cm-dashboard/last-status.json".to_string();
+
+        // Create directory if it doesn't exist
+        if let Some(parent) = Path::new(&status_file_path).parent() {
+            if let Err(e) = fs::create_dir_all(parent) {
+                warn!("Failed to create status directory {}: {}", parent.display(), e);
+            }
+        }
+
+        // Load previous status from disk
+        let (metric_statuses, metric_details) = Self::load_status(&status_file_path);
+
         Ok(Self {
             config: config.clone(),
             hostname: hostname.to_string(),
-            metric_statuses: HashMap::new(),
-            last_notification_times: HashMap::new(),
+            metric_statuses,
+            metric_details,
+            status_file_path,
         })
     }

     /// Update metric status and return status change if any
-    pub fn update_metric_status(&mut self, metric_name: &str, new_status: Status) -> Option<StatusChange> {
-        let old_status = self.metric_statuses.get(metric_name).copied().unwrap_or(Status::Unknown);
-
-        // Update stored status
-        self.metric_statuses.insert(metric_name.to_string(), new_status);
+    pub fn update_metric_status(
+        &mut self,
+        metric_name: &str,
+        new_status: Status,
+    ) -> Option<StatusChange> {
+        let old_status = self
+            .metric_statuses
+            .get(metric_name)
+            .copied()
+            .unwrap_or(Status::Unknown);

         // Check if status actually changed
         if old_status != new_status {
-            debug!("Status change detected for {}: {:?} -> {:?}", metric_name, old_status, new_status);
+            // Update stored status only on change
+            self.metric_statuses
+                .insert(metric_name.to_string(), new_status);
+
+            // Save status to disk only when status changes
+            self.save_status();
+
+            debug!(
+                "Status change detected for {}: {:?} -> {:?}",
+                metric_name, old_status, new_status
+            );

             Some(StatusChange {
                 metric_name: metric_name.to_string(),
                 old_status,
                 new_status,
-                timestamp: Instant::now(),
+                timestamp: Utc::now(),
+                details: None, // Will be populated when needed
             })
         } else {
+            // No status change - update stored status but don't save to disk
+            self.metric_statuses
+                .insert(metric_name.to_string(), new_status);
             None
         }
     }

-    /// Send notification for status change (placeholder implementation)
+    /// Send notification for status change
     pub async fn send_status_change_notification(
         &mut self,
-        status_change: StatusChange,
+        mut status_change: StatusChange,
         metric: &cm_dashboard_shared::Metric,
     ) -> Result<(), anyhow::Error> {
         if !self.config.enabled {
             return Ok(());
         }

-        // Check rate limiting
-        if self.is_rate_limited(&status_change.metric_name) {
-            debug!("Notification rate limited for {}", status_change.metric_name);
+        // Only notify on transitions to warning/critical, or recovery to ok
+        let should_send = match (status_change.old_status, status_change.new_status) {
+            (_, Status::Warning) | (_, Status::Critical) => true,
+            (Status::Warning | Status::Critical, Status::Ok) => true,
+            _ => false,
+        };
+        if !should_send {
             return Ok(());
         }

         // Check maintenance mode
         if self.is_maintenance_mode() {
-            debug!("Maintenance mode active, suppressing notification for {}", status_change.metric_name);
+            debug!(
+                "Maintenance mode active, suppressing notification for {}",
+                status_change.metric_name
+            );
             return Ok(());
         }

-        info!("Would send notification for {}: {:?} -> {:?}",
-            status_change.metric_name, status_change.old_status, status_change.new_status);
-
-        // TODO: Implement actual email sending using lettre
-        // For now, just log the notification
-        self.log_notification(&status_change, metric);
-
-        // Update last notification time
-        self.last_notification_times.insert(
-            status_change.metric_name.clone(),
-            status_change.timestamp
-        );
+        // Add metric details to status change
+        status_change.details = Some(self.format_metric_details(metric));
+
+        // For recovery notifications, include original problem details
+        if status_change.new_status == Status::Ok &&
+            (status_change.old_status == Status::Warning || status_change.old_status == Status::Critical) {
+            if let Some(old_details) = self.metric_details.get(&status_change.metric_name) {
+                status_change.details = Some(format!(
+                    "Recovered from: {}\nCurrent status: {}",
+                    old_details,
+                    status_change.details.unwrap_or_default()
+                ));
+            }
+            // Clear stored details after recovery
+            self.metric_details.remove(&status_change.metric_name);
+        } else if status_change.new_status == Status::Warning || status_change.new_status == Status::Critical {
+            // Store details for warning/critical states
+            if let Some(ref details) = status_change.details {
+                self.metric_details.insert(status_change.metric_name.clone(), details.clone());
+            }
+        }
+
+        // Save status after updating details
+        self.save_status();
+
+        // Send the actual email
+        if let Err(e) = self.send_email(&status_change).await {
+            error!("Failed to send notification email: {}", e);
+        } else {
+            info!(
+                "Sent notification: {} {:?} → {:?}",
+                status_change.metric_name, status_change.old_status, status_change.new_status
+            );
+        }

         Ok(())
     }
@@ -99,39 +176,74 @@ impl NotificationManager {
         std::fs::metadata("/tmp/cm-maintenance").is_ok()
     }

-    /// Check if notification is rate limited
-    fn is_rate_limited(&self, metric_name: &str) -> bool {
-        if self.config.rate_limit_minutes == 0 {
-            return false; // No rate limiting
-        }
-
-        if let Some(last_time) = self.last_notification_times.get(metric_name) {
-            let elapsed = last_time.elapsed();
-            let rate_limit_duration = std::time::Duration::from_secs(self.config.rate_limit_minutes * 60);
-            elapsed < rate_limit_duration
-        } else {
-            false // No previous notification
-        }
-    }
-
-    /// Log notification details
-    fn log_notification(&self, status_change: &StatusChange, metric: &cm_dashboard_shared::Metric) {
-        let status_description = match status_change.new_status {
-            Status::Ok => "recovered",
-            Status::Warning => "warning",
-            Status::Critical => "critical",
-            Status::Unknown => "unknown",
-        };
-        info!(
-            "NOTIFICATION: {} on {}: {} is {} (value: {})",
-            status_description,
-            self.hostname,
-            status_change.metric_name,
-            status_description,
-            metric.value.as_string()
-        );
-    }
+    /// Format metric details for notification
+    fn format_metric_details(&self, metric: &cm_dashboard_shared::Metric) -> String {
+        format!("Value: {}", metric.value.as_string())
+    }
+
+    /// Format email subject
+    fn format_subject(&self, change: &StatusChange) -> String {
+        let urgency = match change.new_status {
+            Status::Critical => "🔴 CRITICAL",
+            Status::Warning => "🟡 WARNING",
+            Status::Ok => "✅ RESOLVED",
+            Status::Unknown => " STATUS",
+        };
+        format!("{}: {} on {}", urgency, change.metric_name, self.hostname)
+    }
+
+    /// Format email body
+    fn format_body(&self, change: &StatusChange) -> String {
+        let mut body = format!(
+            "Status Change Alert\n\
+             \n\
+             Host: {}\n\
+             Metric: {}\n\
+             Status Change: {:?} {:?}\n\
+             Time: {}",
+            self.hostname,
+            change.metric_name,
+            change.old_status,
+            change.new_status,
+            change.timestamp.with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
+        );
+
+        if let Some(details) = &change.details {
+            body.push_str(&format!("\n\nDetails:\n{}", details));
+        }
+
+        body.push_str(&format!(
+            "\n\n--\n\
+             CM Dashboard Agent\n\
+             Generated at {}",
+            Utc::now().with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
+        ));
+
+        body
+    }
+
+    /// Send email notification
+    async fn send_email(&self, change: &StatusChange) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        let subject = self.format_subject(change);
+        let body = self.format_body(change);
+
+        // Replace {hostname} placeholder in from_email
+        let from_email = self.config.from_email.replace("{hostname}", &self.hostname);
+
+        let email = Message::builder()
+            .from(from_email.parse()?)
+            .to(self.config.to_email.parse()?)
+            .subject(subject)
+            .body(body)?;
+
+        let mailer = SmtpTransport::builder_dangerous(&self.config.smtp_host)
+            .port(self.config.smtp_port)
+            .build();
+
+        mailer.send(&email)?;
+        Ok(())
+    }

     /// Process any pending notifications (placeholder)
@@ -140,6 +252,47 @@ impl NotificationManager {
         // Could be used for email queue processing, etc.
     }

+    /// Load status from disk
+    fn load_status(file_path: &str) -> (HashMap<String, Status>, HashMap<String, String>) {
+        match fs::read_to_string(file_path) {
+            Ok(content) => {
+                match serde_json::from_str::<PersistedStatus>(&content) {
+                    Ok(persisted) => {
+                        info!("Loaded {} metric statuses from {}", persisted.metric_statuses.len(), file_path);
+                        (persisted.metric_statuses, persisted.metric_details)
+                    }
+                    Err(e) => {
+                        warn!("Failed to parse status file {}: {}", file_path, e);
+                        (HashMap::new(), HashMap::new())
+                    }
+                }
+            }
+            Err(_) => {
+                info!("No previous status file found at {}, starting fresh", file_path);
+                (HashMap::new(), HashMap::new())
+            }
+        }
+    }
+
+    /// Save status to disk
+    fn save_status(&self) {
+        let persisted = PersistedStatus {
+            metric_statuses: self.metric_statuses.clone(),
+            metric_details: self.metric_details.clone(),
+        };
+
+        match serde_json::to_string_pretty(&persisted) {
+            Ok(content) => {
+                if let Err(e) = fs::write(&self.status_file_path, content) {
+                    warn!("Failed to save status to {}: {}", self.status_file_path, e);
+                }
+            }
+            Err(e) => {
+                warn!("Failed to serialize status: {}", e);
+            }
+        }
+    }
+
     /// Get current metric statuses
     pub fn get_metric_statuses(&self) -> &HashMap<String, Status> {
         &self.metric_statuses