Implement comprehensive monitoring improvements

- Add full email notifications with lettre and Stockholm timezone
- Add status persistence to prevent notification spam on restart
- Change nginx monitoring to check backend proxy_pass URLs instead of frontend domains
- Increase nginx site timeout to 10 seconds for backend health checks
- Fix cache intervals: disk (5min), backup (10min), systemd (30s), cpu/memory (5s)
- Remove rate limiting for immediate notifications on all status changes
- Store metric status in /var/lib/cm-dashboard/last-status.json
This commit is contained in:
2025-10-20 14:32:44 +02:00
parent ecaf3aedb5
commit 66a79574e0
5 changed files with 260 additions and 95 deletions

View File

@@ -1,16 +1,29 @@
use cm_dashboard_shared::Status;
use std::collections::HashMap;
use std::time::Instant;
use tracing::{info, debug};
use std::fs;
use std::path::Path;
use tracing::{debug, info, error, warn};
use chrono::{DateTime, Utc};
use chrono_tz::Europe::Stockholm;
use lettre::{Message, SmtpTransport, Transport};
use serde::{Serialize, Deserialize};
use crate::config::NotificationConfig;
/// Persisted status data
///
/// On-disk snapshot of the notification state. `NotificationManager::save_status`
/// serializes it to JSON and `NotificationManager::load_status` parses it back.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PersistedStatus {
/// Last recorded status per metric, keyed by metric name.
metric_statuses: HashMap<String, Status>,
/// Detail text kept for metrics in a warning/critical state, keyed by metric name.
metric_details: HashMap<String, String>,
}
/// Manages status change tracking and notifications
///
/// Holds the last known status of every metric, the detail text of metrics that are
/// currently in a warning/critical state, and the location of the JSON file this
/// state is persisted to.
pub struct NotificationManager {
/// Notification settings; `new` stores a clone of the caller's configuration.
config: NotificationConfig,
/// Host name shown in the email subject and body, and substituted for `{hostname}`
/// in the sender address.
hostname: String,
/// Last known status per metric name.
metric_statuses: HashMap<String, Status>,
/// Time of the most recent notification per metric name, read by `is_rate_limited`.
// NOTE(review): this field and `is_rate_limited` look like the removed side of the
// diff (the commit message says rate limiting was removed) — confirm whether it is still present.
last_notification_times: HashMap<String, Instant>,
metric_details: HashMap<String, String>, // Store details for warning/critical states
/// Path of the JSON file the statuses and details are saved to and loaded from.
status_file_path: String,
}
/// Status change information
@@ -19,129 +32,269 @@ pub struct StatusChange {
/// Name of the metric whose status changed.
pub metric_name: String,
/// Status that was stored for the metric before this update.
pub old_status: Status,
/// Status reported by this update.
pub new_status: Status,
// NOTE(review): `timestamp` is declared twice below (`Instant`, then `DateTime<Utc>`).
// This looks like the removed and added sides of a diff rendered together — confirm
// that only the `DateTime<Utc>` declaration exists in the real file.
pub timestamp: Instant,
pub timestamp: DateTime<Utc>,
/// Optional detail text. `update_metric_status` leaves it `None`;
/// `send_status_change_notification` fills it in before the email is built.
pub details: Option<String>,
}
impl NotificationManager {
/// Create a notification manager for `hostname`.
///
/// Stores a clone of `config`, makes sure the parent directory of the status file
/// (`/var/lib/cm-dashboard/last-status.json`) exists, and restores the previously
/// persisted metric statuses and details through `load_status`.
///
/// A failure to create the directory is only logged as a warning. No statement in
/// the visible body returns `Err`.
pub fn new(config: &NotificationConfig, hostname: &str) -> Result<Self, anyhow::Error> {
info!("Initializing notification manager for {}", hostname);
let status_file_path = "/var/lib/cm-dashboard/last-status.json".to_string();
// Create directory if it doesn't exist
if let Some(parent) = Path::new(&status_file_path).parent() {
if let Err(e) = fs::create_dir_all(parent) {
warn!("Failed to create status directory {}: {}", parent.display(), e);
}
}
// Load previous status from disk
let (metric_statuses, metric_details) = Self::load_status(&status_file_path);
Ok(Self {
config: config.clone(),
hostname: hostname.to_string(),
// NOTE(review): the next two initialisers (`metric_statuses: HashMap::new()` and
// `last_notification_times`) appear together with the shorthand `metric_statuses`
// below; this looks like old and new diff lines shown side by side — confirm which
// ones are actually in the file.
metric_statuses: HashMap::new(),
last_notification_times: HashMap::new(),
metric_statuses,
metric_details,
status_file_path,
})
}
/// Update metric status and return status change if any
///
/// Looks up the stored status for `metric_name` (treated as `Status::Unknown` when
/// there is no entry) and compares it with `new_status`.
///
/// Returns `Some(StatusChange)` with the old and new status, the current time, and
/// `details: None` when the two differ; returns `None` when they are equal.
// NOTE(review): two signatures and two "update stored status" sequences appear below.
// This looks like the removed and added sides of a diff rendered together — confirm
// which lines are live before relying on these comments.
pub fn update_metric_status(&mut self, metric_name: &str, new_status: Status) -> Option<StatusChange> {
let old_status = self.metric_statuses.get(metric_name).copied().unwrap_or(Status::Unknown);
// Update stored status
self.metric_statuses.insert(metric_name.to_string(), new_status);
pub fn update_metric_status(
&mut self,
metric_name: &str,
new_status: Status,
) -> Option<StatusChange> {
// A metric that has never been seen is compared as `Status::Unknown`.
let old_status = self
.metric_statuses
.get(metric_name)
.copied()
.unwrap_or(Status::Unknown);
// Check if status actually changed
if old_status != new_status {
debug!("Status change detected for {}: {:?} -> {:?}", metric_name, old_status, new_status);
// Update stored status only on change
self.metric_statuses
.insert(metric_name.to_string(), new_status);
// Save status to disk only when status changes
self.save_status();
debug!(
"Status change detected for {}: {:?} -> {:?}",
metric_name, old_status, new_status
);
Some(StatusChange {
metric_name: metric_name.to_string(),
old_status,
new_status,
timestamp: Instant::now(),
timestamp: Utc::now(),
details: None, // Will be populated when needed
})
} else {
// No status change - update stored status but don't save to disk
// (this still creates the map entry for a metric first reported as `Unknown`)
self.metric_statuses
.insert(metric_name.to_string(), new_status);
None
}
}
/// Send notification for status change (placeholder implementation)
/// Send notification for status change
///
/// Steps visible in the body:
/// 1. Return early when notifications are disabled in the configuration.
/// 2. Return early unless the transition is to `Warning`/`Critical`, or from
///    `Warning`/`Critical` back to `Ok`.
/// 3. Return early while maintenance mode is active.
/// 4. Attach the formatted metric value as `details`. On recovery, prepend the
///    details stored when the problem was first reported and drop them from the
///    store; on warning/critical, store the new details.
/// 5. Persist the state with `save_status`, then send the email.
///
/// Every visible path returns `Ok(())`: an email failure is logged with `error!`
/// and not propagated to the caller.
// NOTE(review): the rate-limit check, the "Would send notification" logging and the
// `last_notification_times` update below look like removed diff lines rendered next
// to the new implementation (the parameter `status_change` also appears twice) —
// confirm which lines are live.
pub async fn send_status_change_notification(
&mut self,
status_change: StatusChange,
mut status_change: StatusChange,
metric: &cm_dashboard_shared::Metric,
) -> Result<(), anyhow::Error> {
if !self.config.enabled {
return Ok(());
}
// Check rate limiting
if self.is_rate_limited(&status_change.metric_name) {
debug!("Notification rate limited for {}", status_change.metric_name);
// Only notify on transitions to warning/critical, or recovery to ok
let should_send = match (status_change.old_status, status_change.new_status) {
(_, Status::Warning) | (_, Status::Critical) => true,
(Status::Warning | Status::Critical, Status::Ok) => true,
_ => false,
};
if !should_send {
return Ok(());
}
// Check maintenance mode
if self.is_maintenance_mode() {
debug!("Maintenance mode active, suppressing notification for {}", status_change.metric_name);
debug!(
"Maintenance mode active, suppressing notification for {}",
status_change.metric_name
);
return Ok(());
}
info!("Would send notification for {}: {:?} -> {:?}",
status_change.metric_name, status_change.old_status, status_change.new_status);
// TODO: Implement actual email sending using lettre
// For now, just log the notification
self.log_notification(&status_change, metric);
// Update last notification time
self.last_notification_times.insert(
status_change.metric_name.clone(),
status_change.timestamp
);
// Add metric details to status change
status_change.details = Some(self.format_metric_details(metric));
// For recovery notifications, include original problem details
if status_change.new_status == Status::Ok &&
(status_change.old_status == Status::Warning || status_change.old_status == Status::Critical) {
if let Some(old_details) = self.metric_details.get(&status_change.metric_name) {
status_change.details = Some(format!(
"Recovered from: {}\nCurrent status: {}",
old_details,
status_change.details.unwrap_or_default()
));
}
// Clear stored details after recovery
self.metric_details.remove(&status_change.metric_name);
} else if status_change.new_status == Status::Warning || status_change.new_status == Status::Critical {
// Store details for warning/critical states
if let Some(ref details) = status_change.details {
self.metric_details.insert(status_change.metric_name.clone(), details.clone());
}
}
// Save status after updating details
self.save_status();
// Send the actual email
// A send failure is logged here and swallowed; the function still returns `Ok(())`.
if let Err(e) = self.send_email(&status_change).await {
error!("Failed to send notification email: {}", e);
} else {
info!(
"Sent notification: {} {:?} → {:?}",
status_change.metric_name, status_change.old_status, status_change.new_status
);
}
Ok(())
}
/// Check if maintenance mode is active
fn is_maintenance_mode(&self) -> bool {
std::fs::metadata("/tmp/cm-maintenance").is_ok()
}
/// Check if notification is rate limited
///
/// Returns `false` when `rate_limit_minutes` is 0 or when no notification has been
/// recorded for `metric_name`; otherwise returns `true` while less than
/// `rate_limit_minutes * 60` seconds have elapsed since the recorded time.
// NOTE(review): the closing brace of this function is not inside this span, and the
// commit message says rate limiting was removed — this looks like the removed side
// of the diff. Confirm whether the function still exists.
fn is_rate_limited(&self, metric_name: &str) -> bool {
if self.config.rate_limit_minutes == 0 {
return false; // No rate limiting
}
if let Some(last_time) = self.last_notification_times.get(metric_name) {
let elapsed = last_time.elapsed();
// Configuration is expressed in minutes; `Duration::from_secs` needs seconds.
let rate_limit_duration = std::time::Duration::from_secs(self.config.rate_limit_minutes * 60);
elapsed < rate_limit_duration
} else {
false // No previous notification
}
/// Build the detail line that is attached to a notification.
///
/// Produces `"Value: <v>"`, where `<v>` is the string form of the metric's value
/// as returned by `metric.value.as_string()`.
fn format_metric_details(&self, metric: &cm_dashboard_shared::Metric) -> String {
    let value_text = metric.value.as_string();
    format!("Value: {}", value_text)
}
/// Log notification details
// NOTE(review): `log_notification` and `format_subject` are spliced together below
// (the first `match` is never closed before the second `fn` starts, and the `info!`
// call sits inside `format_subject`). This looks like removed and added diff lines
// rendered together — confirm which function is actually in the file.
fn log_notification(&self, status_change: &StatusChange, metric: &cm_dashboard_shared::Metric) {
// Word used twice in the log line for the new status.
let status_description = match status_change.new_status {
Status::Ok => "recovered",
Status::Warning => "warning",
Status::Critical => "critical",
Status::Unknown => "unknown",
/// Format email subject
///
/// Produces `"<urgency>: <metric name> on <hostname>"`, where the urgency label is
/// chosen from the new status of `change`.
fn format_subject(&self, change: &StatusChange) -> String {
let urgency = match change.new_status {
Status::Critical => "🔴 CRITICAL",
Status::Warning => "🟡 WARNING",
Status::Ok => "✅ RESOLVED",
// NOTE(review): the label below starts with a space, unlike the other arms which
// start with an emoji — possibly a character lost in rendering; verify.
Status::Unknown => " STATUS",
};
info!(
"NOTIFICATION: {} on {}: {} is {} (value: {})",
status_description,
self.hostname,
status_change.metric_name,
status_description,
metric.value.as_string()
);
format!("{}: {} on {}", urgency, change.metric_name, self.hostname)
}
/// Format email body
///
/// Builds the plain-text body of a notification email:
/// a header with host, metric, the status transition (`old → new`) and the time of
/// the change, an optional "Details" section when `change.details` is set, and a
/// footer with the generation time.
///
/// Both timestamps are converted to the `Europe/Stockholm` time zone before
/// formatting.
fn format_body(&self, change: &StatusChange) -> String {
    // One format for both timestamps so header and footer cannot drift apart.
    const TIME_FORMAT: &str = "%Y-%m-%d %H:%M:%S CET/CEST";

    // The arrow keeps the two statuses apart; without a separator they would be
    // rendered back to back (e.g. "WarningCritical").
    let mut body = format!(
        "Status Change Alert\n\
         \n\
         Host: {}\n\
         Metric: {}\n\
         Status Change: {:?} → {:?}\n\
         Time: {}",
        self.hostname,
        change.metric_name,
        change.old_status,
        change.new_status,
        change.timestamp.with_timezone(&Stockholm).format(TIME_FORMAT)
    );

    if let Some(details) = &change.details {
        body.push_str(&format!("\n\nDetails:\n{}", details));
    }

    body.push_str(&format!(
        "\n\n--\n\
         CM Dashboard Agent\n\
         Generated at {}",
        Utc::now().with_timezone(&Stockholm).format(TIME_FORMAT)
    ));

    body
}
/// Deliver one status-change email over SMTP.
///
/// The subject and body come from `format_subject` / `format_body`. The sender is
/// the configured `from_email` with every `{hostname}` placeholder replaced by this
/// host's name; the recipient is the configured `to_email`. The message is sent
/// through a transport built with `SmtpTransport::builder_dangerous` for the
/// configured host and port.
///
/// # Errors
/// Returns the underlying error when either address fails to parse, when the
/// message cannot be built, or when the SMTP transport reports a failure.
// NOTE(review): `SmtpTransport::send` is called without `.await` inside this
// `async fn`, so it presumably blocks the calling task while the mail is delivered —
// confirm this is acceptable for the agent's runtime.
async fn send_email(&self, change: &StatusChange) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let subject_line = self.format_subject(change);
    let body_text = self.format_body(change);

    // Expand the {hostname} placeholder in the configured sender address
    let sender = self.config.from_email.replace("{hostname}", &self.hostname);

    let message = Message::builder()
        .from(sender.parse()?)
        .to(self.config.to_email.parse()?)
        .subject(subject_line)
        .body(body_text)?;

    let transport = SmtpTransport::builder_dangerous(&self.config.smtp_host)
        .port(self.config.smtp_port)
        .build();

    transport.send(&message)?;
    Ok(())
}
/// Process any pending notifications (placeholder)
///
/// Currently a no-op: the body contains no statements, so awaiting it has no effect
/// on the manager's state.
pub async fn process_pending(&mut self) {
// Placeholder for batch notification processing
// Could be used for email queue processing, etc.
}
/// Load status from disk
///
/// Reads `file_path` and parses it as a JSON-encoded `PersistedStatus`.
///
/// Returns `(metric_statuses, metric_details)` from the file. When the file is
/// missing, unreadable, or not valid JSON for `PersistedStatus`, returns two empty
/// maps so the caller starts fresh. This function never fails.
///
/// A missing file is the normal first-run case and is logged at `info`; any other
/// read error (permissions, I/O failure, invalid UTF-8) is logged at `warn` with the
/// underlying error so it is not mistaken for a first run.
fn load_status(file_path: &str) -> (HashMap<String, Status>, HashMap<String, String>) {
    let content = match fs::read_to_string(file_path) {
        Ok(content) => content,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            info!("No previous status file found at {}, starting fresh", file_path);
            return (HashMap::new(), HashMap::new());
        }
        Err(e) => {
            warn!("Failed to read status file {}: {}, starting fresh", file_path, e);
            return (HashMap::new(), HashMap::new());
        }
    };

    match serde_json::from_str::<PersistedStatus>(&content) {
        Ok(persisted) => {
            info!("Loaded {} metric statuses from {}", persisted.metric_statuses.len(), file_path);
            (persisted.metric_statuses, persisted.metric_details)
        }
        Err(e) => {
            warn!("Failed to parse status file {}: {}", file_path, e);
            (HashMap::new(), HashMap::new())
        }
    }
}
/// Save status to disk
///
/// Serializes the current metric statuses and details as pretty-printed JSON and
/// stores them at `status_file_path`.
///
/// The JSON is first written to `<status_file_path>.tmp` and then renamed over the
/// real file, so an interrupted write does not leave a truncated status file behind
/// (a truncated file fails to parse on the next start and the saved state is lost).
/// If the temp-file route fails, the method falls back to writing the target file
/// directly.
///
/// Failures are logged with `warn!` and otherwise ignored: persistence is
/// best-effort and must not interrupt monitoring.
fn save_status(&self) {
    let persisted = PersistedStatus {
        metric_statuses: self.metric_statuses.clone(),
        metric_details: self.metric_details.clone(),
    };

    let content = match serde_json::to_string_pretty(&persisted) {
        Ok(content) => content,
        Err(e) => {
            warn!("Failed to serialize status: {}", e);
            return;
        }
    };

    // Write next to the target so the rename stays within one directory.
    let temp_path = format!("{}.tmp", self.status_file_path);
    let replaced = fs::write(&temp_path, &content)
        .and_then(|()| fs::rename(&temp_path, &self.status_file_path));

    if replaced.is_err() {
        // Best-effort cleanup of a possibly half-written temp file.
        let _ = fs::remove_file(&temp_path);
        // Fall back to the direct write so saving still works when only the
        // status file itself (and not its directory) is writable.
        if let Err(e) = fs::write(&self.status_file_path, content) {
            warn!("Failed to save status to {}: {}", self.status_file_path, e);
        }
    }
}
/// Get current metric statuses
///
/// Returns a shared reference to the map of last known statuses, keyed by metric
/// name. The map is borrowed, not copied.
pub fn get_metric_statuses(&self) -> &HashMap<String, Status> {
&self.metric_statuses
}
}
}