Implement comprehensive monitoring improvements

- Add full email notifications with lettre and Stockholm timezone
- Add status persistence to prevent notification spam on restart
- Change nginx monitoring to check backend proxy_pass URLs instead of frontend domains
- Increase nginx site timeout to 10 seconds for backend health checks
- Fix cache intervals: disk (5min), backup (10min), systemd (30s), cpu/memory (5s)
- Remove rate limiting for immediate notifications on all status changes
- Store metric status in /var/lib/cm-dashboard/last-status.json (see the example right after this list)
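
To make the persistence format concrete, the file written by the agent could look roughly like the sketch below. This is an illustration only: the field names follow the PersistedStatus struct introduced in this commit, but the metric names and the exact serialized form of the Status values are assumptions.

{
  "metric_statuses": {
    "systemd_nginx": "Warning",
    "disk_root": "Ok"
  },
  "metric_details": {
    "systemd_nginx": "Value: degraded"
  }
}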
Christoffer Martinsson 2025-10-20 14:32:44 +02:00
parent ecaf3aedb5
commit 66a79574e0
5 changed files with 260 additions and 95 deletions

View File

@@ -17,6 +17,7 @@ tracing = { workspace = true }
tracing-subscriber = { workspace = true }
lettre = { workspace = true }
gethostname = { workspace = true }
chrono-tz = "0.8"
toml = { workspace = true }
async-trait = "0.1"
reqwest = { version = "0.11", features = ["json", "blocking"] }

View File

@@ -595,8 +595,8 @@ impl SystemdCollector {
// Create HTTP client with timeouts (similar to legacy implementation)
let client = reqwest::blocking::Client::builder()
.timeout(Duration::from_secs(5))
.connect_timeout(Duration::from_secs(2))
.timeout(Duration::from_secs(10))
.connect_timeout(Duration::from_secs(10))
.redirect(reqwest::redirect::Policy::limited(10))
.build()?;
@@ -739,12 +739,9 @@ impl SystemdCollector {
while i < lines.len() {
let line = lines[i].trim();
if line.starts_with("server") && line.contains("{") {
debug!("Found server block at line {}", i);
if let Some(server_name) = self.parse_server_block(&lines, &mut i) {
debug!("Extracted server name: {}", server_name);
let url = format!("https://{}", server_name);
// Use the full domain as the site name for clarity
sites.push((server_name.clone(), url));
if let Some(proxy_url) = self.parse_server_block(&lines, &mut i) {
let site_name = proxy_url.replace("http://", "").replace("https://", "");
sites.push((site_name, proxy_url));
}
}
i += 1;
@@ -758,6 +755,7 @@ impl SystemdCollector {
fn parse_server_block(&self, lines: &[&str], start_index: &mut usize) -> Option<String> {
use tracing::debug;
let mut server_names = Vec::new();
let mut proxy_pass_url = None;
let mut has_redirect = false;
let mut i = *start_index + 1;
let mut brace_count = 1;
@@ -787,6 +785,17 @@
}
}
// Extract proxy_pass URL (backend IP:port)
if trimmed.starts_with("proxy_pass") {
if let Some(url_part) = trimmed.strip_prefix("proxy_pass") {
let url_clean = url_part.trim().trim_end_matches(';');
if !url_clean.is_empty() {
proxy_pass_url = Some(url_clean.to_string());
debug!("Found proxy_pass in block: {}", url_clean);
}
}
}
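// Illustrative walk-through (the address is hypothetical, not from this commit):
// a directive such as
//     proxy_pass http://127.0.0.1:8080;
// has the "proxy_pass" prefix and the trailing ';' stripped, leaving
// proxy_pass_url = Some("http://127.0.0.1:8080"); the calling loop above then
// drops the scheme to form the site name "127.0.0.1:8080".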
// Check for redirects (skip redirect-only servers)
if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) {
has_redirect = true;
@@ -797,11 +806,12 @@
*start_index = i - 1;
// Only return hostnames that are not redirects and have actual content
if !server_names.is_empty() && !has_redirect {
Some(server_names[0].clone())
} else {
None
if let Some(proxy_url) = proxy_pass_url {
if !has_redirect {
return Some(proxy_url);
}
}
None
}
}

View File

@@ -1,4 +1,4 @@
// Collection intervals
// Collection intervals
pub const DEFAULT_COLLECTION_INTERVAL_SECONDS: u64 = 2;
pub const DEFAULT_CPU_INTERVAL_SECONDS: u64 = 5;
pub const DEFAULT_MEMORY_INTERVAL_SECONDS: u64 = 5;
@@ -46,10 +46,9 @@ pub const DEFAULT_SMART_WEAR_CRITICAL: f32 = 90.0;
// Backup configuration
pub const DEFAULT_BACKUP_MAX_AGE_HOURS: u64 = 48;
// Notification configuration (from legacy)
pub const DEFAULT_SMTP_HOST: &str = "localhost";
pub const DEFAULT_SMTP_PORT: u16 = 25;
pub const DEFAULT_FROM_EMAIL: &str = "{hostname}@cmtec.se";
pub const DEFAULT_TO_EMAIL: &str = "cm@cmtec.se";
pub const DEFAULT_NOTIFICATION_RATE_LIMIT_MINUTES: u64 = 30;
pub const DEFAULT_NOTIFICATION_RATE_LIMIT_MINUTES: u64 = 0;

View File

@@ -182,11 +182,13 @@ impl MetricCollectionManager {
for collector in &self.collectors {
let collector_name = collector.name();
// Determine cache interval for this collector type - ALL REALTIME FOR FAST UPDATES
// Determine cache interval for this collector type based on data volatility
let cache_interval_secs = match collector_name {
"cpu" | "memory" | "disk" | "systemd" => 2, // All realtime for fast updates
"backup" => 10, // Backup metrics every 10 seconds for testing
_ => 2, // All realtime for fast updates
"cpu" | "memory" => 5, // Fast updates for volatile metrics
"systemd" => 30, // Service status changes less frequently
"disk" => 300, // SMART data changes very slowly (5 minutes)
"backup" => 600, // Backup status changes rarely (10 minutes)
_ => 30, // Default: moderate frequency
};
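// Rough effect (assuming should_collect below compares time elapsed since the
// last collection against cache_interval_secs, with the 2 s loop from
// DEFAULT_COLLECTION_INTERVAL_SECONDS): "disk" re-collects on about every
// 150th pass (300 s / 2 s), "systemd" about every 15th, and "cpu"/"memory"
// every two to three passes.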
let should_collect =

View File

@@ -1,16 +1,29 @@
use cm_dashboard_shared::Status;
use std::collections::HashMap;
use std::time::Instant;
use tracing::{info, debug};
use std::fs;
use std::path::Path;
use tracing::{debug, info, error, warn};
use chrono::{DateTime, Utc};
use chrono_tz::Europe::Stockholm;
use lettre::{Message, SmtpTransport, Transport};
use serde::{Serialize, Deserialize};
use crate::config::NotificationConfig;
/// Persisted status data
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PersistedStatus {
metric_statuses: HashMap<String, Status>,
metric_details: HashMap<String, String>,
}
/// Manages status change tracking and notifications
pub struct NotificationManager {
config: NotificationConfig,
hostname: String,
metric_statuses: HashMap<String, Status>,
last_notification_times: HashMap<String, Instant>,
metric_details: HashMap<String, String>, // Store details for warning/critical states
status_file_path: String,
}
/// Status change information
@@ -19,129 +32,269 @@ pub struct StatusChange {
pub metric_name: String,
pub old_status: Status,
pub new_status: Status,
pub timestamp: Instant,
pub timestamp: DateTime<Utc>,
pub details: Option<String>,
}
impl NotificationManager {
pub fn new(config: &NotificationConfig, hostname: &str) -> Result<Self, anyhow::Error> {
info!("Initializing notification manager for {}", hostname);
let status_file_path = "/var/lib/cm-dashboard/last-status.json".to_string();
// Create directory if it doesn't exist
if let Some(parent) = Path::new(&status_file_path).parent() {
if let Err(e) = fs::create_dir_all(parent) {
warn!("Failed to create status directory {}: {}", parent.display(), e);
}
}
// Load previous status from disk
let (metric_statuses, metric_details) = Self::load_status(&status_file_path);
Ok(Self {
config: config.clone(),
hostname: hostname.to_string(),
metric_statuses: HashMap::new(),
last_notification_times: HashMap::new(),
metric_statuses,
metric_details,
status_file_path,
})
}
/// Update metric status and return status change if any
pub fn update_metric_status(&mut self, metric_name: &str, new_status: Status) -> Option<StatusChange> {
let old_status = self.metric_statuses.get(metric_name).copied().unwrap_or(Status::Unknown);
// Update stored status
self.metric_statuses.insert(metric_name.to_string(), new_status);
pub fn update_metric_status(
&mut self,
metric_name: &str,
new_status: Status,
) -> Option<StatusChange> {
let old_status = self
.metric_statuses
.get(metric_name)
.copied()
.unwrap_or(Status::Unknown);
// Check if status actually changed
if old_status != new_status {
debug!("Status change detected for {}: {:?} -> {:?}", metric_name, old_status, new_status);
// Update stored status only on change
self.metric_statuses
.insert(metric_name.to_string(), new_status);
// Save status to disk only when status changes
self.save_status();
debug!(
"Status change detected for {}: {:?} -> {:?}",
metric_name, old_status, new_status
);
Some(StatusChange {
metric_name: metric_name.to_string(),
old_status,
new_status,
timestamp: Instant::now(),
timestamp: Utc::now(),
details: None, // Will be populated when needed
})
} else {
// No status change - update stored status but don't save to disk
self.metric_statuses
.insert(metric_name.to_string(), new_status);
None
}
}
/// Send notification for status change (placeholder implementation)
/// Send notification for status change
pub async fn send_status_change_notification(
&mut self,
status_change: StatusChange,
mut status_change: StatusChange,
metric: &cm_dashboard_shared::Metric,
) -> Result<(), anyhow::Error> {
if !self.config.enabled {
return Ok(());
}
// Check rate limiting
if self.is_rate_limited(&status_change.metric_name) {
debug!("Notification rate limited for {}", status_change.metric_name);
// Only notify on transitions to warning/critical, or recovery to ok
let should_send = match (status_change.old_status, status_change.new_status) {
(_, Status::Warning) | (_, Status::Critical) => true,
(Status::Warning | Status::Critical, Status::Ok) => true,
_ => false,
};
if !should_send {
return Ok(());
}
// Check maintenance mode
if self.is_maintenance_mode() {
debug!("Maintenance mode active, suppressing notification for {}", status_change.metric_name);
debug!(
"Maintenance mode active, suppressing notification for {}",
status_change.metric_name
);
return Ok(());
}
info!("Would send notification for {}: {:?} -> {:?}",
status_change.metric_name, status_change.old_status, status_change.new_status);
// TODO: Implement actual email sending using lettre
// For now, just log the notification
self.log_notification(&status_change, metric);
// Update last notification time
self.last_notification_times.insert(
status_change.metric_name.clone(),
status_change.timestamp
);
// Add metric details to status change
status_change.details = Some(self.format_metric_details(metric));
// For recovery notifications, include original problem details
if status_change.new_status == Status::Ok &&
(status_change.old_status == Status::Warning || status_change.old_status == Status::Critical) {
if let Some(old_details) = self.metric_details.get(&status_change.metric_name) {
status_change.details = Some(format!(
"Recovered from: {}\nCurrent status: {}",
old_details,
status_change.details.unwrap_or_default()
));
}
// Clear stored details after recovery
self.metric_details.remove(&status_change.metric_name);
} else if status_change.new_status == Status::Warning || status_change.new_status == Status::Critical {
// Store details for warning/critical states
if let Some(ref details) = status_change.details {
self.metric_details.insert(status_change.metric_name.clone(), details.clone());
}
}
// Save status after updating details
self.save_status();
// Send the actual email
if let Err(e) = self.send_email(&status_change).await {
error!("Failed to send notification email: {}", e);
} else {
info!(
"Sent notification: {} {:?} → {:?}",
status_change.metric_name, status_change.old_status, status_change.new_status
);
}
Ok(())
}
/// Check if maintenance mode is active
fn is_maintenance_mode(&self) -> bool {
std::fs::metadata("/tmp/cm-maintenance").is_ok()
}
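// Usage note: creating the marker file (e.g. `touch /tmp/cm-maintenance`)
// before planned work suppresses notifications; deleting the file re-enables
// them, since the check above runs on every status change.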
/// Check if notification is rate limited
fn is_rate_limited(&self, metric_name: &str) -> bool {
if self.config.rate_limit_minutes == 0 {
return false; // No rate limiting
}
if let Some(last_time) = self.last_notification_times.get(metric_name) {
let elapsed = last_time.elapsed();
let rate_limit_duration = std::time::Duration::from_secs(self.config.rate_limit_minutes * 60);
elapsed < rate_limit_duration
} else {
false // No previous notification
}
/// Format metric details for notification
fn format_metric_details(&self, metric: &cm_dashboard_shared::Metric) -> String {
format!("Value: {}", metric.value.as_string())
}
/// Log notification details
fn log_notification(&self, status_change: &StatusChange, metric: &cm_dashboard_shared::Metric) {
let status_description = match status_change.new_status {
Status::Ok => "recovered",
Status::Warning => "warning",
Status::Critical => "critical",
Status::Unknown => "unknown",
/// Format email subject
fn format_subject(&self, change: &StatusChange) -> String {
let urgency = match change.new_status {
Status::Critical => "🔴 CRITICAL",
Status::Warning => "🟡 WARNING",
Status::Ok => "✅ RESOLVED",
Status::Unknown => " STATUS",
};
info!(
"NOTIFICATION: {} on {}: {} is {} (value: {})",
status_description,
self.hostname,
status_change.metric_name,
status_description,
metric.value.as_string()
);
format!("{}: {} on {}", urgency, change.metric_name, self.hostname)
}
/// Format email body
fn format_body(&self, change: &StatusChange) -> String {
let mut body = format!(
"Status Change Alert\n\
\n\
Host: {}\n\
Metric: {}\n\
Status Change: {:?} → {:?}\n\
Time: {}",
self.hostname,
change.metric_name,
change.old_status,
change.new_status,
change.timestamp.with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
);
if let Some(details) = &change.details {
body.push_str(&format!("\n\nDetails:\n{}", details));
}
body.push_str(&format!(
"\n\n--\n\
CM Dashboard Agent\n\
Generated at {}",
Utc::now().with_timezone(&Stockholm).format("%Y-%m-%d %H:%M:%S CET/CEST")
));
body
}
/// Send email notification
async fn send_email(&self, change: &StatusChange) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let subject = self.format_subject(change);
let body = self.format_body(change);
// Replace {hostname} placeholder in from_email
let from_email = self.config.from_email.replace("{hostname}", &self.hostname);
let email = Message::builder()
.from(from_email.parse()?)
.to(self.config.to_email.parse()?)
.subject(subject)
.body(body)?;
let mailer = SmtpTransport::builder_dangerous(&self.config.smtp_host)
.port(self.config.smtp_port)
.build();
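// Note: builder_dangerous() builds an unencrypted SMTP transport, and no
// credentials are set here, so mail is sent unauthenticated; that suits a
// localhost relay (the default smtp_host) but not a remote mail server.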
mailer.send(&email)?;
Ok(())
}
/// Process any pending notifications (placeholder)
pub async fn process_pending(&mut self) {
// Placeholder for batch notification processing
// Could be used for email queue processing, etc.
}
/// Load status from disk
fn load_status(file_path: &str) -> (HashMap<String, Status>, HashMap<String, String>) {
match fs::read_to_string(file_path) {
Ok(content) => {
match serde_json::from_str::<PersistedStatus>(&content) {
Ok(persisted) => {
info!("Loaded {} metric statuses from {}", persisted.metric_statuses.len(), file_path);
(persisted.metric_statuses, persisted.metric_details)
}
Err(e) => {
warn!("Failed to parse status file {}: {}", file_path, e);
(HashMap::new(), HashMap::new())
}
}
}
Err(_) => {
info!("No previous status file found at {}, starting fresh", file_path);
(HashMap::new(), HashMap::new())
}
}
}
/// Save status to disk
fn save_status(&self) {
let persisted = PersistedStatus {
metric_statuses: self.metric_statuses.clone(),
metric_details: self.metric_details.clone(),
};
match serde_json::to_string_pretty(&persisted) {
Ok(content) => {
if let Err(e) = fs::write(&self.status_file_path, content) {
warn!("Failed to save status to {}: {}", self.status_file_path, e);
}
}
Err(e) => {
warn!("Failed to serialize status: {}", e);
}
}
}
/// Get current metric statuses
pub fn get_metric_statuses(&self) -> &HashMap<String, Status> {
&self.metric_statuses
}
}
}
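
For orientation, a minimal sketch of how an agent collection loop might drive this manager. The surrounding loop is not part of this diff, so the variable names and the metric.name / metric.status fields are assumptions:

    // Feed each collected metric's status to the manager; Some(StatusChange) is
    // returned only when the status actually changed since the last update.
    if let Some(change) = notifications.update_metric_status(&metric.name, metric.status) {
        // Email the transition (and persist details); email failures are logged
        // inside send_status_change_notification rather than propagated.
        if let Err(e) = notifications.send_status_change_notification(change, &metric).await {
            tracing::error!("Notification handling failed: {}", e);
        }
    }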