Christoffer Martinsson 4f4c3b0d6e
All checks were successful
Build and Release / build-and-release (push) Successful in 2m9s
Improve notification behavior during startup and recovery
Fix notification issues for better operational experience:

Startup Notification Suppression:
- Suppress notifications for transitions from Status::Unknown during agent/server startup
- Prevents notification spam when services transition from Unknown to Warning/Critical on restart
- Only real status changes (not initial discovery) trigger notifications
- Maintains alerting for actual service state changes after startup

Recovery Notification Refinement:
- Recovery notifications only sent when ALL services reach OK status
- Individual service recoveries suppressed if other services still have problems
- Ensures recovery notifications indicate complete system health restoration
- Prevents premature celebration when partial recoveries occur

Result: Clean startup experience without false alerts and meaningful recovery
notifications that truly indicate full system health restoration.

Bump version to v0.1.48
2025-10-30 12:35:23 +01:00

422 lines
17 KiB
Rust
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use cm_dashboard_shared::{Status, Metric};
use std::collections::HashMap;
use std::time::Instant;
use tracing::{debug, info, error};
use serde::{Deserialize, Serialize};
use chrono::Utc;
/// Configuration for host status tracking and aggregation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HostStatusConfig {
/// When `false`, `HostStatusManager::update_service_status` and
/// `HostStatusManager::process_pending_notifications` return without doing anything.
pub enabled: bool,
/// Name of the aggregation strategy. Only "worst_case" is recognized; any other
/// value is logged at debug level and handled exactly like "worst_case".
pub aggregation_method: String, // "worst_case"
}
impl Default for HostStatusConfig {
fn default() -> Self {
Self {
enabled: true,
aggregation_method: "worst_case".to_string(),
}
}
}
/// Summary of everything that happened to one service during a notification batch.
#[derive(Debug, Clone)]
pub struct StatusChangeSummary {
/// Key of the service in the pending-changes buffer.
pub service_name: String,
/// Status the service had before its first change in the batch.
pub initial_status: Status,
/// Most recent status recorded for the service in the batch.
pub final_status: Status,
/// Number of changes counted for the service in the batch.
pub change_count: usize,
}
/// Snapshot of one notification batch, built by
/// `HostStatusManager::create_aggregated_changes`.
#[derive(Debug, Clone)]
pub struct AggregatedStatusChanges {
/// When the batch was opened (falls back to the snapshot time if no start was recorded).
pub start_time: Instant,
/// When the snapshot was created.
pub end_time: Instant,
/// One entry per service with buffered changes.
pub service_summaries: Vec<StatusChangeSummary>,
/// Host status when the batch was opened.
pub host_status_initial: Status,
/// Host status when the snapshot was created.
pub host_status_final: Status,
/// `true` when at least one service change or the host status change is
/// considered significant by `HostStatusManager::is_significant_change`.
pub requires_notification: bool,
}
/// Tracks per-service statuses, derives an aggregated host status from them and
/// buffers status changes so they can be reported in a single notification.
pub struct HostStatusManager {
// Latest recorded status per service name.
service_statuses: HashMap<String, Status>,
// Aggregated host status after the most recent recorded service change.
current_host_status: Status,
// Host status as it was just before the most recent recalculation; it is
// overwritten on every recorded service change, even if the host status stayed the same.
previous_host_status: Status,
// When the aggregated host status last actually changed; `None` until the first change.
last_status_change: Option<Instant>,
config: HostStatusConfig,
// Notification batching
pending_changes: HashMap<String, (Status, Status, usize)>, // service -> (initial_status, current_status, change_count)
// When the current batch was opened; `None` while no batch is open.
batch_start_time: Option<Instant>,
// Host status at the moment the current batch was opened (or last cleared).
batch_start_host_status: Status,
}
impl HostStatusManager {
pub fn new(config: HostStatusConfig) -> Self {
info!("Initializing HostStatusManager with config: {:?}", config);
Self {
service_statuses: HashMap::new(),
current_host_status: Status::Unknown,
previous_host_status: Status::Unknown,
last_status_change: None,
config,
pending_changes: HashMap::new(),
batch_start_time: None,
batch_start_host_status: Status::Unknown,
}
}
/// Record a new status for `service` and refresh the aggregated host status.
///
/// Returns without doing anything when the manager is disabled or when `status`
/// equals the status already recorded for the service (a service that has never
/// reported counts as `Status::Unknown`). Otherwise the new status is stored for
/// the dashboard and the change is added to the pending batch used for email
/// notifications.
pub fn update_service_status(&mut self, service: String, status: Status) {
    if !self.config.enabled {
        return;
    }

    let previous = self
        .service_statuses
        .get(&service)
        .copied()
        .unwrap_or(Status::Unknown);
    if previous == status {
        // Unchanged status: nothing to store and nothing to batch.
        return;
    }

    // The first change after the batch was cleared opens a new batch.
    if self.batch_start_time.is_none() {
        self.batch_start_time = Some(Instant::now());
        self.batch_start_host_status = self.current_host_status;
        debug!("Starting notification batch");
    }

    // Real-time view (for dashboard).
    self.service_statuses.insert(service.clone(), status);

    // Notification buffer: keep the initial status of the first change in the
    // batch, track the latest status and count every change.
    self.pending_changes
        .entry(service.clone())
        .and_modify(|change| {
            change.1 = status;
            change.2 += 1;
        })
        .or_insert((previous, status, 1));

    // Re-aggregate the host status from all services.
    let host_before = self.current_host_status;
    self.previous_host_status = host_before;
    self.current_host_status = self.calculate_host_status();

    if host_before != self.current_host_status {
        self.last_status_change = Some(Instant::now());
        info!(
            "Host status changed: {:?} -> {:?} (triggered by service '{}': {:?} -> {:?})",
            host_before, self.current_host_status, service, previous, status
        );
    }

    debug!(
        "Service status updated: {} {:?} -> {:?}, host status: {:?}, pending notifications: {}",
        service, previous, status, self.current_host_status, self.pending_changes.len()
    );
}
/// Build the `host_status_summary` metric that is broadcast to the dashboard.
///
/// The metric carries the current aggregated host status, a text value stating
/// how many services were aggregated, and the current UTC Unix timestamp.
pub fn get_host_status_metric(&self) -> Metric {
    let service_count = self.service_statuses.len();
    let summary = format!("Host aggregated from {} services", service_count);
    let timestamp = Utc::now().timestamp() as u64;

    Metric {
        name: String::from("host_status_summary"),
        value: cm_dashboard_shared::MetricValue::String(summary),
        status: self.current_host_status,
        timestamp,
        description: Some(String::from("Aggregated host status from all services")),
        unit: None,
    }
}
/// Derive the overall host status from all recorded service statuses.
///
/// Returns `Status::Unknown` while no service has been recorded. Otherwise the
/// statuses are combined with `Status::aggregate`. An unrecognized
/// `aggregation_method` is logged at debug level and handled like "worst_case".
fn calculate_host_status(&self) -> Status {
    if self.service_statuses.is_empty() {
        return Status::Unknown;
    }

    // "worst_case" is the only strategy; anything else falls back to it.
    if self.config.aggregation_method.as_str() != "worst_case" {
        debug!("Unknown aggregation method: {}, falling back to worst_case", self.config.aggregation_method);
    }

    let statuses: Vec<Status> = self.service_statuses.values().copied().collect();
    Status::aggregate(&statuses)
}
/// Process a metric: record its status and report whether anything changed.
///
/// The metric's status is passed to `update_service_status`, which stores it,
/// recalculates the host status and buffers the change for the next aggregated
/// notification.
///
/// Returns `true` when the status of an already known service changed or when
/// the aggregated host status changed. The first status reported for a service
/// is not counted as a service change, although it can still change the host
/// status. `_notification_manager` is currently unused.
pub async fn process_metric(&mut self, metric: &Metric, _notification_manager: &mut crate::notifications::NotificationManager) -> bool {
    let old_service_status = self.service_statuses.get(&metric.name).copied();
    let old_host_status = self.current_host_status;
    let new_service_status = metric.status;

    // `update_service_status` already buffers the change in `pending_changes`
    // (including the change count) and opens the batch. `queue_status_change`
    // is intentionally not called afterwards: doing so incremented the count a
    // second time, so a single change was reported as "(2 changes)".
    self.update_service_status(metric.name.clone(), new_service_status);
    let new_host_status = self.current_host_status;

    // A service change only counts when the service was already known.
    let service_changed = match old_service_status {
        Some(previous) if previous != new_service_status => {
            debug!("Service status change detected for {}: {:?} -> {:?}", metric.name, previous, new_service_status);
            true
        }
        Some(_) => false,
        None => {
            debug!("Initial status set for {}: {:?}", metric.name, new_service_status);
            false
        }
    };

    // A host status change should trigger immediate transmission by the caller.
    let host_changed = old_host_status != new_host_status;
    if host_changed {
        debug!("Host status change detected: {:?} -> {:?}", old_host_status, new_host_status);
    }

    service_changed || host_changed
}
/// Add one status change of `metric_name` to the pending notification batch.
///
/// A service seen for the first time in the batch starts with `old_status` as its
/// initial status and a count of zero; the latest status is then set to
/// `new_status` and the count is incremented. Opens the batch if none is open.
///
/// NOTE: `update_service_status` records the same change in `pending_changes`
/// itself, so calling both for one change increments the count twice.
fn queue_status_change(&mut self, metric_name: &str, old_status: Status, new_status: Status) {
    let (_, latest_status, change_count) = self
        .pending_changes
        .entry(metric_name.to_string())
        .or_insert((old_status, old_status, 0));
    *latest_status = new_status;
    *change_count += 1;

    // Only stamps the start time when no batch is open yet.
    self.batch_start_time.get_or_insert_with(Instant::now);
}
/// Send one aggregated notification for the buffered changes, then clear the batch.
///
/// Marked as a legacy, rarely used method by the original author. Does nothing
/// when the manager is disabled or no changes are pending. A failure to send the
/// email is logged and not propagated, so this currently always returns `Ok(())`;
/// the batch is cleared whether or not a notification was sent.
pub async fn process_pending_notifications(&mut self, notification_manager: &mut crate::notifications::NotificationManager) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    let nothing_to_do = !self.config.enabled || self.pending_changes.is_empty();
    if nothing_to_do {
        return Ok(());
    }

    // No interval batching here: whatever is pending is summarized right away.
    let aggregated = self.create_aggregated_changes();

    if !aggregated.requires_notification {
        debug!("No significant changes requiring notification in batch of {} changes", self.pending_changes.len());
    } else {
        info!("Sending aggregated notification for {} service changes", aggregated.service_summaries.len());
        let outcome = self.send_aggregated_email(&aggregated, notification_manager).await;
        if let Err(e) = outcome {
            error!("Failed to send aggregated notification: {}", e);
        }
    }

    self.clear_notification_batch();
    Ok(())
}
/// Create aggregated status changes from pending buffer
fn create_aggregated_changes(&self) -> AggregatedStatusChanges {
let mut service_summaries = Vec::new();
let mut requires_notification = false;
for (service_name, (initial_status, final_status, change_count)) in &self.pending_changes {
let significant_change = self.is_significant_change(*initial_status, *final_status);
if significant_change {
requires_notification = true;
}
service_summaries.push(StatusChangeSummary {
service_name: service_name.clone(),
initial_status: *initial_status,
final_status: *final_status,
change_count: *change_count,
});
}
// Also check if host status change is significant
if self.is_significant_change(self.batch_start_host_status, self.current_host_status) {
requires_notification = true;
}
AggregatedStatusChanges {
start_time: self.batch_start_time.unwrap_or_else(Instant::now),
end_time: Instant::now(),
service_summaries,
host_status_initial: self.batch_start_host_status,
host_status_final: self.current_host_status,
requires_notification,
}
}
/// Decide whether a transition from `old_status` to `new_status` warrants a notification.
///
/// - Transitions out of `Status::Unknown` never notify (startup/restart scenario).
/// - Any other transition into `Warning` or `Critical` always notifies.
/// - A recovery (`Warning`/`Critical` to `Ok`) notifies only while the current
///   host status is `Ok`, i.e. once everything has recovered.
/// - All remaining transitions are ignored.
fn is_significant_change(&self, old_status: Status, new_status: Status) -> bool {
    if matches!(old_status, Status::Unknown) {
        return false;
    }
    if matches!(new_status, Status::Warning | Status::Critical) {
        return true;
    }

    let is_recovery = matches!(old_status, Status::Warning | Status::Critical)
        && matches!(new_status, Status::Ok);
    is_recovery && self.current_host_status == Status::Ok
}
/// Send the aggregated batch as a single email.
///
/// The subject is "Status Alert: " followed by the non-zero counts of critical,
/// warning, recovered and started services, or the total number of service
/// changes when all counts are zero. The body comes from
/// `format_aggregated_details`.
///
/// # Errors
///
/// Returns the error from `NotificationManager::send_direct_email`, converted
/// into a boxed error.
async fn send_aggregated_email(
    &self,
    aggregated: &AggregatedStatusChanges,
    notification_manager: &mut crate::notifications::NotificationManager,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    // Tally every category in one pass over the summaries.
    let (mut critical_count, mut warning_count) = (0usize, 0usize);
    let (mut recovery_count, mut startup_count) = (0usize, 0usize);
    for summary in &aggregated.service_summaries {
        match summary.final_status {
            Status::Critical => critical_count += 1,
            Status::Warning => warning_count += 1,
            _ => {}
        }
        match (summary.initial_status, summary.final_status) {
            (Status::Warning | Status::Critical, Status::Ok) => recovery_count += 1,
            (Status::Unknown, Status::Ok | Status::Pending) => startup_count += 1,
            _ => {}
        }
    }

    let summary_parts: Vec<String> = vec![
        (critical_count > 0).then(|| format!("{} critical", critical_count)),
        (warning_count > 0).then(|| format!("{} warning", warning_count)),
        (recovery_count > 0).then(|| format!("{} recovered", recovery_count)),
        (startup_count > 0).then(|| format!("{} started", startup_count)),
    ]
    .into_iter()
    .flatten()
    .collect();

    let summary_text = match summary_parts.is_empty() {
        true => format!("{} service changes", aggregated.service_summaries.len()),
        false => summary_parts.join(", "),
    };

    let subject = format!("Status Alert: {}", summary_text);
    let body = self.format_aggregated_details(aggregated);

    notification_manager
        .send_direct_email(&subject, &body)
        .await
        .map_err(|err| err.into())
}
/// Build the plain-text body of an aggregated notification.
///
/// The body starts with the batch duration in seconds, followed by the host
/// status transition (only when it changed) and the service changes grouped
/// into critical, warning, recovery, startup and other sections. Empty sections
/// are omitted, and the recovery section is only included when the final host
/// status is `Status::Ok`.
///
/// Status transitions are rendered as `Old -> New`; previously the two statuses
/// were printed back to back with no separator (e.g. `WarningCritical`).
fn format_aggregated_details(&self, aggregated: &AggregatedStatusChanges) -> String {
    /// Append a titled section with one line per change; empty sections are skipped.
    fn push_section(details: &mut String, heading: &str, changes: &[String], blank_line_after: bool) {
        if changes.is_empty() {
            return;
        }
        details.push_str(&format!("{} ({}):\n", heading, changes.len()));
        for change in changes {
            details.push_str(&format!(" {}\n", change));
        }
        if blank_line_after {
            details.push('\n');
        }
    }

    let mut details = String::new();

    let duration = aggregated.end_time.duration_since(aggregated.start_time).as_secs();
    details.push_str(&format!("Status Summary ({}s duration)\n", duration));

    if aggregated.host_status_initial != aggregated.host_status_final {
        details.push_str(&format!(
            "Host Status: {:?} -> {:?}\n\n",
            aggregated.host_status_initial,
            aggregated.host_status_final
        ));
    }

    // Group services by change type
    let mut critical_changes = Vec::new();
    let mut warning_changes = Vec::new();
    let mut recovery_changes = Vec::new();
    let mut startup_changes = Vec::new();
    let mut other_changes = Vec::new();

    for summary in &aggregated.service_summaries {
        // Only mention the change count when the service changed more than once.
        let count_suffix = if summary.change_count > 1 {
            format!(" ({} changes)", summary.change_count)
        } else {
            String::new()
        };
        let change_info = format!(
            "{}: {:?} -> {:?}{}",
            summary.service_name,
            summary.initial_status,
            summary.final_status,
            count_suffix
        );

        // The final status takes precedence: anything ending in Critical or
        // Warning is listed as a problem regardless of where it started.
        match (summary.initial_status, summary.final_status) {
            (_, Status::Critical) => critical_changes.push(change_info),
            (_, Status::Warning) => warning_changes.push(change_info),
            (Status::Warning | Status::Critical, Status::Ok) => recovery_changes.push(change_info),
            (Status::Unknown, Status::Ok | Status::Pending) => startup_changes.push(change_info),
            _ => other_changes.push(change_info),
        }
    }

    // Problems first, most severe on top.
    push_section(&mut details, "🔴 CRITICAL ISSUES", &critical_changes, true);
    push_section(&mut details, "🟡 WARNINGS", &warning_changes, true);

    // Show recoveries only if host status is now OK (all services recovered)
    if aggregated.host_status_final == Status::Ok {
        push_section(&mut details, "✅ RECOVERIES", &recovery_changes, true);
    }

    // Startups are usually not important but good to know.
    push_section(&mut details, "🟢 SERVICE STARTUPS", &startup_changes, true);

    // The last section gets no trailing blank line.
    push_section(&mut details, " OTHER CHANGES", &other_changes, false);

    details
}
/// Reset the notification batch.
///
/// Drops all buffered changes, closes the batch and makes the current host
/// status the baseline for the next batch.
fn clear_notification_batch(&mut self) {
    self.batch_start_host_status = self.current_host_status;
    self.batch_start_time = None;
    self.pending_changes.clear();
    debug!("Cleared notification batch");
}
}
// Tests are temporarily disabled because of API changes.
// Per the original author, the functionality was verified manually.
#[cfg(test)]
mod tests {
// Placeholder: tests will be rewritten to match the new notification batching API.
}