Update to v0.1.19 with event-driven status aggregation
All checks were successful
Build and Release / build-and-release (push) Successful in 2m4s
All checks were successful
Build and Release / build-and-release (push) Successful in 2m4s
Major architectural improvements: CORE CHANGES: - Remove notification_interval_seconds - status aggregation now immediate - Status calculation moved to collection phase instead of transmission - Event-driven transmission triggers immediately on status changes - Dual transmission strategy: immediate on change + periodic backup - Real-time notifications without batching delays TECHNICAL IMPROVEMENTS: - process_metric() now returns bool indicating status change - Immediate ZMQ broadcast when status changes detected - Status aggregation happens during metric collection, not later - Legacy get_nixos_build_info() method removed (unused) - All compilation warnings fixed BEHAVIOR CHANGES: - Critical alerts sent instantly instead of waiting for intervals - Dashboard receives real-time status updates - Notifications triggered immediately on status transitions - Backup periodic transmission every 1s ensures heartbeat This provides much more responsive monitoring with instant alerting while maintaining the reliability of periodic transmission as backup.
This commit is contained in:
parent
627c533724
commit
91f037aa3e
6
Cargo.lock
generated
6
Cargo.lock
generated
@ -270,7 +270,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.18"
|
||||
version = "0.1.19"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@ -291,7 +291,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.18"
|
||||
version = "0.1.19"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@ -314,7 +314,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.18"
|
||||
version = "0.1.19"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"serde",
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.18"
|
||||
version = "0.1.19"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@ -75,7 +75,6 @@ impl Agent {
|
||||
let mut collection_interval =
|
||||
interval(Duration::from_secs(self.config.collection_interval_seconds));
|
||||
let mut transmission_interval = interval(Duration::from_secs(self.config.zmq.transmission_interval_seconds));
|
||||
let mut notification_interval = interval(Duration::from_secs(self.config.status_aggregation.notification_interval_seconds));
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
@ -86,13 +85,12 @@ impl Agent {
|
||||
}
|
||||
}
|
||||
_ = transmission_interval.tick() => {
|
||||
// Send all metrics via ZMQ every 1 second
|
||||
// Send all metrics via ZMQ and process notifications immediately
|
||||
if let Err(e) = self.broadcast_all_metrics().await {
|
||||
error!("Failed to broadcast metrics: {}", e);
|
||||
}
|
||||
}
|
||||
_ = notification_interval.tick() => {
|
||||
// Process batched notifications
|
||||
|
||||
// Process notifications immediately with each transmission
|
||||
if let Err(e) = self.host_status_manager.process_pending_notifications(&mut self.notification_manager).await {
|
||||
error!("Failed to process pending notifications: {}", e);
|
||||
}
|
||||
@ -127,8 +125,8 @@ impl Agent {
|
||||
|
||||
info!("Force collected and cached {} metrics", metrics.len());
|
||||
|
||||
// Process metrics through status manager
|
||||
self.process_metrics(&metrics).await;
|
||||
// Process metrics through status manager (don't trigger transmission on startup)
|
||||
let _status_changed = self.process_metrics(&metrics).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@ -146,8 +144,15 @@ impl Agent {
|
||||
|
||||
debug!("Collected and cached {} metrics", metrics.len());
|
||||
|
||||
// Process metrics through status manager
|
||||
self.process_metrics(&metrics).await;
|
||||
// Process metrics through status manager and trigger immediate transmission if status changed
|
||||
let status_changed = self.process_metrics(&metrics).await;
|
||||
|
||||
if status_changed {
|
||||
info!("Status change detected - triggering immediate metric transmission");
|
||||
if let Err(e) = self.broadcast_all_metrics().await {
|
||||
error!("Failed to broadcast metrics after status change: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@ -181,11 +186,15 @@ impl Agent {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn process_metrics(&mut self, metrics: &[Metric]) {
|
||||
async fn process_metrics(&mut self, metrics: &[Metric]) -> bool {
|
||||
let mut status_changed = false;
|
||||
for metric in metrics {
|
||||
self.host_status_manager.process_metric(metric, &mut self.notification_manager).await;
|
||||
if self.host_status_manager.process_metric(metric, &mut self.notification_manager).await {
|
||||
status_changed = true;
|
||||
}
|
||||
}
|
||||
status_changed
|
||||
}
|
||||
|
||||
/// Create agent version metric for cross-host version comparison
|
||||
fn get_agent_version_metric(&self) -> Metric {
|
||||
|
||||
@ -19,31 +19,6 @@ impl NixOSCollector {
|
||||
Self {}
|
||||
}
|
||||
|
||||
/// Get NixOS build information
|
||||
fn get_nixos_build_info(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||
// Get nixos-version output directly
|
||||
let output = Command::new("nixos-version").output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err("nixos-version command failed".into());
|
||||
}
|
||||
|
||||
let version_line = String::from_utf8_lossy(&output.stdout);
|
||||
let version = version_line.trim();
|
||||
|
||||
if version.is_empty() {
|
||||
return Err("Empty nixos-version output".into());
|
||||
}
|
||||
|
||||
// Remove codename part (e.g., "(Warbler)")
|
||||
let clean_version = if let Some(pos) = version.find(" (") {
|
||||
version[..pos].to_string()
|
||||
} else {
|
||||
version.to_string()
|
||||
};
|
||||
|
||||
Ok(clean_version)
|
||||
}
|
||||
|
||||
/// Get agent hash from binary path
|
||||
fn get_agent_hash(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||
|
||||
@ -9,7 +9,6 @@ use chrono::Utc;
|
||||
pub struct HostStatusConfig {
|
||||
pub enabled: bool,
|
||||
pub aggregation_method: String, // "worst_case"
|
||||
pub notification_interval_seconds: u64,
|
||||
}
|
||||
|
||||
impl Default for HostStatusConfig {
|
||||
@ -17,7 +16,6 @@ impl Default for HostStatusConfig {
|
||||
Self {
|
||||
enabled: true,
|
||||
aggregation_method: "worst_case".to_string(),
|
||||
notification_interval_seconds: 30,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -160,25 +158,69 @@ impl HostStatusManager {
|
||||
|
||||
|
||||
|
||||
/// Process a metric - updates status (notifications handled separately via batching)
|
||||
pub async fn process_metric(&mut self, metric: &Metric, _notification_manager: &mut crate::notifications::NotificationManager) {
|
||||
// Just update status - notifications are handled by process_pending_notifications
|
||||
self.update_service_status(metric.name.clone(), metric.status);
|
||||
/// Process a metric - updates status and triggers immediate notifications if status changed
|
||||
pub async fn process_metric(&mut self, metric: &Metric, notification_manager: &mut crate::notifications::NotificationManager) -> bool {
|
||||
let old_status = self.service_statuses.get(&metric.name).copied().unwrap_or(Status::Unknown);
|
||||
let new_status = metric.status;
|
||||
|
||||
// Update status
|
||||
self.update_service_status(metric.name.clone(), new_status);
|
||||
|
||||
// Check if status actually changed
|
||||
if old_status != new_status {
|
||||
debug!("Status change detected for {}: {:?} -> {:?}", metric.name, old_status, new_status);
|
||||
|
||||
// Process notification immediately on status change
|
||||
if let Err(e) = self.process_status_change(&metric.name, old_status, new_status, notification_manager).await {
|
||||
error!("Failed to process status change notification: {}", e);
|
||||
}
|
||||
|
||||
/// Process pending notifications - call this at notification intervals
|
||||
return true; // Status changed - caller should trigger immediate transmission
|
||||
}
|
||||
|
||||
false // No status change
|
||||
}
|
||||
|
||||
/// Process immediate status change notification
|
||||
async fn process_status_change(&mut self, metric_name: &str, old_status: Status, new_status: Status, notification_manager: &mut crate::notifications::NotificationManager) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||
if !self.config.enabled {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Create immediate notification for this specific status change
|
||||
let status_summary = StatusChangeSummary {
|
||||
service_name: metric_name.to_string(),
|
||||
initial_status: old_status,
|
||||
final_status: new_status,
|
||||
change_count: 1,
|
||||
};
|
||||
|
||||
let aggregated = AggregatedStatusChanges {
|
||||
start_time: Instant::now(),
|
||||
end_time: Instant::now(),
|
||||
service_summaries: vec![status_summary],
|
||||
host_status_initial: old_status,
|
||||
host_status_final: new_status,
|
||||
requires_notification: true,
|
||||
};
|
||||
|
||||
// Send immediate notification using existing method
|
||||
if let Err(e) = self.send_aggregated_email(&aggregated, notification_manager).await {
|
||||
error!("Failed to send immediate notification: {}", e);
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
info!("Sent immediate notification for {} status change: {:?} -> {:?}", metric_name, old_status, new_status);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Process pending notifications - legacy method, now rarely used
|
||||
pub async fn process_pending_notifications(&mut self, notification_manager: &mut crate::notifications::NotificationManager) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||
if !self.config.enabled || self.pending_changes.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let batch_start = self.batch_start_time.unwrap_or_else(Instant::now);
|
||||
let batch_duration = batch_start.elapsed();
|
||||
|
||||
// Only process if enough time has passed
|
||||
if batch_duration.as_secs() < self.config.notification_interval_seconds {
|
||||
return Ok(());
|
||||
}
|
||||
// Process notifications immediately without interval batching
|
||||
|
||||
// Create aggregated status changes
|
||||
let aggregated = self.create_aggregated_changes();
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.18"
|
||||
version = "0.1.19"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@ -14,7 +14,7 @@ use app::Dashboard;
|
||||
|
||||
/// Get hardcoded version
|
||||
fn get_version() -> &'static str {
|
||||
"v0.1.18"
|
||||
"v0.1.19"
|
||||
}
|
||||
|
||||
/// Check if running inside tmux session
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.18"
|
||||
version = "0.1.19"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user