diff --git a/Cargo.lock b/Cargo.lock index 9535699..903463c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,7 +270,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.61" +version = "0.1.62" dependencies = [ "anyhow", "chrono", @@ -292,7 +292,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.61" +version = "0.1.62" dependencies = [ "anyhow", "async-trait", @@ -315,7 +315,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.61" +version = "0.1.62" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index c7f4a5a..404368b 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.62" +version = "0.1.63" edition = "2021" [dependencies] diff --git a/agent/src/agent.rs b/agent/src/agent.rs index e1b74a7..d0ef87d 100644 --- a/agent/src/agent.rs +++ b/agent/src/agent.rs @@ -78,10 +78,11 @@ impl Agent { info!("Initial metric collection completed - all data cached and ready"); } - // Separate intervals for collection, transmission, and email notifications + // Separate intervals for collection, transmission, heartbeat, and email notifications let mut collection_interval = interval(Duration::from_secs(self.config.collection_interval_seconds)); let mut transmission_interval = interval(Duration::from_secs(self.config.zmq.transmission_interval_seconds)); + let mut heartbeat_interval = interval(Duration::from_secs(self.config.zmq.heartbeat_interval_seconds)); let mut notification_interval = interval(Duration::from_secs(self.config.notifications.aggregation_interval_seconds)); loop { @@ -98,6 +99,12 @@ impl Agent { error!("Failed to broadcast metrics: {}", e); } } + _ = heartbeat_interval.tick() => { + // Send standalone heartbeat for host connectivity detection + if let Err(e) = self.send_heartbeat().await { + error!("Failed to send heartbeat: {}", e); + } + } _ = notification_interval.tick() => { // Process batched email notifications (separate from dashboard updates) if let Err(e) = self.host_status_manager.process_pending_notifications(&mut self.notification_manager).await { @@ -206,7 +213,7 @@ impl Agent { let mut status_changed = false; for metric in metrics { // Filter excluded metrics from email notification processing only - if self.config.exclude_email_metrics.contains(&metric.name) { + if self.config.notifications.exclude_email_metrics.contains(&metric.name) { debug!("Excluding metric '{}' from email notification processing", metric.name); continue; } @@ -252,6 +259,19 @@ impl Agent { ) } + /// Send standalone heartbeat for connectivity detection + async fn send_heartbeat(&mut self) -> Result<()> { + let heartbeat_metric = self.get_heartbeat_metric(); + let message = MetricMessage::new( + self.hostname.clone(), + vec![heartbeat_metric], + ); + + self.zmq_handler.publish_metrics(&message).await?; + debug!("Sent standalone heartbeat for connectivity detection"); + Ok(()) + } + async fn handle_commands(&mut self) -> Result<()> { // Try to receive commands (non-blocking) match self.zmq_handler.try_receive_command() { diff --git a/agent/src/config/mod.rs b/agent/src/config/mod.rs index cccf4f9..51880e5 100644 --- a/agent/src/config/mod.rs +++ b/agent/src/config/mod.rs @@ -17,9 +17,6 @@ pub struct AgentConfig { pub notifications: NotificationConfig, pub status_aggregation: HostStatusConfig, pub collection_interval_seconds: u64, - /// List of metric names to exclude from email notifications - #[serde(default)] - pub exclude_email_metrics: Vec, } /// ZMQ communication configuration @@ -29,6 +26,9 @@ pub struct ZmqConfig { pub command_port: u16, pub bind_address: String, pub transmission_interval_seconds: u64, + /// Heartbeat transmission interval in seconds for host connectivity detection + #[serde(default = "default_heartbeat_interval_seconds")] + pub heartbeat_interval_seconds: u64, } /// Collector configuration @@ -147,9 +147,16 @@ pub struct NotificationConfig { pub rate_limit_minutes: u64, /// Email notification batching interval in seconds (default: 60) pub aggregation_interval_seconds: u64, + /// List of metric names to exclude from email notifications + #[serde(default)] + pub exclude_email_metrics: Vec, } +fn default_heartbeat_interval_seconds() -> u64 { + 5 +} + impl AgentConfig { pub fn from_file>(path: P) -> Result { loader::load_config(path) diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index 878f8f0..cebb463 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.62" +version = "0.1.63" edition = "2021" [dependencies] diff --git a/dashboard/src/app.rs b/dashboard/src/app.rs index 1ae525b..92cc74f 100644 --- a/dashboard/src/app.rs +++ b/dashboard/src/app.rs @@ -278,11 +278,13 @@ impl Dashboard { // Check for host connectivity changes (heartbeat timeouts) periodically if last_heartbeat_check.elapsed() >= heartbeat_check_interval { + let timeout = Duration::from_secs(self.config.zmq.heartbeat_timeout_seconds); + + // Clean up metrics for offline hosts + self.metric_store.cleanup_offline_hosts(timeout); + if let Some(ref mut tui_app) = self.tui_app { - let connected_hosts = self - .metric_store - .get_connected_hosts(Duration::from_secs(self.config.zmq.heartbeat_timeout_seconds)); - + let connected_hosts = self.metric_store.get_connected_hosts(timeout); tui_app.update_hosts(connected_hosts); } last_heartbeat_check = Instant::now(); diff --git a/dashboard/src/metrics/store.rs b/dashboard/src/metrics/store.rs index 053e8f1..1ebc8b6 100644 --- a/dashboard/src/metrics/store.rs +++ b/dashboard/src/metrics/store.rs @@ -109,6 +109,28 @@ impl MetricStore { .collect() } + /// Clean up data for offline hosts + pub fn cleanup_offline_hosts(&mut self, timeout: Duration) { + let now = Instant::now(); + let mut hosts_to_cleanup = Vec::new(); + + // Find hosts that are offline (no recent heartbeat) + for (hostname, &last_heartbeat) in &self.last_heartbeat { + if now.duration_since(last_heartbeat) > timeout { + hosts_to_cleanup.push(hostname.clone()); + } + } + + // Clear metrics for offline hosts + for hostname in hosts_to_cleanup { + if let Some(metrics) = self.current_metrics.remove(&hostname) { + info!("Cleared {} metrics for offline host: {}", metrics.len(), hostname); + } + // Keep heartbeat timestamp for reconnection detection + // Don't remove from last_heartbeat to track when host was last seen + } + } + /// Cleanup old data and enforce limits fn cleanup_host_data(&mut self, hostname: &str) { let now = Instant::now(); diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 367c9b8..bd41f89 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.62" +version = "0.1.63" edition = "2021" [dependencies]