Improve host disconnection detection and fix notification exclusions
All checks were successful
Build and Release / build-and-release (push) Successful in 1m34s
All checks were successful
Build and Release / build-and-release (push) Successful in 1m34s
- Add dedicated heartbeat transmission every 5 seconds independent of metric collection - Fix host offline detection by clearing metrics for disconnected hosts - Move exclude_email_metrics to NotificationConfig for better organization - Add cleanup_offline_hosts method to remove stale metrics after heartbeat timeout - Ensure offline hosts show proper status icons and visual indicators Version 0.1.63
This commit is contained in:
parent
c980346d05
commit
0faed9309e
6
Cargo.lock
generated
6
Cargo.lock
generated
@ -270,7 +270,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.61"
|
||||
version = "0.1.62"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@ -292,7 +292,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.61"
|
||||
version = "0.1.62"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@ -315,7 +315,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.61"
|
||||
version = "0.1.62"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"serde",
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.62"
|
||||
version = "0.1.63"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@ -78,10 +78,11 @@ impl Agent {
|
||||
info!("Initial metric collection completed - all data cached and ready");
|
||||
}
|
||||
|
||||
// Separate intervals for collection, transmission, and email notifications
|
||||
// Separate intervals for collection, transmission, heartbeat, and email notifications
|
||||
let mut collection_interval =
|
||||
interval(Duration::from_secs(self.config.collection_interval_seconds));
|
||||
let mut transmission_interval = interval(Duration::from_secs(self.config.zmq.transmission_interval_seconds));
|
||||
let mut heartbeat_interval = interval(Duration::from_secs(self.config.zmq.heartbeat_interval_seconds));
|
||||
let mut notification_interval = interval(Duration::from_secs(self.config.notifications.aggregation_interval_seconds));
|
||||
|
||||
loop {
|
||||
@ -98,6 +99,12 @@ impl Agent {
|
||||
error!("Failed to broadcast metrics: {}", e);
|
||||
}
|
||||
}
|
||||
_ = heartbeat_interval.tick() => {
|
||||
// Send standalone heartbeat for host connectivity detection
|
||||
if let Err(e) = self.send_heartbeat().await {
|
||||
error!("Failed to send heartbeat: {}", e);
|
||||
}
|
||||
}
|
||||
_ = notification_interval.tick() => {
|
||||
// Process batched email notifications (separate from dashboard updates)
|
||||
if let Err(e) = self.host_status_manager.process_pending_notifications(&mut self.notification_manager).await {
|
||||
@ -206,7 +213,7 @@ impl Agent {
|
||||
let mut status_changed = false;
|
||||
for metric in metrics {
|
||||
// Filter excluded metrics from email notification processing only
|
||||
if self.config.exclude_email_metrics.contains(&metric.name) {
|
||||
if self.config.notifications.exclude_email_metrics.contains(&metric.name) {
|
||||
debug!("Excluding metric '{}' from email notification processing", metric.name);
|
||||
continue;
|
||||
}
|
||||
@ -252,6 +259,19 @@ impl Agent {
|
||||
)
|
||||
}
|
||||
|
||||
/// Send standalone heartbeat for connectivity detection
|
||||
async fn send_heartbeat(&mut self) -> Result<()> {
|
||||
let heartbeat_metric = self.get_heartbeat_metric();
|
||||
let message = MetricMessage::new(
|
||||
self.hostname.clone(),
|
||||
vec![heartbeat_metric],
|
||||
);
|
||||
|
||||
self.zmq_handler.publish_metrics(&message).await?;
|
||||
debug!("Sent standalone heartbeat for connectivity detection");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_commands(&mut self) -> Result<()> {
|
||||
// Try to receive commands (non-blocking)
|
||||
match self.zmq_handler.try_receive_command() {
|
||||
|
||||
@ -17,9 +17,6 @@ pub struct AgentConfig {
|
||||
pub notifications: NotificationConfig,
|
||||
pub status_aggregation: HostStatusConfig,
|
||||
pub collection_interval_seconds: u64,
|
||||
/// List of metric names to exclude from email notifications
|
||||
#[serde(default)]
|
||||
pub exclude_email_metrics: Vec<String>,
|
||||
}
|
||||
|
||||
/// ZMQ communication configuration
|
||||
@ -29,6 +26,9 @@ pub struct ZmqConfig {
|
||||
pub command_port: u16,
|
||||
pub bind_address: String,
|
||||
pub transmission_interval_seconds: u64,
|
||||
/// Heartbeat transmission interval in seconds for host connectivity detection
|
||||
#[serde(default = "default_heartbeat_interval_seconds")]
|
||||
pub heartbeat_interval_seconds: u64,
|
||||
}
|
||||
|
||||
/// Collector configuration
|
||||
@ -147,9 +147,16 @@ pub struct NotificationConfig {
|
||||
pub rate_limit_minutes: u64,
|
||||
/// Email notification batching interval in seconds (default: 60)
|
||||
pub aggregation_interval_seconds: u64,
|
||||
/// List of metric names to exclude from email notifications
|
||||
#[serde(default)]
|
||||
pub exclude_email_metrics: Vec<String>,
|
||||
}
|
||||
|
||||
|
||||
fn default_heartbeat_interval_seconds() -> u64 {
|
||||
5
|
||||
}
|
||||
|
||||
impl AgentConfig {
|
||||
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
loader::load_config(path)
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.62"
|
||||
version = "0.1.63"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@ -278,11 +278,13 @@ impl Dashboard {
|
||||
|
||||
// Check for host connectivity changes (heartbeat timeouts) periodically
|
||||
if last_heartbeat_check.elapsed() >= heartbeat_check_interval {
|
||||
let timeout = Duration::from_secs(self.config.zmq.heartbeat_timeout_seconds);
|
||||
|
||||
// Clean up metrics for offline hosts
|
||||
self.metric_store.cleanup_offline_hosts(timeout);
|
||||
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
let connected_hosts = self
|
||||
.metric_store
|
||||
.get_connected_hosts(Duration::from_secs(self.config.zmq.heartbeat_timeout_seconds));
|
||||
|
||||
let connected_hosts = self.metric_store.get_connected_hosts(timeout);
|
||||
tui_app.update_hosts(connected_hosts);
|
||||
}
|
||||
last_heartbeat_check = Instant::now();
|
||||
|
||||
@ -109,6 +109,28 @@ impl MetricStore {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Clean up data for offline hosts
|
||||
pub fn cleanup_offline_hosts(&mut self, timeout: Duration) {
|
||||
let now = Instant::now();
|
||||
let mut hosts_to_cleanup = Vec::new();
|
||||
|
||||
// Find hosts that are offline (no recent heartbeat)
|
||||
for (hostname, &last_heartbeat) in &self.last_heartbeat {
|
||||
if now.duration_since(last_heartbeat) > timeout {
|
||||
hosts_to_cleanup.push(hostname.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// Clear metrics for offline hosts
|
||||
for hostname in hosts_to_cleanup {
|
||||
if let Some(metrics) = self.current_metrics.remove(&hostname) {
|
||||
info!("Cleared {} metrics for offline host: {}", metrics.len(), hostname);
|
||||
}
|
||||
// Keep heartbeat timestamp for reconnection detection
|
||||
// Don't remove from last_heartbeat to track when host was last seen
|
||||
}
|
||||
}
|
||||
|
||||
/// Cleanup old data and enforce limits
|
||||
fn cleanup_host_data(&mut self, hostname: &str) {
|
||||
let now = Instant::now();
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.62"
|
||||
version = "0.1.63"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user