Implement heartbeat-based host connectivity detection
All checks were successful
Build and Release / build-and-release (push) Successful in 2m8s

- Add agent_heartbeat metric to agent transmission for reliable host detection
- Update dashboard to track heartbeat timestamps per host instead of general metrics
- Add configurable heartbeat_timeout_seconds to dashboard ZMQ config (default 10s)
- Remove unused timeout_ms from agent config and revert to non-blocking command reception
- Remove unused heartbeat_interval_ms from agent configuration
- Host disconnect detection now uses dedicated heartbeat metrics for improved reliability
- Bump version to 0.1.57
This commit is contained in:
2025-11-06 11:04:01 +01:00
parent 0e7cf24dbb
commit 5f6e47ece5
12 changed files with 52 additions and 29 deletions

View File

@@ -1,6 +1,6 @@
[package]
name = "cm-dashboard"
version = "0.1.56"
version = "0.1.57"
edition = "2021"
[dependencies]

View File

@@ -22,7 +22,7 @@ pub struct Dashboard {
terminal: Option<Terminal<CrosstermBackend<io::Stdout>>>,
headless: bool,
initial_commands_sent: std::collections::HashSet<String>,
_config: DashboardConfig,
config: DashboardConfig,
}
impl Dashboard {
@@ -133,7 +133,7 @@ impl Dashboard {
terminal,
headless,
initial_commands_sent: std::collections::HashSet::new(),
_config: config,
config,
})
}
@@ -247,7 +247,7 @@ impl Dashboard {
if let Some(ref mut tui_app) = self.tui_app {
let connected_hosts = self
.metric_store
.get_connected_hosts(Duration::from_secs(30));
.get_connected_hosts(Duration::from_secs(self.config.zmq.heartbeat_timeout_seconds));
tui_app.update_hosts(connected_hosts);

View File

@@ -141,9 +141,9 @@ impl ZmqConsumer {
}
}
/// Receive metrics from any connected agent (non-blocking)
/// Receive metrics from any connected agent (with timeout)
pub async fn receive_metrics(&mut self) -> Result<Option<MetricMessage>> {
match self.subscriber.recv_bytes(zmq::DONTWAIT) {
match self.subscriber.recv_bytes(0) {
Ok(data) => {
debug!("Received {} bytes from ZMQ", data.len());

View File

@@ -16,6 +16,13 @@ pub struct DashboardConfig {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ZmqConfig {
pub subscriber_ports: Vec<u16>,
/// Heartbeat timeout in seconds - hosts considered offline if no heartbeat received within this time
#[serde(default = "default_heartbeat_timeout_seconds")]
pub heartbeat_timeout_seconds: u64,
}
fn default_heartbeat_timeout_seconds() -> u64 {
10 // Default to 10 seconds - allows for multiple missed heartbeats
}
/// Individual host configuration details

View File

@@ -11,8 +11,8 @@ pub struct MetricStore {
current_metrics: HashMap<String, HashMap<String, Metric>>,
/// Historical metrics for trending
historical_metrics: HashMap<String, Vec<MetricDataPoint>>,
/// Last update timestamp per host
last_update: HashMap<String, Instant>,
/// Last heartbeat timestamp per host
last_heartbeat: HashMap<String, Instant>,
/// Configuration
max_metrics_per_host: usize,
history_retention: Duration,
@@ -23,7 +23,7 @@ impl MetricStore {
Self {
current_metrics: HashMap::new(),
historical_metrics: HashMap::new(),
last_update: HashMap::new(),
last_heartbeat: HashMap::new(),
max_metrics_per_host,
history_retention: Duration::from_secs(history_retention_hours * 3600),
}
@@ -56,10 +56,13 @@ impl MetricStore {
// Add to history
host_history.push(MetricDataPoint { received_at: now });
}
// Update last update timestamp
self.last_update.insert(hostname.to_string(), now);
// Track heartbeat metrics for connectivity detection
if metric_name == "agent_heartbeat" {
self.last_heartbeat.insert(hostname.to_string(), now);
debug!("Updated heartbeat for host {}", hostname);
}
}
// Get metrics count before cleanup
let metrics_count = host_metrics.len();
@@ -88,16 +91,18 @@ impl MetricStore {
}
}
/// Get connected hosts (hosts with recent updates)
/// Get connected hosts (hosts with recent heartbeats)
pub fn get_connected_hosts(&self, timeout: Duration) -> Vec<String> {
let now = Instant::now();
self.last_update
self.last_heartbeat
.iter()
.filter_map(|(hostname, &last_update)| {
if now.duration_since(last_update) <= timeout {
.filter_map(|(hostname, &last_heartbeat)| {
if now.duration_since(last_heartbeat) <= timeout {
Some(hostname.clone())
} else {
debug!("Host {} considered offline - last heartbeat was {:?} ago",
hostname, now.duration_since(last_heartbeat));
None
}
})