Implement heartbeat-based host connectivity detection
All checks were successful
Build and Release / build-and-release (push) Successful in 2m8s
All checks were successful
Build and Release / build-and-release (push) Successful in 2m8s
- Add agent_heartbeat metric to agent transmission for reliable host detection
- Update dashboard to track heartbeat timestamps per host instead of general metrics
- Add configurable heartbeat_timeout_seconds to dashboard ZMQ config (default 10s)
- Remove unused timeout_ms from agent config and revert to non-blocking command reception
- Remove unused heartbeat_interval_ms from agent configuration
- Host disconnect detection now uses dedicated heartbeat metrics for improved reliability
- Bump version to 0.1.57
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.56"
|
||||
version = "0.1.57"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -22,7 +22,7 @@ pub struct Dashboard {
|
||||
terminal: Option<Terminal<CrosstermBackend<io::Stdout>>>,
|
||||
headless: bool,
|
||||
initial_commands_sent: std::collections::HashSet<String>,
|
||||
_config: DashboardConfig,
|
||||
config: DashboardConfig,
|
||||
}
|
||||
|
||||
impl Dashboard {
|
||||
@@ -133,7 +133,7 @@ impl Dashboard {
|
||||
terminal,
|
||||
headless,
|
||||
initial_commands_sent: std::collections::HashSet::new(),
|
||||
_config: config,
|
||||
config,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -247,7 +247,7 @@ impl Dashboard {
|
||||
if let Some(ref mut tui_app) = self.tui_app {
|
||||
let connected_hosts = self
|
||||
.metric_store
|
||||
.get_connected_hosts(Duration::from_secs(30));
|
||||
.get_connected_hosts(Duration::from_secs(self.config.zmq.heartbeat_timeout_seconds));
|
||||
|
||||
|
||||
tui_app.update_hosts(connected_hosts);
|
||||
|
||||
@@ -141,9 +141,9 @@ impl ZmqConsumer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Receive metrics from any connected agent (non-blocking)
|
||||
/// Receive metrics from any connected agent (with timeout)
|
||||
pub async fn receive_metrics(&mut self) -> Result<Option<MetricMessage>> {
|
||||
match self.subscriber.recv_bytes(zmq::DONTWAIT) {
|
||||
match self.subscriber.recv_bytes(0) {
|
||||
Ok(data) => {
|
||||
debug!("Received {} bytes from ZMQ", data.len());
|
||||
|
||||
|
||||
@@ -16,6 +16,13 @@ pub struct DashboardConfig {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ZmqConfig {
|
||||
pub subscriber_ports: Vec<u16>,
|
||||
/// Heartbeat timeout in seconds - hosts considered offline if no heartbeat received within this time
|
||||
#[serde(default = "default_heartbeat_timeout_seconds")]
|
||||
pub heartbeat_timeout_seconds: u64,
|
||||
}
|
||||
|
||||
fn default_heartbeat_timeout_seconds() -> u64 {
|
||||
10 // Default to 10 seconds - allows for multiple missed heartbeats
|
||||
}
|
||||
|
||||
/// Individual host configuration details
|
||||
|
||||
@@ -11,8 +11,8 @@ pub struct MetricStore {
|
||||
current_metrics: HashMap<String, HashMap<String, Metric>>,
|
||||
/// Historical metrics for trending
|
||||
historical_metrics: HashMap<String, Vec<MetricDataPoint>>,
|
||||
/// Last update timestamp per host
|
||||
last_update: HashMap<String, Instant>,
|
||||
/// Last heartbeat timestamp per host
|
||||
last_heartbeat: HashMap<String, Instant>,
|
||||
/// Configuration
|
||||
max_metrics_per_host: usize,
|
||||
history_retention: Duration,
|
||||
@@ -23,7 +23,7 @@ impl MetricStore {
|
||||
Self {
|
||||
current_metrics: HashMap::new(),
|
||||
historical_metrics: HashMap::new(),
|
||||
last_update: HashMap::new(),
|
||||
last_heartbeat: HashMap::new(),
|
||||
max_metrics_per_host,
|
||||
history_retention: Duration::from_secs(history_retention_hours * 3600),
|
||||
}
|
||||
@@ -56,10 +56,13 @@ impl MetricStore {
|
||||
|
||||
// Add to history
|
||||
host_history.push(MetricDataPoint { received_at: now });
|
||||
}
|
||||
|
||||
// Update last update timestamp
|
||||
self.last_update.insert(hostname.to_string(), now);
|
||||
// Track heartbeat metrics for connectivity detection
|
||||
if metric_name == "agent_heartbeat" {
|
||||
self.last_heartbeat.insert(hostname.to_string(), now);
|
||||
debug!("Updated heartbeat for host {}", hostname);
|
||||
}
|
||||
}
|
||||
|
||||
// Get metrics count before cleanup
|
||||
let metrics_count = host_metrics.len();
|
||||
@@ -88,16 +91,18 @@ impl MetricStore {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get connected hosts (hosts with recent updates)
|
||||
/// Get connected hosts (hosts with recent heartbeats)
|
||||
pub fn get_connected_hosts(&self, timeout: Duration) -> Vec<String> {
|
||||
let now = Instant::now();
|
||||
|
||||
self.last_update
|
||||
self.last_heartbeat
|
||||
.iter()
|
||||
.filter_map(|(hostname, &last_update)| {
|
||||
if now.duration_since(last_update) <= timeout {
|
||||
.filter_map(|(hostname, &last_heartbeat)| {
|
||||
if now.duration_since(last_heartbeat) <= timeout {
|
||||
Some(hostname.clone())
|
||||
} else {
|
||||
debug!("Host {} considered offline - last heartbeat was {:?} ago",
|
||||
hostname, now.duration_since(last_heartbeat));
|
||||
None
|
||||
}
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user