From f635ba9c750663310232229ee4ae69bfee1f02b9 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Sat, 15 Nov 2025 10:04:47 +0100 Subject: [PATCH] Remove Tailscale and connection type complexity Simplifies host connection configuration by removing tailscale_ip field, connection_type preferences, and fallback retry logic. Now uses only the ip field or hostname as fallback. Eliminates blocking TCP connectivity tests that interfered with heartbeat processing. This resolves intermittent host lost/found issues by removing the connection retry timeouts that blocked the ZMQ message processing loop. --- Cargo.lock | 6 +-- agent/Cargo.toml | 2 +- agent/src/agent.rs | 58 +++++++++++++------------- dashboard/Cargo.toml | 2 +- dashboard/src/communication/mod.rs | 48 +++++++++++++--------- dashboard/src/config/mod.rs | 65 +----------------------------- shared/Cargo.toml | 2 +- 7 files changed, 69 insertions(+), 114 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d3f8c35..856d124 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,7 +270,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.68" +version = "0.1.69" dependencies = [ "anyhow", "chrono", @@ -292,7 +292,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.68" +version = "0.1.69" dependencies = [ "anyhow", "async-trait", @@ -315,7 +315,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.68" +version = "0.1.69" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 4c5a37c..9b9c3f6 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.69" +version = "0.1.70" edition = "2021" [dependencies] diff --git a/agent/src/agent.rs b/agent/src/agent.rs index d0ef87d..0ba4422 100644 --- a/agent/src/agent.rs +++ b/agent/src/agent.rs @@ -351,36 +351,40 @@ impl Agent { _ => {} } - let output = tokio::process::Command::new("sudo") - .arg("systemctl") - .arg(action_str) - .arg(format!("{}.service", service_name)) - .output() - .await?; + // Spawn the systemctl command asynchronously to avoid blocking the agent + let service_name_clone = service_name.to_string(); + let action_str_clone = action_str.to_string(); + + tokio::spawn(async move { + let result = tokio::process::Command::new("sudo") + .arg("systemctl") + .arg(&action_str_clone) + .arg(format!("{}.service", service_name_clone)) + .output() + .await; - if output.status.success() { - info!("Service {} {} completed successfully", service_name, action_str); - if !output.stdout.is_empty() { - debug!("stdout: {}", String::from_utf8_lossy(&output.stdout)); + match result { + Ok(output) => { + if output.status.success() { + info!("Service {} {} completed successfully", service_name_clone, action_str_clone); + if !output.stdout.is_empty() { + debug!("stdout: {}", String::from_utf8_lossy(&output.stdout)); + } + } else { + let stderr = String::from_utf8_lossy(&output.stderr); + error!("Service {} {} failed: {}", service_name_clone, action_str_clone, stderr); + } + } + Err(e) => { + error!("Failed to execute systemctl {} {}: {}", action_str_clone, service_name_clone, e); + } } - - // Note: User-stopped flag will be cleared by systemd collector - // when service actually reaches 'active' state, not here - } else { - let stderr = String::from_utf8_lossy(&output.stderr); - error!("Service {} {} failed: {}", service_name, action_str, stderr); - return Err(anyhow::anyhow!("systemctl {} {} failed: {}", action_str, service_name, stderr)); - } + }); - // Force refresh metrics after service control to update service status - if matches!(action, ServiceAction::Start | ServiceAction::Stop | ServiceAction::UserStart | ServiceAction::UserStop) { - info!("Triggering immediate metric refresh after service control"); - if let Err(e) = self.collect_metrics_only().await { - error!("Failed to refresh metrics after service control: {}", e); - } else { - info!("Service status refreshed immediately after {} {}", action_str, service_name); - } - } + info!("Service {} {} command initiated (non-blocking)", service_name, action_str); + + // Note: Service status will be updated by the normal metric collection cycle + // once the systemctl operation completes Ok(()) } diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index bd2777a..57ed479 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.69" +version = "0.1.70" edition = "2021" [dependencies] diff --git a/dashboard/src/communication/mod.rs b/dashboard/src/communication/mod.rs index 3643853..99e63bc 100644 --- a/dashboard/src/communication/mod.rs +++ b/dashboard/src/communication/mod.rs @@ -71,6 +71,12 @@ impl ZmqConsumer { pub async fn connect_to_host(&mut self, hostname: &str, port: u16) -> Result<()> { let address = format!("tcp://{}:{}", hostname, port); + // First test basic TCP connectivity to the port + if let Err(e) = self.test_tcp_connectivity(hostname, port).await { + error!("TCP connectivity test failed for {}: {}", address, e); + return Err(e); + } + match self.subscriber.connect(&address) { Ok(()) => { info!("Connected to agent at {}", address); @@ -84,6 +90,26 @@ impl ZmqConsumer { } } + /// Test TCP connectivity to a host and port with timeout + async fn test_tcp_connectivity(&self, hostname: &str, port: u16) -> Result<()> { + let timeout = std::time::Duration::from_secs(3); + + match tokio::time::timeout(timeout, tokio::net::TcpStream::connect((hostname, port))).await { + Ok(Ok(_stream)) => { + debug!("TCP connectivity test passed for {}:{}", hostname, port); + Ok(()) + } + Ok(Err(e)) => { + debug!("TCP connectivity test failed for {}:{}: {}", hostname, port, e); + Err(anyhow::anyhow!("TCP connection failed: {}", e)) + } + Err(_) => { + debug!("TCP connectivity test timed out for {}:{}", hostname, port); + Err(anyhow::anyhow!("TCP connection timed out")) + } + } + } + /// Connect to predefined hosts using their configuration pub async fn connect_to_predefined_hosts(&mut self, hosts: &std::collections::HashMap) -> Result<()> { let default_port = self.config.subscriber_ports[0]; @@ -104,27 +130,13 @@ impl ZmqConsumer { Ok(()) } - /// Connect to a host using its configuration details with fallback support + /// Connect to a host using its configuration details pub async fn connect_to_host_with_details(&mut self, hostname: &str, host_details: &crate::config::HostDetails, port: u16) -> Result<()> { - // Get primary connection IP + // Get primary connection IP only - no fallbacks let primary_ip = host_details.get_connection_ip(hostname); - // Try primary connection - if let Ok(()) = self.connect_to_host(&primary_ip, port).await { - info!("Connected to {} via primary address: {}", hostname, primary_ip); - return Ok(()); - } - - // Try fallback IPs if primary fails - let fallbacks = host_details.get_fallback_ips(hostname); - for fallback_ip in fallbacks { - if let Ok(()) = self.connect_to_host(&fallback_ip, port).await { - info!("Connected to {} via fallback address: {}", hostname, fallback_ip); - return Ok(()); - } - } - - Err(anyhow::anyhow!("Failed to connect to {} using all available addresses", hostname)) + // Connect directly without fallback attempts + self.connect_to_host(&primary_ip, port).await } /// Receive command output from any connected agent (non-blocking) diff --git a/dashboard/src/config/mod.rs b/dashboard/src/config/mod.rs index 3c255b7..cfd6a8d 100644 --- a/dashboard/src/config/mod.rs +++ b/dashboard/src/config/mod.rs @@ -31,76 +31,15 @@ pub struct HostDetails { pub mac_address: Option, /// Primary IP address (local network) pub ip: Option, - /// Tailscale network IP address - pub tailscale_ip: Option, - /// Preferred connection type: "local", "tailscale", or "auto" (fallback) - #[serde(default = "default_connection_type")] - pub connection_type: String, } -fn default_connection_type() -> String { - "auto".to_string() -} impl HostDetails { - /// Get the preferred IP address for connection based on connection_type + /// Get the IP address for connection (uses ip field or hostname as fallback) pub fn get_connection_ip(&self, hostname: &str) -> String { - match self.connection_type.as_str() { - "tailscale" => { - if let Some(ref ts_ip) = self.tailscale_ip { - ts_ip.clone() - } else { - // Fallback to local IP or hostname - self.ip.as_ref().unwrap_or(&hostname.to_string()).clone() - } - } - "local" => { - if let Some(ref local_ip) = self.ip { - local_ip.clone() - } else { - hostname.to_string() - } - } - "auto" | _ => { - // Try local first, then tailscale, then hostname - if let Some(ref local_ip) = self.ip { - local_ip.clone() - } else if let Some(ref ts_ip) = self.tailscale_ip { - ts_ip.clone() - } else { - hostname.to_string() - } - } - } + self.ip.as_ref().unwrap_or(&hostname.to_string()).clone() } - /// Get fallback IP addresses for connection retry - pub fn get_fallback_ips(&self, hostname: &str) -> Vec { - let mut fallbacks = Vec::new(); - - // Add all available IPs except the primary one - let primary = self.get_connection_ip(hostname); - - // Add fallbacks in priority order: local first, then tailscale - if let Some(ref local_ip) = self.ip { - if local_ip != &primary { - fallbacks.push(local_ip.clone()); - } - } - - if let Some(ref ts_ip) = self.tailscale_ip { - if ts_ip != &primary { - fallbacks.push(ts_ip.clone()); - } - } - - // Always include hostname as final fallback if not already primary - if hostname != primary { - fallbacks.push(hostname.to_string()); - } - - fallbacks - } } /// System configuration diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 0af1508..a550b5e 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.69" +version = "0.1.70" edition = "2021" [dependencies]