From 374b1264462fd35db0a60589b840ee8a7a806038 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 16:38:54 +0100 Subject: [PATCH] Reduce all command timeouts to 2-3 seconds max MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With a 10-second host heartbeat timeout, all command timeouts must be significantly lower to ensure total collection time stays under 10 seconds. Changed timeouts: - smartctl: 10s → 3s (critical: multiple drives queried sequentially) - du: 5s → 2s - lsblk: 5s → 2s - systemctl list commands: 5s → 3s - systemctl show/is-active: 3s → 2s - docker commands: 5s → 3s - df, ip commands: 3s → 2s Total worst-case collection time is now capped at more reasonable levels, preventing false host-offline alerts caused by blocking operations. --- Cargo.lock | 6 +++--- agent/src/collectors/disk.rs | 6 +++--- agent/src/collectors/memory.rs | 4 ++-- agent/src/collectors/network.rs | 2 +- agent/src/collectors/systemd.rs | 34 ++++++++++++++++----------------- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 30876be..3dd5607 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.186" +version = "0.1.187" dependencies = [ "anyhow", "chrono", @@ -301,7 +301,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.186" +version = "0.1.187" dependencies = [ "anyhow", "async-trait", @@ -324,7 +324,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.186" +version = "0.1.187" dependencies = [ "chrono", "serde", diff --git a/agent/src/collectors/disk.rs b/agent/src/collectors/disk.rs index 3167f4e..71c53cf 100644 --- a/agent/src/collectors/disk.rs +++ b/agent/src/collectors/disk.rs @@ -117,7 +117,7 @@ impl DiskCollector { let mut cmd = Command::new("lsblk"); cmd.args(&["-rn", "-o",
"NAME,MOUNTPOINT"]); - let output = run_command_with_timeout(cmd, 5).await + let output = run_command_with_timeout(cmd, 2).await .map_err(|e| CollectorError::SystemRead { path: "block devices".to_string(), error: e.to_string(), @@ -427,7 +427,7 @@ impl DiskCollector { cmd.args(&["-a", &format!("/dev/{}", drive_name)]); } - let output = run_command_with_timeout(cmd, 10).await + let output = run_command_with_timeout(cmd, 3).await .map_err(|e| CollectorError::SystemRead { path: format!("SMART data for {}", drive_name), error: e.to_string(), @@ -764,7 +764,7 @@ impl DiskCollector { fn get_drive_info_for_path(&self, path: &str) -> anyhow::Result { // Use lsblk to find the backing device with timeout let output = Command::new("timeout") - .args(&["5", "lsblk", "-rn", "-o", "NAME,MOUNTPOINT"]) + .args(&["2", "lsblk", "-rn", "-o", "NAME,MOUNTPOINT"]) .output() .map_err(|e| anyhow::anyhow!("Failed to run lsblk: {}", e))?; diff --git a/agent/src/collectors/memory.rs b/agent/src/collectors/memory.rs index 2f3e86c..e186704 100644 --- a/agent/src/collectors/memory.rs +++ b/agent/src/collectors/memory.rs @@ -105,8 +105,8 @@ impl MemoryCollector { return Ok(()); } - // Get usage data for all tmpfs mounts at once using df (with 3 second timeout) - let mut df_args = vec!["3", "df", "--output=target,size,used", "--block-size=1"]; + // Get usage data for all tmpfs mounts at once using df (with 2 second timeout) + let mut df_args = vec!["2", "df", "--output=target,size,used", "--block-size=1"]; df_args.extend(tmpfs_mounts.iter().map(|s| s.as_str())); let df_output = std::process::Command::new("timeout") diff --git a/agent/src/collectors/network.rs b/agent/src/collectors/network.rs index 5286143..fd4dbe2 100644 --- a/agent/src/collectors/network.rs +++ b/agent/src/collectors/network.rs @@ -110,7 +110,7 @@ impl NetworkCollector { // Parse VLAN configuration let vlan_map = Self::parse_vlan_config(); - match Command::new("timeout").args(["3", "ip", "-j", "addr"]).output() { + match 
Command::new("timeout").args(["2", "ip", "-j", "addr"]).output() { Ok(output) if output.status.success() => { let json_str = String::from_utf8_lossy(&output.stdout); diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index 4e5a629..d8668fe 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -251,18 +251,18 @@ impl SystemdCollector { /// Auto-discover interesting services to monitor fn discover_services_internal(&self) -> Result<(Vec, std::collections::HashMap)> { - // First: Get all service unit files (with 5 second timeout) + // First: Get all service unit files (with 3 second timeout) let unit_files_output = Command::new("timeout") - .args(&["5", "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"]) + .args(&["3", "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"]) .output()?; if !unit_files_output.status.success() { return Err(anyhow::anyhow!("systemctl list-unit-files command failed")); } - // Second: Get runtime status of all units (with 5 second timeout) + // Second: Get runtime status of all units (with 3 second timeout) let units_status_output = Command::new("timeout") - .args(&["5", "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"]) + .args(&["3", "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"]) .output()?; if !units_status_output.status.success() { @@ -358,16 +358,16 @@ impl SystemdCollector { } } - // Fallback to systemctl if not in cache (with 3 second timeout) + // Fallback to systemctl if not in cache (with 2 second timeout) let output = Command::new("timeout") - .args(&["3", "systemctl", "is-active", &format!("{}.service", service)]) + .args(&["2", "systemctl", "is-active", &format!("{}.service", service)]) .output()?; let active_status = String::from_utf8(output.stdout)?.trim().to_string(); - // Get more detailed info (with 3 second timeout) + // Get more detailed info (with 2 second 
timeout) let output = Command::new("timeout") - .args(&["3", "systemctl", "show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"]) + .args(&["2", "systemctl", "show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"]) .output()?; let detailed_info = String::from_utf8(output.stdout)?; @@ -427,9 +427,9 @@ impl SystemdCollector { return Ok(0.0); } - // No configured path - try to get WorkingDirectory from systemctl (with 3 second timeout) + // No configured path - try to get WorkingDirectory from systemctl (with 2 second timeout) let output = Command::new("timeout") - .args(&["3", "systemctl", "show", &format!("{}.service", service_name), "--property=WorkingDirectory"]) + .args(&["2", "systemctl", "show", &format!("{}.service", service_name), "--property=WorkingDirectory"]) .output() .map_err(|e| CollectorError::SystemRead { path: format!("WorkingDirectory for {}", service_name), @@ -449,15 +449,15 @@ impl SystemdCollector { Ok(0.0) } - /// Get size of a directory in GB (with 5 second timeout) + /// Get size of a directory in GB (with 2 second timeout) async fn get_directory_size(&self, path: &str) -> Option { use super::run_command_with_timeout; - // Use -s (summary) and --apparent-size for speed, 5 second timeout + // Use -s (summary) and --apparent-size for speed, 2 second timeout let mut cmd = Command::new("sudo"); cmd.args(&["du", "-s", "--apparent-size", "--block-size=1", path]); - let output = run_command_with_timeout(cmd, 5).await.ok()?; + let output = run_command_with_timeout(cmd, 2).await.ok()?; if !output.status.success() { // Log permission errors for debugging but don't spam logs @@ -783,9 +783,9 @@ impl SystemdCollector { let mut containers = Vec::new(); // Check if docker is available (cm-agent user is in docker group) - // Use -a to show ALL containers (running and stopped) with 5 second timeout + // Use -a to show ALL containers (running and stopped) with 3 second timeout let output = 
Command::new("timeout") - .args(&["5", "docker", "ps", "-a", "--format", "{{.Names}},{{.Status}}"]) + .args(&["3", "docker", "ps", "-a", "--format", "{{.Names}},{{.Status}}"]) .output(); let output = match output { @@ -826,9 +826,9 @@ impl SystemdCollector { /// Get docker images as sub-services fn get_docker_images(&self) -> Vec<(String, String, String, f32)> { let mut images = Vec::new(); - // Check if docker is available (cm-agent user is in docker group) with 5 second timeout + // Check if docker is available (cm-agent user is in docker group) with 3 second timeout let output = Command::new("timeout") - .args(&["5", "docker", "images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"]) + .args(&["3", "docker", "images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"]) .output(); let output = match output {