Reduce all command timeouts to 2-3 seconds max

With a 10-second host heartbeat timeout, every individual command timeout must be significantly lower, because commands run sequentially and their timeouts add up; otherwise total collection time cannot stay under 10 seconds.

Changed timeouts:
- smartctl: 10s → 3s (critical: multiple drives queried sequentially)
- du: 5s → 2s
- lsblk: 5s → 2s
- systemctl list commands: 5s → 3s
- systemctl show/is-active: 3s → 2s
- docker commands: 5s → 3s
- df, ip commands: 3s → 2s

Total worst-case collection time is now capped at a more reasonable level, so that slow or blocking commands no longer trigger false host-offline alerts.
This commit is contained in:
Christoffer Martinsson 2025-11-27 16:38:54 +01:00
parent 76c04633b5
commit 374b126446
5 changed files with 26 additions and 26 deletions

6
Cargo.lock generated
View File

@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
[[package]] [[package]]
name = "cm-dashboard" name = "cm-dashboard"
version = "0.1.186" version = "0.1.187"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"chrono", "chrono",
@ -301,7 +301,7 @@ dependencies = [
[[package]] [[package]]
name = "cm-dashboard-agent" name = "cm-dashboard-agent"
version = "0.1.186" version = "0.1.187"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-trait", "async-trait",
@ -324,7 +324,7 @@ dependencies = [
[[package]] [[package]]
name = "cm-dashboard-shared" name = "cm-dashboard-shared"
version = "0.1.186" version = "0.1.187"
dependencies = [ dependencies = [
"chrono", "chrono",
"serde", "serde",

View File

@ -117,7 +117,7 @@ impl DiskCollector {
let mut cmd = Command::new("lsblk"); let mut cmd = Command::new("lsblk");
cmd.args(&["-rn", "-o", "NAME,MOUNTPOINT"]); cmd.args(&["-rn", "-o", "NAME,MOUNTPOINT"]);
let output = run_command_with_timeout(cmd, 5).await let output = run_command_with_timeout(cmd, 2).await
.map_err(|e| CollectorError::SystemRead { .map_err(|e| CollectorError::SystemRead {
path: "block devices".to_string(), path: "block devices".to_string(),
error: e.to_string(), error: e.to_string(),
@ -427,7 +427,7 @@ impl DiskCollector {
cmd.args(&["-a", &format!("/dev/{}", drive_name)]); cmd.args(&["-a", &format!("/dev/{}", drive_name)]);
} }
let output = run_command_with_timeout(cmd, 10).await let output = run_command_with_timeout(cmd, 3).await
.map_err(|e| CollectorError::SystemRead { .map_err(|e| CollectorError::SystemRead {
path: format!("SMART data for {}", drive_name), path: format!("SMART data for {}", drive_name),
error: e.to_string(), error: e.to_string(),
@ -764,7 +764,7 @@ impl DiskCollector {
fn get_drive_info_for_path(&self, path: &str) -> anyhow::Result<PoolDrive> { fn get_drive_info_for_path(&self, path: &str) -> anyhow::Result<PoolDrive> {
// Use lsblk to find the backing device with timeout // Use lsblk to find the backing device with timeout
let output = Command::new("timeout") let output = Command::new("timeout")
.args(&["5", "lsblk", "-rn", "-o", "NAME,MOUNTPOINT"]) .args(&["2", "lsblk", "-rn", "-o", "NAME,MOUNTPOINT"])
.output() .output()
.map_err(|e| anyhow::anyhow!("Failed to run lsblk: {}", e))?; .map_err(|e| anyhow::anyhow!("Failed to run lsblk: {}", e))?;

View File

@ -105,8 +105,8 @@ impl MemoryCollector {
return Ok(()); return Ok(());
} }
// Get usage data for all tmpfs mounts at once using df (with 3 second timeout) // Get usage data for all tmpfs mounts at once using df (with 2 second timeout)
let mut df_args = vec!["3", "df", "--output=target,size,used", "--block-size=1"]; let mut df_args = vec!["2", "df", "--output=target,size,used", "--block-size=1"];
df_args.extend(tmpfs_mounts.iter().map(|s| s.as_str())); df_args.extend(tmpfs_mounts.iter().map(|s| s.as_str()));
let df_output = std::process::Command::new("timeout") let df_output = std::process::Command::new("timeout")

View File

@ -110,7 +110,7 @@ impl NetworkCollector {
// Parse VLAN configuration // Parse VLAN configuration
let vlan_map = Self::parse_vlan_config(); let vlan_map = Self::parse_vlan_config();
match Command::new("timeout").args(["3", "ip", "-j", "addr"]).output() { match Command::new("timeout").args(["2", "ip", "-j", "addr"]).output() {
Ok(output) if output.status.success() => { Ok(output) if output.status.success() => {
let json_str = String::from_utf8_lossy(&output.stdout); let json_str = String::from_utf8_lossy(&output.stdout);

View File

@ -251,18 +251,18 @@ impl SystemdCollector {
/// Auto-discover interesting services to monitor /// Auto-discover interesting services to monitor
fn discover_services_internal(&self) -> Result<(Vec<String>, std::collections::HashMap<String, ServiceStatusInfo>)> { fn discover_services_internal(&self) -> Result<(Vec<String>, std::collections::HashMap<String, ServiceStatusInfo>)> {
// First: Get all service unit files (with 5 second timeout) // First: Get all service unit files (with 3 second timeout)
let unit_files_output = Command::new("timeout") let unit_files_output = Command::new("timeout")
.args(&["5", "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"]) .args(&["3", "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"])
.output()?; .output()?;
if !unit_files_output.status.success() { if !unit_files_output.status.success() {
return Err(anyhow::anyhow!("systemctl list-unit-files command failed")); return Err(anyhow::anyhow!("systemctl list-unit-files command failed"));
} }
// Second: Get runtime status of all units (with 5 second timeout) // Second: Get runtime status of all units (with 3 second timeout)
let units_status_output = Command::new("timeout") let units_status_output = Command::new("timeout")
.args(&["5", "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"]) .args(&["3", "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"])
.output()?; .output()?;
if !units_status_output.status.success() { if !units_status_output.status.success() {
@ -358,16 +358,16 @@ impl SystemdCollector {
} }
} }
// Fallback to systemctl if not in cache (with 3 second timeout) // Fallback to systemctl if not in cache (with 2 second timeout)
let output = Command::new("timeout") let output = Command::new("timeout")
.args(&["3", "systemctl", "is-active", &format!("{}.service", service)]) .args(&["2", "systemctl", "is-active", &format!("{}.service", service)])
.output()?; .output()?;
let active_status = String::from_utf8(output.stdout)?.trim().to_string(); let active_status = String::from_utf8(output.stdout)?.trim().to_string();
// Get more detailed info (with 3 second timeout) // Get more detailed info (with 2 second timeout)
let output = Command::new("timeout") let output = Command::new("timeout")
.args(&["3", "systemctl", "show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"]) .args(&["2", "systemctl", "show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"])
.output()?; .output()?;
let detailed_info = String::from_utf8(output.stdout)?; let detailed_info = String::from_utf8(output.stdout)?;
@ -427,9 +427,9 @@ impl SystemdCollector {
return Ok(0.0); return Ok(0.0);
} }
// No configured path - try to get WorkingDirectory from systemctl (with 3 second timeout) // No configured path - try to get WorkingDirectory from systemctl (with 2 second timeout)
let output = Command::new("timeout") let output = Command::new("timeout")
.args(&["3", "systemctl", "show", &format!("{}.service", service_name), "--property=WorkingDirectory"]) .args(&["2", "systemctl", "show", &format!("{}.service", service_name), "--property=WorkingDirectory"])
.output() .output()
.map_err(|e| CollectorError::SystemRead { .map_err(|e| CollectorError::SystemRead {
path: format!("WorkingDirectory for {}", service_name), path: format!("WorkingDirectory for {}", service_name),
@ -449,15 +449,15 @@ impl SystemdCollector {
Ok(0.0) Ok(0.0)
} }
/// Get size of a directory in GB (with 5 second timeout) /// Get size of a directory in GB (with 2 second timeout)
async fn get_directory_size(&self, path: &str) -> Option<f32> { async fn get_directory_size(&self, path: &str) -> Option<f32> {
use super::run_command_with_timeout; use super::run_command_with_timeout;
// Use -s (summary) and --apparent-size for speed, 5 second timeout // Use -s (summary) and --apparent-size for speed, 2 second timeout
let mut cmd = Command::new("sudo"); let mut cmd = Command::new("sudo");
cmd.args(&["du", "-s", "--apparent-size", "--block-size=1", path]); cmd.args(&["du", "-s", "--apparent-size", "--block-size=1", path]);
let output = run_command_with_timeout(cmd, 5).await.ok()?; let output = run_command_with_timeout(cmd, 2).await.ok()?;
if !output.status.success() { if !output.status.success() {
// Log permission errors for debugging but don't spam logs // Log permission errors for debugging but don't spam logs
@ -783,9 +783,9 @@ impl SystemdCollector {
let mut containers = Vec::new(); let mut containers = Vec::new();
// Check if docker is available (cm-agent user is in docker group) // Check if docker is available (cm-agent user is in docker group)
// Use -a to show ALL containers (running and stopped) with 5 second timeout // Use -a to show ALL containers (running and stopped) with 3 second timeout
let output = Command::new("timeout") let output = Command::new("timeout")
.args(&["5", "docker", "ps", "-a", "--format", "{{.Names}},{{.Status}}"]) .args(&["3", "docker", "ps", "-a", "--format", "{{.Names}},{{.Status}}"])
.output(); .output();
let output = match output { let output = match output {
@ -826,9 +826,9 @@ impl SystemdCollector {
/// Get docker images as sub-services /// Get docker images as sub-services
fn get_docker_images(&self) -> Vec<(String, String, String, f32)> { fn get_docker_images(&self) -> Vec<(String, String, String, f32)> {
let mut images = Vec::new(); let mut images = Vec::new();
// Check if docker is available (cm-agent user is in docker group) with 5 second timeout // Check if docker is available (cm-agent user is in docker group) with 3 second timeout
let output = Command::new("timeout") let output = Command::new("timeout")
.args(&["5", "docker", "images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"]) .args(&["3", "docker", "images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"])
.output(); .output();
let output = match output { let output = match output {