Add comprehensive timeouts to all blocking system commands
Fixes random host disconnections caused by blocking operations preventing timely ZMQ packet transmission. Changes: - Add run_command_with_timeout() wrapper using tokio for async command execution - Apply 10s timeout to smartctl (prevents 30+ second hangs on failing drives) - Apply 5s timeout to du, lsblk, systemctl list commands - Apply 3s timeout to systemctl show/is-active, df, ip commands - Apply 2s timeout to hostname command - Use system 'timeout' command for sync operations where async not needed Critical fixes: - smartctl: Failing drives could block for 30+ seconds per drive - du: Large directories (Docker, PostgreSQL) could block 10-30+ seconds - systemctl/docker: Commands could block indefinitely during system issues With 1-second collection interval and 10-second heartbeat timeout, any blocking operation >10s causes false "host offline" alerts. These timeouts ensure collection completes quickly even during system degradation.
This commit is contained in:
@@ -4,7 +4,7 @@ use cm_dashboard_shared::{AgentData, ServiceData, SubServiceData, SubServiceMetr
|
||||
use std::process::Command;
|
||||
use std::sync::RwLock;
|
||||
use std::time::Instant;
|
||||
use tracing::debug;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
use crate::config::SystemdConfig;
|
||||
@@ -251,18 +251,18 @@ impl SystemdCollector {
|
||||
|
||||
/// Auto-discover interesting services to monitor
|
||||
fn discover_services_internal(&self) -> Result<(Vec<String>, std::collections::HashMap<String, ServiceStatusInfo>)> {
|
||||
// First: Get all service unit files
|
||||
let unit_files_output = Command::new("systemctl")
|
||||
.args(&["list-unit-files", "--type=service", "--no-pager", "--plain"])
|
||||
// First: Get all service unit files (with 5 second timeout)
|
||||
let unit_files_output = Command::new("timeout")
|
||||
.args(&["5", "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"])
|
||||
.output()?;
|
||||
|
||||
if !unit_files_output.status.success() {
|
||||
return Err(anyhow::anyhow!("systemctl list-unit-files command failed"));
|
||||
}
|
||||
|
||||
// Second: Get runtime status of all units
|
||||
let units_status_output = Command::new("systemctl")
|
||||
.args(&["list-units", "--type=service", "--all", "--no-pager", "--plain"])
|
||||
// Second: Get runtime status of all units (with 5 second timeout)
|
||||
let units_status_output = Command::new("timeout")
|
||||
.args(&["5", "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"])
|
||||
.output()?;
|
||||
|
||||
if !units_status_output.status.success() {
|
||||
@@ -358,16 +358,16 @@ impl SystemdCollector {
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to systemctl if not in cache
|
||||
let output = Command::new("systemctl")
|
||||
.args(&["is-active", &format!("{}.service", service)])
|
||||
// Fallback to systemctl if not in cache (with 3 second timeout)
|
||||
let output = Command::new("timeout")
|
||||
.args(&["3", "systemctl", "is-active", &format!("{}.service", service)])
|
||||
.output()?;
|
||||
|
||||
let active_status = String::from_utf8(output.stdout)?.trim().to_string();
|
||||
|
||||
// Get more detailed info
|
||||
let output = Command::new("systemctl")
|
||||
.args(&["show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"])
|
||||
// Get more detailed info (with 3 second timeout)
|
||||
let output = Command::new("timeout")
|
||||
.args(&["3", "systemctl", "show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"])
|
||||
.output()?;
|
||||
|
||||
let detailed_info = String::from_utf8(output.stdout)?;
|
||||
@@ -419,7 +419,7 @@ impl SystemdCollector {
|
||||
if let Some(dirs) = self.config.service_directories.get(service_name) {
|
||||
// Service has configured paths - use the first accessible one
|
||||
for dir in dirs {
|
||||
if let Some(size) = self.get_directory_size(dir) {
|
||||
if let Some(size) = self.get_directory_size(dir).await {
|
||||
return Ok(size);
|
||||
}
|
||||
}
|
||||
@@ -427,9 +427,9 @@ impl SystemdCollector {
|
||||
return Ok(0.0);
|
||||
}
|
||||
|
||||
// No configured path - try to get WorkingDirectory from systemctl
|
||||
let output = Command::new("systemctl")
|
||||
.args(&["show", &format!("{}.service", service_name), "--property=WorkingDirectory"])
|
||||
// No configured path - try to get WorkingDirectory from systemctl (with 3 second timeout)
|
||||
let output = Command::new("timeout")
|
||||
.args(&["3", "systemctl", "show", &format!("{}.service", service_name), "--property=WorkingDirectory"])
|
||||
.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: format!("WorkingDirectory for {}", service_name),
|
||||
@@ -441,7 +441,7 @@ impl SystemdCollector {
|
||||
if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") {
|
||||
let dir = line.strip_prefix("WorkingDirectory=").unwrap_or("");
|
||||
if !dir.is_empty() && dir != "/" {
|
||||
return Ok(self.get_directory_size(dir).unwrap_or(0.0));
|
||||
return Ok(self.get_directory_size(dir).await.unwrap_or(0.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -449,18 +449,23 @@ impl SystemdCollector {
|
||||
Ok(0.0)
|
||||
}
|
||||
|
||||
/// Get size of a directory in GB
|
||||
fn get_directory_size(&self, path: &str) -> Option<f32> {
|
||||
let output = Command::new("sudo")
|
||||
.args(&["du", "-sb", path])
|
||||
.output()
|
||||
.ok()?;
|
||||
/// Get size of a directory in GB (with 5 second timeout)
|
||||
async fn get_directory_size(&self, path: &str) -> Option<f32> {
|
||||
use super::run_command_with_timeout;
|
||||
|
||||
// Use -s (summary) and --apparent-size for speed, 5 second timeout
|
||||
let mut cmd = Command::new("sudo");
|
||||
cmd.args(&["du", "-s", "--apparent-size", "--block-size=1", path]);
|
||||
|
||||
let output = run_command_with_timeout(cmd, 5).await.ok()?;
|
||||
|
||||
if !output.status.success() {
|
||||
// Log permission errors for debugging but don't spam logs
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
if stderr.contains("Permission denied") {
|
||||
debug!("Permission denied accessing directory: {}", path);
|
||||
} else if stderr.contains("timed out") {
|
||||
warn!("Directory size check timed out for {}", path);
|
||||
} else {
|
||||
debug!("Failed to get size for directory {}: {}", path, stderr);
|
||||
}
|
||||
@@ -778,9 +783,9 @@ impl SystemdCollector {
|
||||
let mut containers = Vec::new();
|
||||
|
||||
// Check if docker is available (cm-agent user is in docker group)
|
||||
// Use -a to show ALL containers (running and stopped)
|
||||
let output = Command::new("docker")
|
||||
.args(&["ps", "-a", "--format", "{{.Names}},{{.Status}}"])
|
||||
// Use -a to show ALL containers (running and stopped) with 5 second timeout
|
||||
let output = Command::new("timeout")
|
||||
.args(&["5", "docker", "ps", "-a", "--format", "{{.Names}},{{.Status}}"])
|
||||
.output();
|
||||
|
||||
let output = match output {
|
||||
@@ -821,9 +826,9 @@ impl SystemdCollector {
|
||||
/// Get docker images as sub-services
|
||||
fn get_docker_images(&self) -> Vec<(String, String, String, f32)> {
|
||||
let mut images = Vec::new();
|
||||
// Check if docker is available (cm-agent user is in docker group)
|
||||
let output = Command::new("docker")
|
||||
.args(&["images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"])
|
||||
// Check if docker is available (cm-agent user is in docker group) with 5 second timeout
|
||||
let output = Command::new("timeout")
|
||||
.args(&["5", "docker", "images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"])
|
||||
.output();
|
||||
|
||||
let output = match output {
|
||||
|
||||
Reference in New Issue
Block a user