Add comprehensive timeouts to all blocking system commands
Fixes random host disconnections caused by blocking operations that prevented timely ZMQ packet transmission.

Changes:
- Add run_command_with_timeout() wrapper using tokio for async command execution
- Apply 10s timeout to smartctl (prevents 30+ second hangs on failing drives)
- Apply 5s timeout to du, lsblk, and systemctl list commands
- Apply 3s timeout to systemctl show/is-active, df, and ip commands (except `ip route show default`, which uses 2s)
- Apply 2s timeout to the hostname command
- Use the system 'timeout' command for sync operations where async is not needed

Critical fixes:
- smartctl: Failing drives could block for 30+ seconds per drive
- du: Large directories (Docker, PostgreSQL) could block for 10-30+ seconds
- systemctl/docker: Commands could block indefinitely during system issues

With a 1-second collection interval and a 10-second heartbeat timeout, any blocking operation longer than 10s causes false "host offline" alerts. These timeouts ensure that collection completes quickly even during system degradation.
This commit is contained in:
@@ -51,7 +51,7 @@ impl NetworkCollector {
|
||||
|
||||
/// Get the primary physical interface (the one with default route)
|
||||
fn get_primary_physical_interface() -> Option<String> {
|
||||
match Command::new("ip").args(["route", "show", "default"]).output() {
|
||||
match Command::new("timeout").args(["2", "ip", "route", "show", "default"]).output() {
|
||||
Ok(output) if output.status.success() => {
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
// Parse: "default via 192.168.1.1 dev eno1 ..."
|
||||
@@ -110,7 +110,7 @@ impl NetworkCollector {
|
||||
// Parse VLAN configuration
|
||||
let vlan_map = Self::parse_vlan_config();
|
||||
|
||||
match Command::new("ip").args(["-j", "addr"]).output() {
|
||||
match Command::new("timeout").args(["3", "ip", "-j", "addr"]).output() {
|
||||
Ok(output) if output.status.success() => {
|
||||
let json_str = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user