Add comprehensive timeouts to all blocking system commands
Fixes random host disconnections caused by blocking operations that prevented timely ZMQ packet transmission.

Changes:
- Add a run_command_with_timeout() wrapper that uses tokio for async command execution
- Apply a 10s timeout to smartctl (prevents 30+ second hangs on failing drives)
- Apply a 5s timeout to du, lsblk, and systemctl list commands
- Apply a 3s timeout to systemctl show/is-active, df, and ip commands
- Apply a 2s timeout to the hostname command
- Use the system 'timeout' command for sync operations where async is not needed

Critical fixes:
- smartctl: failing drives could block for 30+ seconds per drive
- du: large directories (Docker, PostgreSQL) could block for 10-30+ seconds
- systemctl/docker: commands could block indefinitely during system issues

With a 1-second collection interval and a 10-second heartbeat timeout, any blocking operation longer than 10s causes false "host offline" alerts. These timeouts ensure collection completes quickly even during system degradation.
This commit is contained in:
@@ -105,12 +105,12 @@ impl MemoryCollector {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Get usage data for all tmpfs mounts at once using df
|
||||
let mut df_args = vec!["df", "--output=target,size,used", "--block-size=1"];
|
||||
// Get usage data for all tmpfs mounts at once using df (with 3 second timeout)
|
||||
let mut df_args = vec!["3", "df", "--output=target,size,used", "--block-size=1"];
|
||||
df_args.extend(tmpfs_mounts.iter().map(|s| s.as_str()));
|
||||
|
||||
let df_output = std::process::Command::new(df_args[0])
|
||||
.args(&df_args[1..])
|
||||
let df_output = std::process::Command::new("timeout")
|
||||
.args(&df_args[..])
|
||||
.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: "tmpfs mounts".to_string(),
|
||||
|
||||
Reference in New Issue
Block a user