From 77bf08a9786a3f4c065de60300eb381779138acd Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Sat, 29 Nov 2025 21:09:04 +0100 Subject: [PATCH] Fix blocking smartctl commands with proper async/timeout handling - Changed disk collector to use tokio::process::Command instead of std::process::Command - Updated run_command_with_timeout to properly kill processes on timeout - Fixes issue where smartctl hangs on problematic drives (/dev/sda) freezing entire agent - Timeout now force-kills hung processes using kill -9, preventing orphaned smartctl processes This resolves the issue where Data_3 showed unknown status because smartctl was hanging indefinitely trying to read from a problematic drive, blocking the entire collector. Bump version to v0.1.220 Co-Authored-By: Claude --- Cargo.lock | 6 +++--- agent/Cargo.toml | 2 +- agent/src/collectors/disk.rs | 4 ++-- agent/src/collectors/mod.rs | 32 ++++++++++++++++++++++---------- dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 6 files changed, 30 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e62a609..7b26541 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.217" +version = "0.1.219" dependencies = [ "anyhow", "chrono", @@ -301,7 +301,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.217" +version = "0.1.219" dependencies = [ "anyhow", "async-trait", @@ -324,7 +324,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.217" +version = "0.1.219" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 750977c..19a1dd9 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.219" +version = "0.1.220" edition = "2021" [dependencies] diff --git a/agent/src/collectors/disk.rs b/agent/src/collectors/disk.rs index 71c53cf..cd3af2c 100644 --- a/agent/src/collectors/disk.rs +++ b/agent/src/collectors/disk.rs @@ -3,7 +3,7 @@ use async_trait::async_trait; use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, HysteresisThresholds, Status}; use crate::config::DiskConfig; -use std::process::Command; +use tokio::process::Command; use std::time::Instant; use std::collections::HashMap; use tracing::debug; @@ -763,7 +763,7 @@ impl DiskCollector { /// Get drive information for a mount path fn get_drive_info_for_path(&self, path: &str) -> anyhow::Result { // Use lsblk to find the backing device with timeout - let output = Command::new("timeout") + let output = std::process::Command::new("timeout") .args(&["2", "lsblk", "-rn", "-o", "NAME,MOUNTPOINT"]) .output() .map_err(|e| anyhow::anyhow!("Failed to run lsblk: {}", e))?; diff --git a/agent/src/collectors/mod.rs b/agent/src/collectors/mod.rs index db338f3..2f3027e 100644 --- a/agent/src/collectors/mod.rs +++ b/agent/src/collectors/mod.rs @@ -1,8 +1,7 @@ use async_trait::async_trait; use cm_dashboard_shared::{AgentData}; -use std::process::{Command, Output}; +use std::process::Output; use std::time::Duration; -use tokio::time::timeout; pub mod backup; pub mod cpu; @@ -16,16 +15,29 @@ pub mod systemd; pub use error::CollectorError; /// Run a command with a timeout to prevent blocking -pub async fn run_command_with_timeout(mut cmd: Command, timeout_secs: u64) -> std::io::Result { +/// Properly kills the process if timeout is exceeded +pub async fn run_command_with_timeout(mut cmd: tokio::process::Command, timeout_secs: u64) -> std::io::Result { + use tokio::time::timeout; let timeout_duration = Duration::from_secs(timeout_secs); - match timeout(timeout_duration, tokio::task::spawn_blocking(move || cmd.output())).await { - Ok(Ok(result)) => result, - Ok(Err(e)) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)), - Err(_) => Err(std::io::Error::new( - std::io::ErrorKind::TimedOut, - format!("Command timed out after {} seconds", timeout_secs) - )), + let child = cmd.spawn()?; + let pid = child.id(); + + match timeout(timeout_duration, child.wait_with_output()).await { + Ok(result) => result, + Err(_) => { + // Timeout - force kill the process using system kill command + if let Some(process_id) = pid { + let _ = tokio::process::Command::new("kill") + .args(&["-9", &process_id.to_string()]) + .output() + .await; + } + Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + format!("Command timed out after {} seconds", timeout_secs) + )) + } } } diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index 696be87..577f153 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.219" +version = "0.1.220" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 4f85700..61cb8d7 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.219" +version = "0.1.220" edition = "2021" [dependencies]