Compare commits

...

3 Commits

Author SHA1 Message Date
caba78004e Fix empty Storage section by properly aliasing command types
All checks were successful
Build and Release / build-and-release (push) Successful in 2m6s
v0.1.220 broke disk collector by changing the import from
std::process::Command to tokio::process::Command, but lines 193 and
767 explicitly used std::process::Command::new() which silently failed.

Solution: Import both as aliases (TokioCommand/StdCommand) and use
appropriate type for each operation - async commands use TokioCommand
with run_command_with_timeout, sync commands use StdCommand with
system timeout wrapper.

Fixes: Empty Storage section after v0.1.220 deployment
Bump version to v0.1.221
2025-11-29 21:29:33 +01:00
77bf08a978 Fix blocking smartctl commands with proper async/timeout handling
All checks were successful
Build and Release / build-and-release (push) Successful in 2m2s
- Changed disk collector to use tokio::process::Command instead of std::process::Command
- Updated run_command_with_timeout to properly kill processes on timeout
- Fixes issue where smartctl hangs on problematic drives (/dev/sda) freezing entire agent
- Timeout now force-kills hung processes using kill -9, preventing orphaned smartctl processes

This resolves the issue where Data_3 showed unknown status because smartctl was hanging
indefinitely trying to read from a problematic drive, blocking the entire collector.

Bump version to v0.1.220

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 21:09:04 +01:00
929870f8b6 Bump version to v0.1.219
All checks were successful
Build and Release / build-and-release (push) Successful in 1m11s
2025-11-29 18:35:14 +01:00
6 changed files with 34 additions and 21 deletions

6
Cargo.lock generated
View File

@@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
[[package]] [[package]]
name = "cm-dashboard" name = "cm-dashboard"
version = "0.1.217" version = "0.1.220"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"chrono", "chrono",
@@ -301,7 +301,7 @@ dependencies = [
[[package]] [[package]]
name = "cm-dashboard-agent" name = "cm-dashboard-agent"
version = "0.1.217" version = "0.1.220"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-trait", "async-trait",
@@ -324,7 +324,7 @@ dependencies = [
[[package]] [[package]]
name = "cm-dashboard-shared" name = "cm-dashboard-shared"
version = "0.1.217" version = "0.1.220"
dependencies = [ dependencies = [
"chrono", "chrono",
"serde", "serde",

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "cm-dashboard-agent" name = "cm-dashboard-agent"
version = "0.1.218" version = "0.1.221"
edition = "2021" edition = "2021"
[dependencies] [dependencies]

View File

@@ -3,7 +3,8 @@ use async_trait::async_trait;
use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, HysteresisThresholds, Status}; use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, HysteresisThresholds, Status};
use crate::config::DiskConfig; use crate::config::DiskConfig;
use std::process::Command; use tokio::process::Command as TokioCommand;
use std::process::Command as StdCommand;
use std::time::Instant; use std::time::Instant;
use std::collections::HashMap; use std::collections::HashMap;
use tracing::debug; use tracing::debug;
@@ -114,7 +115,7 @@ impl DiskCollector {
async fn get_mount_devices(&self) -> Result<HashMap<String, String>, CollectorError> { async fn get_mount_devices(&self) -> Result<HashMap<String, String>, CollectorError> {
use super::run_command_with_timeout; use super::run_command_with_timeout;
let mut cmd = Command::new("lsblk"); let mut cmd = TokioCommand::new("lsblk");
cmd.args(&["-rn", "-o", "NAME,MOUNTPOINT"]); cmd.args(&["-rn", "-o", "NAME,MOUNTPOINT"]);
let output = run_command_with_timeout(cmd, 2).await let output = run_command_with_timeout(cmd, 2).await
@@ -189,7 +190,7 @@ impl DiskCollector {
/// Get filesystem info for a single mount point /// Get filesystem info for a single mount point
fn get_filesystem_info(&self, mount_point: &str) -> Result<(u64, u64), CollectorError> { fn get_filesystem_info(&self, mount_point: &str) -> Result<(u64, u64), CollectorError> {
let output = std::process::Command::new("timeout") let output = StdCommand::new("timeout")
.args(&["2", "df", "--block-size=1", mount_point]) .args(&["2", "df", "--block-size=1", mount_point])
.output() .output()
.map_err(|e| CollectorError::SystemRead { .map_err(|e| CollectorError::SystemRead {
@@ -420,7 +421,7 @@ impl DiskCollector {
// Use direct smartctl (no sudo) - service has CAP_SYS_RAWIO and CAP_SYS_ADMIN capabilities // Use direct smartctl (no sudo) - service has CAP_SYS_RAWIO and CAP_SYS_ADMIN capabilities
// For NVMe drives, specify device type explicitly // For NVMe drives, specify device type explicitly
let mut cmd = Command::new("smartctl"); let mut cmd = TokioCommand::new("smartctl");
if drive_name.starts_with("nvme") { if drive_name.starts_with("nvme") {
cmd.args(&["-d", "nvme", "-a", &format!("/dev/{}", drive_name)]); cmd.args(&["-d", "nvme", "-a", &format!("/dev/{}", drive_name)]);
} else { } else {
@@ -763,7 +764,7 @@ impl DiskCollector {
/// Get drive information for a mount path /// Get drive information for a mount path
fn get_drive_info_for_path(&self, path: &str) -> anyhow::Result<PoolDrive> { fn get_drive_info_for_path(&self, path: &str) -> anyhow::Result<PoolDrive> {
// Use lsblk to find the backing device with timeout // Use lsblk to find the backing device with timeout
let output = Command::new("timeout") let output = StdCommand::new("timeout")
.args(&["2", "lsblk", "-rn", "-o", "NAME,MOUNTPOINT"]) .args(&["2", "lsblk", "-rn", "-o", "NAME,MOUNTPOINT"])
.output() .output()
.map_err(|e| anyhow::anyhow!("Failed to run lsblk: {}", e))?; .map_err(|e| anyhow::anyhow!("Failed to run lsblk: {}", e))?;

View File

@@ -1,8 +1,7 @@
use async_trait::async_trait; use async_trait::async_trait;
use cm_dashboard_shared::{AgentData}; use cm_dashboard_shared::{AgentData};
use std::process::{Command, Output}; use std::process::Output;
use std::time::Duration; use std::time::Duration;
use tokio::time::timeout;
pub mod backup; pub mod backup;
pub mod cpu; pub mod cpu;
@@ -16,16 +15,29 @@ pub mod systemd;
pub use error::CollectorError; pub use error::CollectorError;
/// Run a command with a timeout to prevent blocking /// Run a command with a timeout to prevent blocking
pub async fn run_command_with_timeout(mut cmd: Command, timeout_secs: u64) -> std::io::Result<Output> { /// Properly kills the process if timeout is exceeded
pub async fn run_command_with_timeout(mut cmd: tokio::process::Command, timeout_secs: u64) -> std::io::Result<Output> {
use tokio::time::timeout;
let timeout_duration = Duration::from_secs(timeout_secs); let timeout_duration = Duration::from_secs(timeout_secs);
match timeout(timeout_duration, tokio::task::spawn_blocking(move || cmd.output())).await { let child = cmd.spawn()?;
Ok(Ok(result)) => result, let pid = child.id();
Ok(Err(e)) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
Err(_) => Err(std::io::Error::new( match timeout(timeout_duration, child.wait_with_output()).await {
std::io::ErrorKind::TimedOut, Ok(result) => result,
format!("Command timed out after {} seconds", timeout_secs) Err(_) => {
)), // Timeout - force kill the process using system kill command
if let Some(process_id) = pid {
let _ = tokio::process::Command::new("kill")
.args(&["-9", &process_id.to_string()])
.output()
.await;
}
Err(std::io::Error::new(
std::io::ErrorKind::TimedOut,
format!("Command timed out after {} seconds", timeout_secs)
))
}
} }
} }

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "cm-dashboard" name = "cm-dashboard"
version = "0.1.218" version = "0.1.221"
edition = "2021" edition = "2021"
[dependencies] [dependencies]

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "cm-dashboard-shared" name = "cm-dashboard-shared"
version = "0.1.218" version = "0.1.221"
edition = "2021" edition = "2021"
[dependencies] [dependencies]