Compare commits

...

7 Commits

Author SHA1 Message Date
8a0e68f0e3 Fix Data_3 timeout by parallelizing SMART collection
All checks were successful
Build and Release / build-and-release (push) Successful in 1m10s
Root cause: SMART data was collected sequentially, one drive at a time.
With 5 drives taking ~500ms each, total collection time was 2.5+ seconds.
When disk collector runs every 1 second, this caused overlapping
collections creating resource contention. The last drive (sda/Data_3)
would timeout due to the drive being accessed by the previous collection.

Solution: Query all drives in parallel using futures::join_all. Now all
drives get their SMART data collected simultaneously with independent
3-second timeouts, eliminating contention and reducing total collection
time from 2.5+ seconds to ~500ms (the slowest single drive).

Benefits:
- All drives complete in ~500ms instead of 2.5+ seconds
- No overlapping collections causing resource contention
- Each drive gets full 3-second timeout window
- sda/Data_3 should now show temperature and serial number

Bump version to v0.1.223
2025-11-29 23:51:43 +01:00
2d653fe9ae Fix empty Storage section by configuring stdio pipes
All checks were successful
Build and Release / build-and-release (push) Successful in 1m15s
Root cause: run_command_with_timeout() was calling cmd.spawn() without
configuring stdout/stderr pipes. This caused command output to go to
journald instead of being captured by wait_with_output(). The disk
collector received empty output and failed silently.

Solution: Configure stdout(Stdio::piped()) and stderr(Stdio::piped())
before spawning commands. This ensures wait_with_output() can properly
capture command output.

Fixes: Empty Storage section, lsblk output appearing in journald
Bump version to v0.1.222
2025-11-29 23:25:17 +01:00
caba78004e Fix empty Storage section by properly aliasing command types
All checks were successful
Build and Release / build-and-release (push) Successful in 2m6s
v0.1.220 broke disk collector by changing the import from
std::process::Command to tokio::process::Command, but lines 193 and
767 explicitly used std::process::Command::new() which silently failed.

Solution: Import both as aliases (TokioCommand/StdCommand) and use
appropriate type for each operation - async commands use TokioCommand
with run_command_with_timeout, sync commands use StdCommand with
system timeout wrapper.

Fixes: Empty Storage section after v0.1.220 deployment
Bump version to v0.1.221
2025-11-29 21:29:33 +01:00
77bf08a978 Fix blocking smartctl commands with proper async/timeout handling
All checks were successful
Build and Release / build-and-release (push) Successful in 2m2s
- Changed disk collector to use tokio::process::Command instead of std::process::Command
- Updated run_command_with_timeout to properly kill processes on timeout
- Fixes issue where smartctl hangs on problematic drives (/dev/sda) freezing entire agent
- Timeout now force-kills hung processes using kill -9, preventing orphaned smartctl processes

This resolves the issue where Data_3 showed unknown status because smartctl was hanging
indefinitely trying to read from a problematic drive, blocking the entire collector.

Bump version to v0.1.220

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 21:09:04 +01:00
929870f8b6 Bump version to v0.1.219
All checks were successful
Build and Release / build-and-release (push) Successful in 1m11s
2025-11-29 18:35:14 +01:00
7aae852b7b Bump version to v0.1.218
All checks were successful
Build and Release / build-and-release (push) Successful in 1m19s
2025-11-29 17:59:33 +01:00
40f3ff66d8 Show archive count range to detect inconsistencies
- Display single number if all services have same count
- Display min-max range if counts differ (indicates problem)
2025-11-29 17:59:24 +01:00
9 changed files with 121 additions and 32 deletions

48
Cargo.lock generated
View File

@@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
[[package]]
name = "cm-dashboard"
version = "0.1.216"
version = "0.1.222"
dependencies = [
"anyhow",
"chrono",
@@ -301,7 +301,7 @@ dependencies = [
[[package]]
name = "cm-dashboard-agent"
version = "0.1.216"
version = "0.1.222"
dependencies = [
"anyhow",
"async-trait",
@@ -309,6 +309,7 @@ dependencies = [
"chrono-tz",
"clap",
"cm-dashboard-shared",
"futures",
"gethostname",
"lettre",
"reqwest",
@@ -324,7 +325,7 @@ dependencies = [
[[package]]
name = "cm-dashboard-shared"
version = "0.1.216"
version = "0.1.222"
dependencies = [
"chrono",
"serde",
@@ -552,6 +553,21 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "futures"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.31"
@@ -559,6 +575,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
@@ -567,12 +584,34 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-macro"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.31"
@@ -591,8 +630,11 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",

View File

@@ -1,6 +1,6 @@
[package]
name = "cm-dashboard-agent"
version = "0.1.217"
version = "0.1.223"
edition = "2021"
[dependencies]
@@ -20,4 +20,5 @@ gethostname = { workspace = true }
chrono-tz = "0.8"
toml = { workspace = true }
async-trait = "0.1"
reqwest = { version = "0.11", features = ["json", "blocking"] }
reqwest = { version = "0.11", features = ["json", "blocking"] }
futures = "0.3"

View File

@@ -142,12 +142,17 @@ impl BackupCollector {
// Build service list for this disk
let services: Vec<String> = backup_status.services.keys().cloned().collect();
// Get archive count per service (use minimum to show if any service has fewer backups)
let total_archives: i64 = backup_status.services.values()
// Get min and max archive counts to detect inconsistencies
let archives_min: i64 = backup_status.services.values()
.map(|service| service.archive_count)
.min()
.unwrap_or(0);
let archives_max: i64 = backup_status.services.values()
.map(|service| service.archive_count)
.max()
.unwrap_or(0);
// Create disk data
let disk_data = BackupDiskData {
serial: backup_status.disk_serial_number.unwrap_or_else(|| "Unknown".to_string()),
@@ -161,7 +166,8 @@ impl BackupCollector {
disk_total_gb: total_gb,
usage_status,
services,
total_archives,
archives_min,
archives_max,
};
disks.push(disk_data);

View File

@@ -3,7 +3,8 @@ use async_trait::async_trait;
use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, HysteresisThresholds, Status};
use crate::config::DiskConfig;
use std::process::Command;
use tokio::process::Command as TokioCommand;
use std::process::Command as StdCommand;
use std::time::Instant;
use std::collections::HashMap;
use tracing::debug;
@@ -114,7 +115,7 @@ impl DiskCollector {
async fn get_mount_devices(&self) -> Result<HashMap<String, String>, CollectorError> {
use super::run_command_with_timeout;
let mut cmd = Command::new("lsblk");
let mut cmd = TokioCommand::new("lsblk");
cmd.args(&["-rn", "-o", "NAME,MOUNTPOINT"]);
let output = run_command_with_timeout(cmd, 2).await
@@ -189,7 +190,7 @@ impl DiskCollector {
/// Get filesystem info for a single mount point
fn get_filesystem_info(&self, mount_point: &str) -> Result<(u64, u64), CollectorError> {
let output = std::process::Command::new("timeout")
let output = StdCommand::new("timeout")
.args(&["2", "df", "--block-size=1", mount_point])
.output()
.map_err(|e| CollectorError::SystemRead {
@@ -386,9 +387,9 @@ impl DiskCollector {
device.to_string()
}
/// Get SMART data for drives
/// Get SMART data for drives in parallel
async fn get_smart_data_for_drives(&self, physical_drives: &[PhysicalDrive], mergerfs_pools: &[MergerfsPool]) -> HashMap<String, SmartData> {
let mut smart_data = HashMap::new();
use futures::future::join_all;
// Collect all drive names
let mut all_drives = std::collections::HashSet::new();
@@ -404,9 +405,24 @@ impl DiskCollector {
}
}
// Get SMART data for each drive
for drive_name in all_drives {
if let Ok(data) = self.get_smart_data(&drive_name).await {
// Collect SMART data for all drives in parallel
let futures: Vec<_> = all_drives
.iter()
.map(|drive_name| {
let drive = drive_name.clone();
async move {
let result = self.get_smart_data(&drive).await;
(drive, result)
}
})
.collect();
let results = join_all(futures).await;
// Build HashMap from results
let mut smart_data = HashMap::new();
for (drive_name, result) in results {
if let Ok(data) = result {
smart_data.insert(drive_name, data);
}
}
@@ -420,7 +436,7 @@ impl DiskCollector {
// Use direct smartctl (no sudo) - service has CAP_SYS_RAWIO and CAP_SYS_ADMIN capabilities
// For NVMe drives, specify device type explicitly
let mut cmd = Command::new("smartctl");
let mut cmd = TokioCommand::new("smartctl");
if drive_name.starts_with("nvme") {
cmd.args(&["-d", "nvme", "-a", &format!("/dev/{}", drive_name)]);
} else {
@@ -763,7 +779,7 @@ impl DiskCollector {
/// Get drive information for a mount path
fn get_drive_info_for_path(&self, path: &str) -> anyhow::Result<PoolDrive> {
// Use lsblk to find the backing device with timeout
let output = Command::new("timeout")
let output = StdCommand::new("timeout")
.args(&["2", "lsblk", "-rn", "-o", "NAME,MOUNTPOINT"])
.output()
.map_err(|e| anyhow::anyhow!("Failed to run lsblk: {}", e))?;

View File

@@ -1,8 +1,7 @@
use async_trait::async_trait;
use cm_dashboard_shared::{AgentData};
use std::process::{Command, Output};
use std::process::Output;
use std::time::Duration;
use tokio::time::timeout;
pub mod backup;
pub mod cpu;
@@ -16,16 +15,34 @@ pub mod systemd;
pub use error::CollectorError;
/// Run a command with a timeout to prevent blocking
pub async fn run_command_with_timeout(mut cmd: Command, timeout_secs: u64) -> std::io::Result<Output> {
/// Properly kills the process if timeout is exceeded
pub async fn run_command_with_timeout(mut cmd: tokio::process::Command, timeout_secs: u64) -> std::io::Result<Output> {
use tokio::time::timeout;
use std::process::Stdio;
let timeout_duration = Duration::from_secs(timeout_secs);
match timeout(timeout_duration, tokio::task::spawn_blocking(move || cmd.output())).await {
Ok(Ok(result)) => result,
Ok(Err(e)) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
Err(_) => Err(std::io::Error::new(
std::io::ErrorKind::TimedOut,
format!("Command timed out after {} seconds", timeout_secs)
)),
// Configure stdio to capture output
cmd.stdout(Stdio::piped());
cmd.stderr(Stdio::piped());
let child = cmd.spawn()?;
let pid = child.id();
match timeout(timeout_duration, child.wait_with_output()).await {
Ok(result) => result,
Err(_) => {
// Timeout - force kill the process using system kill command
if let Some(process_id) = pid {
let _ = tokio::process::Command::new("kill")
.args(&["-9", &process_id.to_string()])
.output()
.await;
}
Err(std::io::Error::new(
std::io::ErrorKind::TimedOut,
format!("Command timed out after {} seconds", timeout_secs)
))
}
}
}

View File

@@ -1,6 +1,6 @@
[package]
name = "cm-dashboard"
version = "0.1.217"
version = "0.1.223"
edition = "2021"
[dependencies]

View File

@@ -566,9 +566,15 @@ impl SystemWidget {
}
// Show usage with status and archive count
let archive_display = if disk.archives_min == disk.archives_max {
format!("{}", disk.archives_min)
} else {
format!("{}-{}", disk.archives_min, disk.archives_max)
};
let usage_text = format!(
"Usage: ({}) {:.0}% {:.0}GB/{:.0}GB",
disk.total_archives,
archive_display,
disk.disk_usage_percent,
disk.disk_used_gb,
disk.disk_total_gb

View File

@@ -1,6 +1,6 @@
[package]
name = "cm-dashboard-shared"
version = "0.1.217"
version = "0.1.223"
edition = "2021"
[dependencies]

View File

@@ -195,7 +195,8 @@ pub struct BackupDiskData {
pub disk_total_gb: f32,
pub usage_status: Status,
pub services: Vec<String>,
pub total_archives: i64,
pub archives_min: i64,
pub archives_max: i64,
}
impl AgentData {