Compare commits

...

4 Commits

Author SHA1 Message Date
c62c7fa698 Remove debug logging from disk collector
All checks were successful
Build and Release / build-and-release (push) Successful in 1m11s
Removed all debug! statements from disk collector to reduce log noise.

Bump version to v0.1.226
2025-11-30 00:44:38 +01:00
0b1d8c0a73 Fix Data_3 showing as unknown by handling smartctl warning exit codes
All checks were successful
Build and Release / build-and-release (push) Successful in 1m11s
Root cause: sda's temperature exceeded threshold in the past, causing
smartctl to return exit code 32 (warning: "Attributes have been <= threshold
in the past"). The agent checked output.status.success() and rejected the
entire output as failed, even though the data (serial, temperature, health)
was perfectly valid.

Smartctl's exit code is a bit mask, and its higher bits are informational warnings rather than failures:
- Exit 0: No warnings
- Exit 32 (bit 5): Attributes were at/below threshold in past
- Exit 64 (bit 6): Error log has entries
- etc.

The output data is valid regardless of these warning flags.
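
For illustration only, decoding those bit flags in Rust could look like the sketch below; the bit meanings follow the smartctl man page, the helper name is hypothetical, and the actual fix in this commit simply parses any non-empty output instead:

// Hypothetical helper: decide whether a smartctl exit code still allows the
// output to be parsed. Bits 0-2 signal real failures (command line error,
// device open failure, command failure); higher bits are informational.
fn smartctl_output_usable(exit_code: i32) -> bool {
    const FATAL_BITS: i32 = 0b0000_0111; // bits 0, 1 and 2
    exit_code & FATAL_BITS == 0
}

fn main() {
    // Exit 32 sets only bit 5 ("attributes <= threshold in the past"),
    // so the output is still worth parsing.
    assert!(smartctl_output_usable(32));
    // Exit 2 sets bit 1 (device could not be opened): treat as failed.
    assert!(!smartctl_output_usable(2));
}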

Solution: Parse output as long as it's not empty, ignore exit code.
Only return UNKNOWN if output is actually empty (command truly failed).

Result: Data_3 will now show "ZDZ4VE0B T: 31°C" instead of "? Data_3: sda"

Bump version to v0.1.225
2025-11-30 00:35:19 +01:00
c77aa6eaaa Fix Data_3 timeout by removing sequential SMART during pool detection
All checks were successful
Build and Release / build-and-release (push) Successful in 1m34s
Root cause: SMART data was collected TWICE:
1. Sequential collection during pool detection in get_drive_info_for_path()
   using problematic tokio::task::block_in_place() nesting
2. Parallel collection in get_smart_data_for_drives() (v0.1.223)

The sequential collection happened FIRST during pool detection, causing
sda (Data_3) to time out due to:
- Bad async nesting: block_in_place() wrapping block_on(), as sketched below
- Sequential execution causing runtime issues
- sda being third in the sequence, by which point the runtime had already degraded
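
For reference, the nesting pattern in question looks roughly like the sketch below; get_smart_data here is a free-function stand-in for the agent's method, and the point is only to contrast re-entering the runtime with simply awaiting:

use std::time::Duration;

// Stand-in for the agent's per-drive SMART query (illustrative only).
async fn get_smart_data(device: &str) -> String {
    tokio::time::sleep(Duration::from_millis(100)).await;
    format!("SMART data for {device}")
}

#[tokio::main]
async fn main() {
    // Old pattern (schematic): block a runtime worker thread, then re-enter
    // the runtime to drive a future to completion from inside async code.
    let via_blocking = tokio::task::block_in_place(|| {
        tokio::runtime::Handle::current().block_on(get_smart_data("sda"))
    });

    // What the fix relies on instead: stay async and await directly
    // (done once, in parallel, by get_smart_data_for_drives).
    let via_await = get_smart_data("sda").await;

    println!("{via_blocking} / {via_await}");
}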

Solution: Remove SMART collection from get_drive_info_for_path().
Pool drive temperatures are populated later from the parallel SMART
collection which properly uses futures::join_all.

Benefits:
- Eliminates problematic async nesting
- All SMART queries happen once in parallel only
- sda/Data_3 should now show serial (ZDZ4VE0B) and temperature

Bump version to v0.1.224
2025-11-30 00:14:25 +01:00
8a0e68f0e3 Fix Data_3 timeout by parallelizing SMART collection
All checks were successful
Build and Release / build-and-release (push) Successful in 1m10s
Root cause: SMART data was collected sequentially, one drive at a time.
With 5 drives taking ~500ms each, total collection time was 2.5+ seconds.
Since the disk collector runs every second, collections overlapped and
created resource contention. The last drive (sda/Data_3) would time out
because the previous collection was still accessing it.

Solution: Query all drives in parallel using futures::join_all. Now all
drives get their SMART data collected simultaneously with independent
3-second timeouts, eliminating contention and reducing total collection
time from 2.5+ seconds to ~500ms (the slowest single drive).
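
A rough sketch of that pattern follows; the drive names and the stand-in query function are hypothetical, and the real collector's 3-second timeout appears to live inside get_smart_data, whereas here it is shown explicitly:

use std::collections::HashMap;
use std::time::Duration;
use futures::future::join_all;

// Stand-in for the per-drive smartctl query used by the collector.
async fn get_smart_data(drive: &str) -> Result<String, ()> {
    tokio::time::sleep(Duration::from_millis(500)).await; // simulate ~500ms query
    Ok(format!("SMART data for {drive}"))
}

#[tokio::main]
async fn main() {
    let drives = ["sda", "sdb", "sdc", "sdd", "sde"];

    // Launch every query at once; each gets an independent 3s timeout, so
    // total wall time is roughly the slowest single drive, not the sum.
    let futures: Vec<_> = drives
        .iter()
        .map(|drive| async move {
            let result =
                tokio::time::timeout(Duration::from_secs(3), get_smart_data(drive)).await;
            (drive.to_string(), result)
        })
        .collect();

    let mut smart_data = HashMap::new();
    for (drive, result) in join_all(futures).await {
        if let Ok(Ok(data)) = result {
            smart_data.insert(drive, data);
        }
    }
    println!("collected {} drives", smart_data.len());
}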

Benefits:
- All drives complete in ~500ms instead of 2.5+ seconds
- No overlapping collections causing resource contention
- Each drive gets full 3-second timeout window
- sda/Data_3 should now show temperature and serial number

Bump version to v0.1.223
2025-11-29 23:51:43 +01:00
5 changed files with 84 additions and 45 deletions

Cargo.lock (generated, 48 changed lines)
View File

@@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
[[package]]
name = "cm-dashboard"
version = "0.1.221"
version = "0.1.225"
dependencies = [
"anyhow",
"chrono",
@@ -301,7 +301,7 @@ dependencies = [
[[package]]
name = "cm-dashboard-agent"
version = "0.1.221"
version = "0.1.225"
dependencies = [
"anyhow",
"async-trait",
@@ -309,6 +309,7 @@ dependencies = [
"chrono-tz",
"clap",
"cm-dashboard-shared",
"futures",
"gethostname",
"lettre",
"reqwest",
@@ -324,7 +325,7 @@ dependencies = [
[[package]]
name = "cm-dashboard-shared"
version = "0.1.221"
version = "0.1.225"
dependencies = [
"chrono",
"serde",
@@ -552,6 +553,21 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "futures"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.31"
@@ -559,6 +575,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
@@ -567,12 +584,34 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-macro"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.31"
@@ -591,8 +630,11 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",

View File

@@ -1,6 +1,6 @@
[package]
name = "cm-dashboard-agent"
version = "0.1.222"
version = "0.1.226"
edition = "2021"
[dependencies]
@@ -20,4 +20,5 @@ gethostname = { workspace = true }
chrono-tz = "0.8"
toml = { workspace = true }
async-trait = "0.1"
reqwest = { version = "0.11", features = ["json", "blocking"] }
reqwest = { version = "0.11", features = ["json", "blocking"] }
futures = "0.3"

View File

@@ -5,9 +5,7 @@ use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, Hyster
use crate::config::DiskConfig;
use tokio::process::Command as TokioCommand;
use std::process::Command as StdCommand;
use std::time::Instant;
use std::collections::HashMap;
use tracing::debug;
use super::{Collector, CollectorError};
@@ -68,9 +66,6 @@ impl DiskCollector {
/// Collect all storage data and populate AgentData
async fn collect_storage_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
let start_time = Instant::now();
debug!("Starting clean storage collection");
// Step 1: Get mount points and their backing devices
let mount_devices = self.get_mount_devices().await?;
@@ -105,9 +100,6 @@ impl DiskCollector {
self.populate_drives_data(&physical_drives, &smart_data, agent_data)?;
self.populate_pools_data(&mergerfs_pools, &smart_data, agent_data)?;
let elapsed = start_time.elapsed();
debug!("Storage collection completed in {:?}", elapsed);
Ok(())
}
@@ -142,7 +134,6 @@ impl DiskCollector {
}
}
debug!("Found {} mounted block devices", mount_devices.len());
Ok(mount_devices)
}
@@ -155,8 +146,8 @@ impl DiskCollector {
Ok((total, used)) => {
filesystem_usage.insert(mount_point.clone(), (total, used));
}
Err(e) => {
debug!("Failed to get filesystem info for {}: {}", mount_point, e);
Err(_e) => {
// Silently skip filesystems we can't read
}
}
}
@@ -177,8 +168,6 @@ impl DiskCollector {
// Only add if we don't already have usage data for this mount point
if !filesystem_usage.contains_key(&mount_point) {
if let Ok((total, used)) = self.get_filesystem_info(&mount_point) {
debug!("Added MergerFS filesystem usage for {}: {}GB total, {}GB used",
mount_point, total as f32 / (1024.0 * 1024.0 * 1024.0), used as f32 / (1024.0 * 1024.0 * 1024.0));
filesystem_usage.insert(mount_point, (total, used));
}
}
@@ -253,9 +242,8 @@ impl DiskCollector {
} else {
mount_point.trim_start_matches('/').replace('/', "_")
};
if pool_name.is_empty() {
debug!("Skipping mergerfs pool with empty name: {}", mount_point);
continue;
}
@@ -283,8 +271,7 @@ impl DiskCollector {
// Categorize as data vs parity drives
let (data_drives, parity_drives) = match self.categorize_pool_drives(&all_member_paths) {
Ok(drives) => drives,
Err(e) => {
debug!("Failed to categorize drives for pool {}: {}. Skipping.", mount_point, e);
Err(_e) => {
continue;
}
};
@@ -299,8 +286,7 @@ impl DiskCollector {
});
}
}
debug!("Found {} mergerfs pools", pools.len());
Ok(pools)
}
@@ -387,9 +373,9 @@ impl DiskCollector {
device.to_string()
}
/// Get SMART data for drives
/// Get SMART data for drives in parallel
async fn get_smart_data_for_drives(&self, physical_drives: &[PhysicalDrive], mergerfs_pools: &[MergerfsPool]) -> HashMap<String, SmartData> {
let mut smart_data = HashMap::new();
use futures::future::join_all;
// Collect all drive names
let mut all_drives = std::collections::HashSet::new();
@@ -405,9 +391,24 @@ impl DiskCollector {
}
}
// Get SMART data for each drive
for drive_name in all_drives {
if let Ok(data) = self.get_smart_data(&drive_name).await {
// Collect SMART data for all drives in parallel
let futures: Vec<_> = all_drives
.iter()
.map(|drive_name| {
let drive = drive_name.clone();
async move {
let result = self.get_smart_data(&drive).await;
(drive, result)
}
})
.collect();
let results = join_all(futures).await;
// Build HashMap from results
let mut smart_data = HashMap::new();
for (drive_name, result) in results {
if let Ok(data) = result {
smart_data.insert(drive_name, data);
}
}
@@ -436,8 +437,10 @@ impl DiskCollector {
let output_str = String::from_utf8_lossy(&output.stdout);
if !output.status.success() {
// Return unknown data rather than failing completely
// Note: smartctl returns non-zero exit codes for warnings (like exit code 32
// for "temperature was high in the past"), but the output data is still valid.
// Only check if we got any output at all, don't reject based on exit code.
if output_str.is_empty() {
return Ok(SmartData {
health: "UNKNOWN".to_string(),
serial_number: None,
@@ -445,7 +448,7 @@ impl DiskCollector {
wear_percent: None,
});
}
let mut health = "UNKNOWN".to_string();
let mut serial_number = None;
let mut temperature = None;
@@ -786,20 +789,13 @@ impl DiskCollector {
// Extract base device name (e.g., "sda1" -> "sda")
let base_device = self.extract_base_device(&format!("/dev/{}", device));
// Get temperature from SMART data if available
let temperature = if let Ok(smart_data) = tokio::task::block_in_place(|| {
tokio::runtime::Handle::current().block_on(self.get_smart_data(&base_device))
}) {
smart_data.temperature_celsius
} else {
None
};
// Temperature will be filled in later from parallel SMART collection
// Don't collect it here to avoid sequential blocking with problematic async nesting
Ok(PoolDrive {
name: base_device,
mount_point: path.to_string(),
temperature_celsius: temperature,
temperature_celsius: None,
})
}

View File

@@ -1,6 +1,6 @@
[package]
name = "cm-dashboard"
version = "0.1.222"
version = "0.1.226"
edition = "2021"
[dependencies]

View File

@@ -1,6 +1,6 @@
[package]
name = "cm-dashboard-shared"
version = "0.1.222"
version = "0.1.226"
edition = "2021"
[dependencies]