use anyhow::Result; use async_trait::async_trait; use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, HysteresisThresholds, Status}; use crate::config::DiskConfig; use std::process::Command; use std::time::Instant; use std::collections::HashMap; use tracing::debug; use super::{Collector, CollectorError}; /// Storage collector with clean architecture and structured data output pub struct DiskCollector { config: DiskConfig, temperature_thresholds: HysteresisThresholds, } /// A physical drive with its filesystems #[derive(Debug, Clone)] struct PhysicalDrive { name: String, // e.g., "nvme0n1", "sda" health: String, // SMART health status temperature_celsius: Option, // Drive temperature wear_percent: Option, // SSD wear level filesystems: Vec, // mounted filesystems on this drive } /// A filesystem mounted on a drive #[derive(Debug, Clone)] struct Filesystem { mount_point: String, // e.g., "/", "/boot" usage_percent: f32, // Usage percentage used_bytes: u64, // Used bytes total_bytes: u64, // Total bytes } /// MergerFS pool #[derive(Debug, Clone)] struct MergerfsPool { name: String, // e.g., "srv_media" mount_point: String, // e.g., "/srv/media" total_bytes: u64, // Pool total bytes used_bytes: u64, // Pool used bytes data_drives: Vec, // Data drives in pool parity_drives: Vec, // Parity drives in pool } /// Drive in a storage pool #[derive(Debug, Clone)] struct PoolDrive { name: String, // Drive name temperature_celsius: Option, // Drive temperature } impl DiskCollector { pub fn new(config: DiskConfig) -> Self { let temperature_thresholds = HysteresisThresholds::new( config.temperature_warning_celsius, config.temperature_critical_celsius, ); Self { config, temperature_thresholds, } } /// Collect all storage data and populate AgentData async fn collect_storage_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> { let start_time = Instant::now(); debug!("Starting clean storage collection"); // Step 1: Get mount points and their backing devices let mount_devices = self.get_mount_devices().await?; // Step 2: Get filesystem usage for each mount point using df let filesystem_usage = self.get_filesystem_usage(&mount_devices).map_err(|e| CollectorError::Parse { value: "filesystem usage".to_string(), error: format!("Failed to get filesystem usage: {}", e), })?; // Step 3: Detect MergerFS pools let mergerfs_pools = self.detect_mergerfs_pools(&filesystem_usage).map_err(|e| CollectorError::Parse { value: "mergerfs pools".to_string(), error: format!("Failed to detect mergerfs pools: {}", e), })?; // Step 4: Group filesystems by physical drive (excluding mergerfs members) let physical_drives = self.group_by_physical_drive(&mount_devices, &filesystem_usage, &mergerfs_pools).map_err(|e| CollectorError::Parse { value: "physical drives".to_string(), error: format!("Failed to group by physical drive: {}", e), })?; // Step 5: Get SMART data for all drives let smart_data = self.get_smart_data_for_drives(&physical_drives, &mergerfs_pools).await; // Step 6: Populate AgentData self.populate_drives_data(&physical_drives, &smart_data, agent_data)?; self.populate_pools_data(&mergerfs_pools, &smart_data, agent_data)?; let elapsed = start_time.elapsed(); debug!("Storage collection completed in {:?}", elapsed); Ok(()) } /// Get block devices and their mount points using lsblk async fn get_mount_devices(&self) -> Result, CollectorError> { let output = Command::new("lsblk") .args(&["-rn", "-o", "NAME,MOUNTPOINT"]) .output() .map_err(|e| CollectorError::SystemRead { path: "block devices".to_string(), error: e.to_string(), })?; let mut mount_devices = HashMap::new(); for line in String::from_utf8_lossy(&output.stdout).lines() { let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() >= 2 { let device_name = parts[0]; let mount_point = parts[1]; // Skip swap partitions and unmounted devices if mount_point == "[SWAP]" || mount_point.is_empty() { continue; } // Convert device name to full path let device_path = format!("/dev/{}", device_name); mount_devices.insert(mount_point.to_string(), device_path); } } debug!("Found {} mounted block devices", mount_devices.len()); Ok(mount_devices) } /// Use df to get filesystem usage for mount points fn get_filesystem_usage(&self, mount_devices: &HashMap) -> anyhow::Result> { let mut filesystem_usage = HashMap::new(); for mount_point in mount_devices.keys() { match self.get_filesystem_info(mount_point) { Ok((total, used)) => { filesystem_usage.insert(mount_point.clone(), (total, used)); } Err(e) => { debug!("Failed to get filesystem info for {}: {}", mount_point, e); } } } Ok(filesystem_usage) } /// Get filesystem info for a single mount point fn get_filesystem_info(&self, mount_point: &str) -> Result<(u64, u64), CollectorError> { let output = Command::new("df") .args(&["--block-size=1", mount_point]) .output() .map_err(|e| CollectorError::SystemRead { path: format!("df {}", mount_point), error: e.to_string(), })?; let output_str = String::from_utf8_lossy(&output.stdout); let lines: Vec<&str> = output_str.lines().collect(); if lines.len() < 2 { return Err(CollectorError::Parse { value: output_str.to_string(), error: "Expected at least 2 lines from df output".to_string(), }); } // Parse the data line (skip header) let parts: Vec<&str> = lines[1].split_whitespace().collect(); if parts.len() < 4 { return Err(CollectorError::Parse { value: lines[1].to_string(), error: "Expected at least 4 fields in df output".to_string(), }); } let total_bytes: u64 = parts[1].parse().map_err(|e| CollectorError::Parse { value: parts[1].to_string(), error: format!("Failed to parse total bytes: {}", e), })?; let used_bytes: u64 = parts[2].parse().map_err(|e| CollectorError::Parse { value: parts[2].to_string(), error: format!("Failed to parse used bytes: {}", e), })?; Ok((total_bytes, used_bytes)) } /// Detect MergerFS pools from mount data fn detect_mergerfs_pools(&self, _filesystem_usage: &HashMap) -> anyhow::Result> { let pools = Vec::new(); // For now, return empty pools - full mergerfs detection would require parsing /proc/mounts for fuse.mergerfs // This ensures we don't break existing functionality Ok(pools) } /// Group filesystems by physical drive (excluding mergerfs members) fn group_by_physical_drive( &self, mount_devices: &HashMap, filesystem_usage: &HashMap, mergerfs_pools: &[MergerfsPool] ) -> anyhow::Result> { let mut drive_groups: HashMap> = HashMap::new(); // Get all mergerfs member paths to exclude them let mut mergerfs_members = std::collections::HashSet::new(); for pool in mergerfs_pools { for drive in &pool.data_drives { mergerfs_members.insert(drive.name.clone()); } for drive in &pool.parity_drives { mergerfs_members.insert(drive.name.clone()); } } // Group filesystems by base device for (mount_point, device) in mount_devices { // Skip mergerfs member mounts if mergerfs_members.contains(mount_point) { continue; } let base_device = self.extract_base_device(device); if let Some((total, used)) = filesystem_usage.get(mount_point) { let usage_percent = (*used as f32 / *total as f32) * 100.0; let filesystem = Filesystem { mount_point: mount_point.clone(), // Keep actual mount point like "/" and "/boot" usage_percent, used_bytes: *used, total_bytes: *total, }; drive_groups.entry(base_device).or_insert_with(Vec::new).push(filesystem); } } // Convert to PhysicalDrive structs let mut physical_drives = Vec::new(); for (drive_name, filesystems) in drive_groups { let physical_drive = PhysicalDrive { name: drive_name, health: "UNKNOWN".to_string(), // Will be updated with SMART data temperature_celsius: None, wear_percent: None, filesystems, }; physical_drives.push(physical_drive); } physical_drives.sort_by(|a, b| a.name.cmp(&b.name)); Ok(physical_drives) } /// Extract base device name from device path fn extract_base_device(&self, device: &str) -> String { // Extract base device name (e.g., "/dev/nvme0n1p1" -> "nvme0n1") if let Some(dev_name) = device.strip_prefix("/dev/") { // Remove partition numbers: nvme0n1p1 -> nvme0n1, sda1 -> sda if let Some(pos) = dev_name.find('p') { if dev_name[pos+1..].chars().all(char::is_numeric) { return dev_name[..pos].to_string(); } } // Handle traditional naming: sda1 -> sda let mut result = String::new(); for ch in dev_name.chars() { if ch.is_ascii_digit() { break; } result.push(ch); } if !result.is_empty() { return result; } } device.to_string() } /// Get SMART data for drives async fn get_smart_data_for_drives(&self, physical_drives: &[PhysicalDrive], mergerfs_pools: &[MergerfsPool]) -> HashMap { let mut smart_data = HashMap::new(); // Collect all drive names let mut all_drives = std::collections::HashSet::new(); for drive in physical_drives { all_drives.insert(drive.name.clone()); } for pool in mergerfs_pools { for drive in &pool.data_drives { all_drives.insert(drive.name.clone()); } for drive in &pool.parity_drives { all_drives.insert(drive.name.clone()); } } // Get SMART data for each drive for drive_name in all_drives { if let Ok(data) = self.get_smart_data(&drive_name).await { smart_data.insert(drive_name, data); } } smart_data } /// Get SMART data for a single drive async fn get_smart_data(&self, drive_name: &str) -> Result { let output = Command::new("sudo") .args(&["smartctl", "-a", &format!("/dev/{}", drive_name)]) .output() .map_err(|e| CollectorError::SystemRead { path: format!("SMART data for {}", drive_name), error: e.to_string(), })?; let output_str = String::from_utf8_lossy(&output.stdout); let error_str = String::from_utf8_lossy(&output.stderr); // Debug logging for SMART command results debug!("SMART output for {}: status={}, stdout_len={}, stderr={}", drive_name, output.status, output_str.len(), error_str); if !output.status.success() { debug!("SMART command failed for {}: {}", drive_name, error_str); // Return unknown data rather than failing completely return Ok(SmartData { health: "UNKNOWN".to_string(), temperature_celsius: None, wear_percent: None, }); } let mut health = "UNKNOWN".to_string(); let mut temperature = None; let mut wear_percent = None; for line in output_str.lines() { if line.contains("SMART overall-health") { if line.contains("PASSED") { health = "PASSED".to_string(); } else if line.contains("FAILED") { health = "FAILED".to_string(); } } // Temperature parsing for different drive types if line.contains("Temperature_Celsius") || line.contains("Airflow_Temperature_Cel") { // Traditional SATA drives: attribute table format if let Some(temp_str) = line.split_whitespace().nth(9) { if let Ok(temp) = temp_str.parse::() { temperature = Some(temp); } } } else if line.starts_with("Temperature:") { // NVMe drives: simple "Temperature: 27 Celsius" format let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() >= 2 { if let Ok(temp) = parts[1].parse::() { temperature = Some(temp); } } } // Wear level parsing for SSDs if line.contains("Wear_Leveling_Count") || line.contains("SSD_Life_Left") { if let Some(wear_str) = line.split_whitespace().nth(9) { if let Ok(wear) = wear_str.parse::() { wear_percent = Some(100.0 - wear); // Convert remaining life to wear } } } // NVMe wear parsing: "Percentage Used: 1%" if line.contains("Percentage Used:") { if let Some(percent_part) = line.split("Percentage Used:").nth(1) { if let Some(percent_str) = percent_part.split_whitespace().next() { if let Some(percent_clean) = percent_str.strip_suffix('%') { if let Ok(wear) = percent_clean.parse::() { wear_percent = Some(wear); } } } } } } Ok(SmartData { health, temperature_celsius: temperature, wear_percent, }) } /// Populate drives data into AgentData fn populate_drives_data(&self, physical_drives: &[PhysicalDrive], smart_data: &HashMap, agent_data: &mut AgentData) -> Result<(), CollectorError> { for drive in physical_drives { let smart = smart_data.get(&drive.name); let mut filesystems: Vec = drive.filesystems.iter().map(|fs| { FilesystemData { mount: fs.mount_point.clone(), // This preserves "/" and "/boot" correctly usage_percent: fs.usage_percent, used_gb: fs.used_bytes as f32 / (1024.0 * 1024.0 * 1024.0), total_gb: fs.total_bytes as f32 / (1024.0 * 1024.0 * 1024.0), usage_status: self.calculate_filesystem_usage_status(fs.usage_percent), } }).collect(); // Sort filesystems by mount point for consistent display order filesystems.sort_by(|a, b| a.mount.cmp(&b.mount)); agent_data.system.storage.drives.push(DriveData { name: drive.name.clone(), health: smart.map(|s| s.health.clone()).unwrap_or_else(|| drive.health.clone()), temperature_celsius: smart.and_then(|s| s.temperature_celsius), wear_percent: smart.and_then(|s| s.wear_percent), filesystems, temperature_status: smart.and_then(|s| s.temperature_celsius) .map(|temp| self.calculate_temperature_status(temp)) .unwrap_or(Status::Unknown), health_status: self.calculate_health_status( smart.map(|s| s.health.as_str()).unwrap_or("UNKNOWN") ), }); } Ok(()) } /// Populate pools data into AgentData fn populate_pools_data(&self, mergerfs_pools: &[MergerfsPool], _smart_data: &HashMap, agent_data: &mut AgentData) -> Result<(), CollectorError> { for pool in mergerfs_pools { let pool_data = PoolData { name: pool.name.clone(), mount: pool.mount_point.clone(), pool_type: "mergerfs".to_string(), health: "healthy".to_string(), // TODO: Calculate based on member drives usage_percent: (pool.used_bytes as f32 / pool.total_bytes as f32) * 100.0, used_gb: pool.used_bytes as f32 / (1024.0 * 1024.0 * 1024.0), total_gb: pool.total_bytes as f32 / (1024.0 * 1024.0 * 1024.0), data_drives: pool.data_drives.iter().map(|d| cm_dashboard_shared::PoolDriveData { name: d.name.clone(), temperature_celsius: d.temperature_celsius, health: "unknown".to_string(), wear_percent: None, }).collect(), parity_drives: pool.parity_drives.iter().map(|d| cm_dashboard_shared::PoolDriveData { name: d.name.clone(), temperature_celsius: d.temperature_celsius, health: "unknown".to_string(), wear_percent: None, }).collect(), }; agent_data.system.storage.pools.push(pool_data); } Ok(()) } /// Calculate filesystem usage status fn calculate_filesystem_usage_status(&self, usage_percent: f32) -> Status { // Use standard filesystem warning/critical thresholds if usage_percent >= 95.0 { Status::Critical } else if usage_percent >= 85.0 { Status::Warning } else { Status::Ok } } /// Calculate drive temperature status fn calculate_temperature_status(&self, temperature: f32) -> Status { self.temperature_thresholds.evaluate(temperature) } /// Calculate drive health status fn calculate_health_status(&self, health: &str) -> Status { match health { "PASSED" => Status::Ok, "FAILED" => Status::Critical, _ => Status::Unknown, } } } #[async_trait] impl Collector for DiskCollector { async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> { self.collect_storage_data(agent_data).await } } /// SMART data for a drive #[derive(Debug, Clone)] struct SmartData { health: String, temperature_celsius: Option, wear_percent: Option, }