All checks were successful
Build and Release / build-and-release (push) Successful in 1m11s
- Fix /tmp usage status to use proper thresholds instead of hardcoded Ok status - Fix wear level status to use configurable thresholds instead of hardcoded values - Add dedicated tmp_status field to SystemWidget for proper /tmp status display - Remove host-level hourglass icon during service operations - Implement immediate service status updates after start/stop/restart commands - Remove active users display and collection from NixOS section - Fix immediate host status aggregation transmission to dashboard
597 lines
23 KiB
Rust
597 lines
23 KiB
Rust
use anyhow::Result;
|
|
use async_trait::async_trait;
|
|
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
|
|
|
use crate::config::DiskConfig;
|
|
use std::process::Command;
|
|
use std::time::Instant;
|
|
use tracing::debug;
|
|
|
|
use super::{Collector, CollectorError};
|
|
|
|
/// Information about a storage pool (mount point with underlying drives)
|
|
#[derive(Debug, Clone)]
|
|
struct StoragePool {
|
|
name: String, // e.g., "steampool", "root"
|
|
mount_point: String, // e.g., "/mnt/steampool", "/"
|
|
filesystem: String, // e.g., "mergerfs", "ext4", "zfs", "btrfs"
|
|
storage_type: String, // e.g., "mergerfs", "single", "raid", "zfs"
|
|
size: String, // e.g., "2.5TB"
|
|
used: String, // e.g., "2.1TB"
|
|
available: String, // e.g., "400GB"
|
|
usage_percent: f32, // e.g., 85.0
|
|
underlying_drives: Vec<DriveInfo>, // Individual physical drives
|
|
}
|
|
|
|
/// SMART-derived snapshot of a single physical drive.
///
/// `PartialEq` is derived so two snapshots can be compared directly
/// (for example to detect a change between collections, or in tests).
#[derive(Debug, Clone, PartialEq)]
struct DriveInfo {
    /// Kernel device name without the `/dev/` prefix, e.g. "sda" or "nvme0n1".
    device: String,
    /// Overall SMART verdict: "PASSED", "FAILED" or "UNKNOWN".
    health_status: String,
    /// Drive temperature in °C, when the SMART output exposed one.
    temperature: Option<f32>,
    /// Consumed endurance in percent (SSDs only), when it could be determined.
    wear_level: Option<f32>,
}
|
|
|
|
/// Disk usage collector for monitoring filesystem sizes
|
|
pub struct DiskCollector {
|
|
config: DiskConfig,
|
|
temperature_thresholds: HysteresisThresholds,
|
|
detected_devices: std::collections::HashMap<String, Vec<String>>, // mount_point -> devices
|
|
}
|
|
|
|
impl DiskCollector {
|
|
pub fn new(config: DiskConfig) -> Self {
|
|
// Create hysteresis thresholds for disk temperature from config
|
|
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
|
|
config.temperature_warning_celsius,
|
|
5.0, // 5°C gap for recovery
|
|
config.temperature_critical_celsius,
|
|
5.0, // 5°C gap for recovery
|
|
);
|
|
|
|
// Detect devices for all configured filesystems at startup
|
|
let mut detected_devices = std::collections::HashMap::new();
|
|
for fs_config in &config.filesystems {
|
|
if fs_config.monitor {
|
|
if let Ok(devices) = Self::detect_device_for_mount_point_static(&fs_config.mount_point) {
|
|
detected_devices.insert(fs_config.mount_point.clone(), devices);
|
|
}
|
|
}
|
|
}
|
|
|
|
Self {
|
|
config,
|
|
temperature_thresholds,
|
|
detected_devices,
|
|
}
|
|
}
|
|
|
|
/// Calculate disk temperature status using hysteresis thresholds
|
|
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
|
|
status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
|
|
}
|
|
|
|
|
|
/// Get configured storage pools with individual drive information
|
|
fn get_configured_storage_pools(&self) -> Result<Vec<StoragePool>> {
|
|
let mut storage_pools = Vec::new();
|
|
|
|
for fs_config in &self.config.filesystems {
|
|
if !fs_config.monitor {
|
|
continue;
|
|
}
|
|
|
|
// Get filesystem stats for the mount point
|
|
match self.get_filesystem_info(&fs_config.mount_point) {
|
|
Ok((total_bytes, used_bytes)) => {
|
|
let available_bytes = total_bytes - used_bytes;
|
|
let usage_percent = if total_bytes > 0 {
|
|
(used_bytes as f64 / total_bytes as f64) * 100.0
|
|
} else {
|
|
0.0
|
|
};
|
|
|
|
// Convert bytes to human-readable format
|
|
let size = self.bytes_to_human_readable(total_bytes);
|
|
let used = self.bytes_to_human_readable(used_bytes);
|
|
let available = self.bytes_to_human_readable(available_bytes);
|
|
|
|
// Get individual drive information using pre-detected devices
|
|
let device_names = self.detected_devices.get(&fs_config.mount_point).cloned().unwrap_or_default();
|
|
let underlying_drives = self.get_drive_info_for_devices(&device_names)?;
|
|
|
|
storage_pools.push(StoragePool {
|
|
name: fs_config.name.clone(),
|
|
mount_point: fs_config.mount_point.clone(),
|
|
filesystem: fs_config.fs_type.clone(),
|
|
storage_type: fs_config.storage_type.clone(),
|
|
size,
|
|
used,
|
|
available,
|
|
usage_percent: usage_percent as f32,
|
|
underlying_drives,
|
|
});
|
|
|
|
debug!(
|
|
"Storage pool '{}' ({}) at {} with {} detected drives",
|
|
fs_config.name, fs_config.storage_type, fs_config.mount_point, device_names.len()
|
|
);
|
|
}
|
|
Err(e) => {
|
|
debug!(
|
|
"Failed to get filesystem info for storage pool '{}': {}",
|
|
fs_config.name, e
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(storage_pools)
|
|
}
|
|
|
|
/// Get drive information for a list of device names
|
|
fn get_drive_info_for_devices(&self, device_names: &[String]) -> Result<Vec<DriveInfo>> {
|
|
let mut drives = Vec::new();
|
|
|
|
for device_name in device_names {
|
|
let device_path = format!("/dev/{}", device_name);
|
|
|
|
// Get SMART data for this drive
|
|
let (health_status, temperature, wear_level) = self.get_smart_data(&device_path);
|
|
|
|
drives.push(DriveInfo {
|
|
device: device_name.clone(),
|
|
health_status: health_status.clone(),
|
|
temperature,
|
|
wear_level,
|
|
});
|
|
|
|
debug!(
|
|
"Drive info for {}: health={}, temp={:?}°C, wear={:?}%",
|
|
device_name, health_status, temperature, wear_level
|
|
);
|
|
}
|
|
|
|
Ok(drives)
|
|
}
|
|
|
|
/// Get SMART data for a drive (health, temperature, wear level)
|
|
fn get_smart_data(&self, device_path: &str) -> (String, Option<f32>, Option<f32>) {
|
|
// Try to get SMART data using smartctl
|
|
let output = Command::new("sudo")
|
|
.arg("smartctl")
|
|
.arg("-a")
|
|
.arg(device_path)
|
|
.output();
|
|
|
|
match output {
|
|
Ok(result) if result.status.success() => {
|
|
let stdout = String::from_utf8_lossy(&result.stdout);
|
|
|
|
// Parse health status
|
|
let health = if stdout.contains("PASSED") {
|
|
"PASSED".to_string()
|
|
} else if stdout.contains("FAILED") {
|
|
"FAILED".to_string()
|
|
} else {
|
|
"UNKNOWN".to_string()
|
|
};
|
|
|
|
// Parse temperature (look for various temperature indicators)
|
|
let temperature = self.parse_temperature_from_smart(&stdout);
|
|
|
|
// Parse wear level (for SSDs)
|
|
let wear_level = self.parse_wear_level_from_smart(&stdout);
|
|
|
|
(health, temperature, wear_level)
|
|
}
|
|
_ => {
|
|
debug!("Failed to get SMART data for {}", device_path);
|
|
("UNKNOWN".to_string(), None, None)
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Parse temperature from SMART output
|
|
fn parse_temperature_from_smart(&self, smart_output: &str) -> Option<f32> {
|
|
for line in smart_output.lines() {
|
|
// Look for temperature in various formats
|
|
if line.contains("Temperature_Celsius") || line.contains("Temperature") {
|
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
|
if parts.len() >= 10 {
|
|
if let Ok(temp) = parts[9].parse::<f32>() {
|
|
return Some(temp);
|
|
}
|
|
}
|
|
}
|
|
// NVMe drives might show temperature differently
|
|
if line.contains("temperature:") {
|
|
if let Some(temp_part) = line.split("temperature:").nth(1) {
|
|
if let Some(temp_str) = temp_part.split_whitespace().next() {
|
|
if let Ok(temp) = temp_str.parse::<f32>() {
|
|
return Some(temp);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Parse wear level from SMART output (SSD wear leveling)
|
|
/// Supports both NVMe and SATA SSD wear indicators
|
|
fn parse_wear_level_from_smart(&self, smart_output: &str) -> Option<f32> {
|
|
for line in smart_output.lines() {
|
|
let line = line.trim();
|
|
|
|
// NVMe drives - direct percentage used
|
|
if line.contains("Percentage Used:") {
|
|
if let Some(wear_part) = line.split("Percentage Used:").nth(1) {
|
|
if let Some(wear_str) = wear_part.split('%').next() {
|
|
if let Ok(wear) = wear_str.trim().parse::<f32>() {
|
|
return Some(wear);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// SATA SSD attributes - parse SMART table format
|
|
// Format: ID ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
|
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
|
if parts.len() >= 10 {
|
|
// SSD Life Left / Percent Lifetime Remaining (higher = less wear)
|
|
if line.contains("SSD_Life_Left") || line.contains("Percent_Lifetime_Remain") {
|
|
if let Ok(remaining) = parts[3].parse::<f32>() { // VALUE column
|
|
return Some(100.0 - remaining); // Convert remaining to used
|
|
}
|
|
}
|
|
|
|
// Media Wearout Indicator (lower = more wear, normalize to 0-100)
|
|
if line.contains("Media_Wearout_Indicator") {
|
|
if let Ok(remaining) = parts[3].parse::<f32>() { // VALUE column
|
|
return Some(100.0 - remaining); // Convert remaining to used
|
|
}
|
|
}
|
|
|
|
// Wear Leveling Count (higher = less wear, but varies by manufacturer)
|
|
if line.contains("Wear_Leveling_Count") {
|
|
if let Ok(wear_count) = parts[3].parse::<f32>() { // VALUE column
|
|
// Most SSDs: 100 = new, decreases with wear
|
|
if wear_count <= 100.0 {
|
|
return Some(100.0 - wear_count);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Total LBAs Written - calculate against typical endurance if available
|
|
// This is more complex and manufacturer-specific, so we skip for now
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Convert bytes to human-readable format
|
|
fn bytes_to_human_readable(&self, bytes: u64) -> String {
|
|
const UNITS: &[&str] = &["B", "K", "M", "G", "T"];
|
|
let mut size = bytes as f64;
|
|
let mut unit_index = 0;
|
|
|
|
while size >= 1024.0 && unit_index < UNITS.len() - 1 {
|
|
size /= 1024.0;
|
|
unit_index += 1;
|
|
}
|
|
|
|
if unit_index == 0 {
|
|
format!("{:.0}{}", size, UNITS[unit_index])
|
|
} else {
|
|
format!("{:.1}{}", size, UNITS[unit_index])
|
|
}
|
|
}
|
|
|
|
/// Detect device backing a mount point using lsblk (static version for startup)
|
|
fn detect_device_for_mount_point_static(mount_point: &str) -> Result<Vec<String>> {
|
|
let output = Command::new("lsblk")
|
|
.args(&["-n", "-o", "NAME,MOUNTPOINT"])
|
|
.output()?;
|
|
|
|
if !output.status.success() {
|
|
return Ok(Vec::new());
|
|
}
|
|
|
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
|
|
|
for line in output_str.lines() {
|
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
|
if parts.len() >= 2 && parts[1] == mount_point {
|
|
// Remove tree symbols and extract device name (e.g., "├─nvme0n1p2" -> "nvme0n1p2")
|
|
let device_name = parts[0]
|
|
.trim_start_matches('├')
|
|
.trim_start_matches('└')
|
|
.trim_start_matches('─')
|
|
.trim();
|
|
|
|
// Extract base device name (e.g., "nvme0n1p2" -> "nvme0n1")
|
|
if let Some(base_device) = Self::extract_base_device(device_name) {
|
|
return Ok(vec![base_device]);
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(Vec::new())
|
|
}
|
|
|
|
/// Reduces a partition name to the name of the disk it lives on
/// (e.g. "nvme0n1p2" -> "nvme0n1", "sda1" -> "sda").
///
/// Names that carry no partition suffix are returned unchanged, including
/// whole NVMe/MMC disks such as "nvme0n1" whose own name ends in a digit.
/// The function always returns `Some`.
fn extract_base_device(device_name: &str) -> Option<String> {
    let is_digit = |c: char| c.is_ascii_digit();

    let without_trailing_digits = device_name.trim_end_matches(is_digit);
    let has_trailing_digits = without_trailing_digits.len() < device_name.len();

    // Disks whose own name ends in a digit mark partitions as "<disk>p<N>".
    // Their trailing digits must not be stripped on their own, otherwise the
    // whole disk "nvme0n1" would be mangled into "nvme0n".
    let uses_p_separator =
        device_name.starts_with("nvme") || device_name.starts_with("mmcblk");
    if uses_p_separator {
        if has_trailing_digits {
            if let Some(disk) = without_trailing_digits.strip_suffix('p') {
                if disk.ends_with(is_digit) {
                    return Some(disk.to_string());
                }
            }
        }
        return Some(device_name.to_string());
    }

    // Traditional naming (sda1 -> sda): drop the trailing partition number,
    // unless the whole name consists of digits.
    if has_trailing_digits && !without_trailing_digits.is_empty() {
        return Some(without_trailing_digits.to_string());
    }

    // No partition suffix detected: return the name as-is.
    Some(device_name.to_string())
}
|
|
|
|
|
|
/// Get filesystem info using df command
|
|
fn get_filesystem_info(&self, path: &str) -> Result<(u64, u64)> {
|
|
let output = Command::new("df")
|
|
.arg("--block-size=1")
|
|
.arg(path)
|
|
.output()?;
|
|
|
|
if !output.status.success() {
|
|
return Err(anyhow::anyhow!("df command failed for {}", path));
|
|
}
|
|
|
|
let output_str = String::from_utf8(output.stdout)?;
|
|
let lines: Vec<&str> = output_str.lines().collect();
|
|
|
|
if lines.len() < 2 {
|
|
return Err(anyhow::anyhow!("Unexpected df output format"));
|
|
}
|
|
|
|
let fields: Vec<&str> = lines[1].split_whitespace().collect();
|
|
if fields.len() < 4 {
|
|
return Err(anyhow::anyhow!("Unexpected df fields count"));
|
|
}
|
|
|
|
let total_bytes = fields[1].parse::<u64>()?;
|
|
let used_bytes = fields[2].parse::<u64>()?;
|
|
|
|
Ok((total_bytes, used_bytes))
|
|
}
|
|
|
|
|
|
/// Parse size string (e.g., "120G", "45M") to GB value
|
|
fn parse_size_to_gb(&self, size_str: &str) -> f32 {
|
|
let size_str = size_str.trim();
|
|
if size_str.is_empty() || size_str == "-" {
|
|
return 0.0;
|
|
}
|
|
|
|
// Extract numeric part and unit
|
|
let (num_str, unit) = if let Some(last_char) = size_str.chars().last() {
|
|
if last_char.is_alphabetic() {
|
|
let num_part = &size_str[..size_str.len() - 1];
|
|
let unit_part = &size_str[size_str.len() - 1..];
|
|
(num_part, unit_part)
|
|
} else {
|
|
(size_str, "")
|
|
}
|
|
} else {
|
|
(size_str, "")
|
|
};
|
|
|
|
let number: f32 = num_str.parse().unwrap_or(0.0);
|
|
|
|
match unit.to_uppercase().as_str() {
|
|
"T" | "TB" => number * 1024.0,
|
|
"G" | "GB" => number,
|
|
"M" | "MB" => number / 1024.0,
|
|
"K" | "KB" => number / (1024.0 * 1024.0),
|
|
"B" | "" => number / (1024.0 * 1024.0 * 1024.0),
|
|
_ => number, // Assume GB if unknown unit
|
|
}
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Collector for DiskCollector {
|
|
|
|
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
|
let start_time = Instant::now();
|
|
debug!("Collecting storage pool and individual drive metrics");
|
|
|
|
let mut metrics = Vec::new();
|
|
|
|
// Get configured storage pools with individual drive data
|
|
let storage_pools = match self.get_configured_storage_pools() {
|
|
Ok(pools) => {
|
|
debug!("Found {} storage pools", pools.len());
|
|
pools
|
|
}
|
|
Err(e) => {
|
|
debug!("Failed to get storage pools: {}", e);
|
|
Vec::new()
|
|
}
|
|
};
|
|
|
|
// Generate metrics for each storage pool and its underlying drives
|
|
for storage_pool in &storage_pools {
|
|
let timestamp = chrono::Utc::now().timestamp() as u64;
|
|
|
|
// Storage pool overall metrics
|
|
let pool_name = &storage_pool.name;
|
|
|
|
// Parse size strings to get actual values for calculations
|
|
let size_gb = self.parse_size_to_gb(&storage_pool.size);
|
|
let used_gb = self.parse_size_to_gb(&storage_pool.used);
|
|
let avail_gb = self.parse_size_to_gb(&storage_pool.available);
|
|
|
|
// Calculate status based on configured thresholds
|
|
let pool_status = if storage_pool.usage_percent >= self.config.usage_critical_percent {
|
|
Status::Critical
|
|
} else if storage_pool.usage_percent >= self.config.usage_warning_percent {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
};
|
|
|
|
// Storage pool info metrics
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_mount_point", pool_name),
|
|
value: MetricValue::String(storage_pool.mount_point.clone()),
|
|
unit: None,
|
|
description: Some(format!("Mount: {}", storage_pool.mount_point)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_filesystem", pool_name),
|
|
value: MetricValue::String(storage_pool.filesystem.clone()),
|
|
unit: None,
|
|
description: Some(format!("FS: {}", storage_pool.filesystem)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_storage_type", pool_name),
|
|
value: MetricValue::String(storage_pool.storage_type.clone()),
|
|
unit: None,
|
|
description: Some(format!("Type: {}", storage_pool.storage_type)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
|
|
// Storage pool size metrics
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_total_gb", pool_name),
|
|
value: MetricValue::Float(size_gb),
|
|
unit: Some("GB".to_string()),
|
|
description: Some(format!("Total: {}", storage_pool.size)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_used_gb", pool_name),
|
|
value: MetricValue::Float(used_gb),
|
|
unit: Some("GB".to_string()),
|
|
description: Some(format!("Used: {}", storage_pool.used)),
|
|
status: pool_status,
|
|
timestamp,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_available_gb", pool_name),
|
|
value: MetricValue::Float(avail_gb),
|
|
unit: Some("GB".to_string()),
|
|
description: Some(format!("Available: {}", storage_pool.available)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_usage_percent", pool_name),
|
|
value: MetricValue::Float(storage_pool.usage_percent),
|
|
unit: Some("%".to_string()),
|
|
description: Some(format!("Usage: {:.1}%", storage_pool.usage_percent)),
|
|
status: pool_status,
|
|
timestamp,
|
|
});
|
|
|
|
// Individual drive metrics for this storage pool
|
|
for drive in &storage_pool.underlying_drives {
|
|
// Drive health status
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_{}_health", pool_name, drive.device),
|
|
value: MetricValue::String(drive.health_status.clone()),
|
|
unit: None,
|
|
description: Some(format!("{}: {}", drive.device, drive.health_status)),
|
|
status: if drive.health_status == "PASSED" { Status::Ok }
|
|
else if drive.health_status == "FAILED" { Status::Critical }
|
|
else { Status::Unknown },
|
|
timestamp,
|
|
});
|
|
|
|
// Drive temperature
|
|
if let Some(temp) = drive.temperature {
|
|
let temp_status = self.calculate_temperature_status(
|
|
&format!("disk_{}_{}_temperature", pool_name, drive.device),
|
|
temp,
|
|
status_tracker
|
|
);
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_{}_temperature", pool_name, drive.device),
|
|
value: MetricValue::Float(temp),
|
|
unit: Some("°C".to_string()),
|
|
description: Some(format!("{}: {:.0}°C", drive.device, temp)),
|
|
status: temp_status,
|
|
timestamp,
|
|
});
|
|
}
|
|
|
|
// Drive wear level (for SSDs)
|
|
if let Some(wear) = drive.wear_level {
|
|
let wear_status = if wear >= self.config.wear_critical_percent { Status::Critical }
|
|
else if wear >= self.config.wear_warning_percent { Status::Warning }
|
|
else { Status::Ok };
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_{}_wear_percent", pool_name, drive.device),
|
|
value: MetricValue::Float(wear),
|
|
unit: Some("%".to_string()),
|
|
description: Some(format!("{}: {:.0}% wear", drive.device, wear)),
|
|
status: wear_status,
|
|
timestamp,
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// Add storage pool count metric
|
|
metrics.push(Metric {
|
|
name: "disk_count".to_string(),
|
|
value: MetricValue::Integer(storage_pools.len() as i64),
|
|
unit: None,
|
|
description: Some(format!("Total storage pools: {}", storage_pools.len())),
|
|
status: Status::Ok,
|
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
});
|
|
|
|
|
|
let collection_time = start_time.elapsed();
|
|
debug!(
|
|
"Multi-disk collection completed in {:?} with {} metrics",
|
|
collection_time,
|
|
metrics.len()
|
|
);
|
|
|
|
Ok(metrics)
|
|
}
|
|
|
|
}
|