Implement UUID-based disk detection for CMTEC infrastructure

Replace df-based auto-discovery with UUID-based detection using NixOS
hardware configuration data. Each host now has predefined filesystem
configurations with predictable metric names.

- Add FilesystemConfig struct with UUID, mount point, and filesystem type
- Remove auto_discover and devices fields from DiskConfig
- Add host-specific UUID defaults for cmbox, srv01, srv02, simonbox, steambox
- Remove legacy get_mounted_disks() df-based detection method
- Update DiskCollector to use UUID resolution via /dev/disk/by-uuid/
- Generate predictable metric names: disk_root_*, disk_boot_*, etc.
- Maintain fallback for labbox/wslbox (no UUIDs configured yet)

Provides consistent metric names across reboots and reliable detection
aligned with NixOS deployments without dependency on mount order.
This commit is contained in:
2025-10-20 09:50:10 +02:00
parent f67779be9d
commit e7200fb1b0
3 changed files with 519 additions and 284 deletions

View File

@@ -2,6 +2,9 @@ use anyhow::Result;
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricValue, Status};
use crate::config::DiskConfig;
use std::fs;
use std::path::Path;
use std::process::Command;
use std::time::Instant;
use tracing::debug;
@@ -11,24 +14,136 @@ use super::{Collector, CollectorError, PerformanceMetrics};
/// Information about a mounted disk
#[derive(Debug, Clone)]
struct MountedDisk {
device: String, // e.g., "/dev/nvme0n1p1"
physical_device: String, // e.g., "/dev/nvme0n1"
mount_point: String, // e.g., "/"
filesystem: String, // e.g., "ext4"
device: String, // e.g., "/dev/nvme0n1p1"
physical_device: String, // e.g., "/dev/nvme0n1"
mount_point: String, // e.g., "/"
filesystem: String, // e.g., "ext4"
size: String, // e.g., "120G"
used: String, // e.g., "45G"
available: String, // e.g., "75G"
usage_percent: f32, // e.g., 38.5
config_name: Option<String>, // Name from config if UUID-based
}
/// Disk usage collector for monitoring filesystem sizes
pub struct DiskCollector {
// Immutable collector for caching compatibility
config: DiskConfig,
}
impl DiskCollector {
pub fn new() -> Self {
Self {}
pub fn new(config: DiskConfig) -> Self {
Self { config }
}
/// Resolve a filesystem UUID to the path of its block device.
///
/// Reads the symlink `/dev/disk/by-uuid/<uuid>`. A relative link target is
/// resolved against the directory containing the symlink and canonicalized;
/// an absolute link target is returned as-is.
///
/// # Errors
///
/// Returns an error when the UUID entry does not exist, when it cannot be
/// read as a symlink, or when a relative target cannot be canonicalized.
/// The last case used to fall back to the raw relative target (for example
/// `../../nvme0n1p1`), which is not a usable device path for callers.
fn resolve_uuid_to_device(&self, uuid: &str) -> Result<String> {
    let uuid_path = format!("/dev/disk/by-uuid/{}", uuid);
    let link = Path::new(&uuid_path);

    if !link.exists() {
        return Err(anyhow::anyhow!("UUID {} not found in /dev/disk/by-uuid/", uuid));
    }

    let target = fs::read_link(link)
        .map_err(|e| anyhow::anyhow!("Failed to resolve UUID {}: {}", uuid, e))?;

    if !target.is_relative() {
        return Ok(target.to_string_lossy().to_string());
    }

    // Relative symlink targets are relative to the directory holding the link.
    let base_dir = link.parent().unwrap_or_else(|| Path::new("/"));
    let canonical = base_dir.join(&target).canonicalize().map_err(|e| {
        anyhow::anyhow!(
            "Failed to canonicalize device path for UUID {}: {}",
            uuid,
            e
        )
    })?;

    Ok(canonical.to_string_lossy().to_string())
}
/// Build the list of monitored filesystems from the UUID-based configuration.
///
/// For every entry in `self.config.filesystems` whose `monitor` flag is set:
/// the UUID is resolved to a device path, total/used bytes are read for the
/// configured mount point, and the physical device is looked up. An entry for
/// which any of these steps fails is logged at debug level and skipped, so a
/// single failing entry does not discard the entries that did resolve.
///
/// # Errors
///
/// No code path in this function returns `Err`; the `Result` return type is
/// kept so existing callers continue to work unchanged.
fn get_configured_filesystems(&self) -> Result<Vec<MountedDisk>> {
    let mut configured_disks = Vec::new();

    for fs_config in &self.config.filesystems {
        if !fs_config.monitor {
            continue;
        }

        // Resolve UUID to device
        let device_path = match self.resolve_uuid_to_device(&fs_config.uuid) {
            Ok(path) => path,
            Err(e) => {
                debug!(
                    "Failed to resolve UUID for configured filesystem '{}': {}",
                    fs_config.name, e
                );
                continue;
            }
        };

        // Get filesystem stats for the mount point
        let (total_bytes, used_bytes) = match self.get_filesystem_info(&fs_config.mount_point) {
            Ok(info) => info,
            Err(e) => {
                debug!(
                    "Failed to get filesystem info for configured filesystem '{}': {}",
                    fs_config.name, e
                );
                continue;
            }
        };

        // Get physical device for SMART monitoring; skip only this entry on failure
        let physical_device = match self.get_physical_device(&device_path) {
            Ok(physical) => physical,
            Err(e) => {
                debug!(
                    "Failed to get physical device for configured filesystem '{}': {}",
                    fs_config.name, e
                );
                continue;
            }
        };

        // Saturate instead of underflowing if the reported used size exceeds the total
        let available_bytes = total_bytes.saturating_sub(used_bytes);
        let usage_percent = if total_bytes > 0 {
            (used_bytes as f64 / total_bytes as f64) * 100.0
        } else {
            0.0
        };

        debug!(
            "Configured filesystem '{}' (UUID: {}) mounted at {} using {}",
            fs_config.name, fs_config.uuid, fs_config.mount_point, device_path
        );

        configured_disks.push(MountedDisk {
            device: device_path,
            physical_device,
            mount_point: fs_config.mount_point.clone(),
            filesystem: fs_config.fs_type.clone(),
            // Sizes are stored in human-readable form
            size: self.bytes_to_human_readable(total_bytes),
            used: self.bytes_to_human_readable(used_bytes),
            available: self.bytes_to_human_readable(available_bytes),
            usage_percent: usage_percent as f32,
            config_name: Some(fs_config.name.clone()),
        });
    }

    Ok(configured_disks)
}
/// Format a byte count as a short string with a single-letter binary unit.
///
/// The value is divided by 1024 until it drops below 1024 or the largest
/// unit (`T`) is reached. Plain bytes are printed without decimals; every
/// larger unit is printed with one decimal place.
fn bytes_to_human_readable(&self, bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "K", "M", "G", "T"];

    let mut scaled = bytes as f64;
    let mut suffix = UNITS[0];

    // Step up through the remaining units while the value is still too large.
    for &next_unit in &UNITS[1..] {
        if scaled < 1024.0 {
            break;
        }
        scaled /= 1024.0;
        suffix = next_unit;
    }

    if suffix == UNITS[0] {
        format!("{:.0}{}", scaled, suffix)
    } else {
        format!("{:.1}{}", scaled, suffix)
    }
}
/// Get directory size using du command (efficient for single directory)
@@ -42,9 +157,12 @@ impl DiskCollector {
// du returns success even with permission denied warnings in stderr
// We only care if the command completely failed or produced no stdout
let output_str = String::from_utf8(output.stdout)?;
if output_str.trim().is_empty() {
return Err(anyhow::anyhow!("du command produced no output for {}", path));
return Err(anyhow::anyhow!(
"du command produced no output for {}",
path
));
}
let size_str = output_str
@@ -69,7 +187,7 @@ impl DiskCollector {
let output_str = String::from_utf8(output.stdout)?;
let lines: Vec<&str> = output_str.lines().collect();
if lines.len() < 2 {
return Err(anyhow::anyhow!("Unexpected df output format"));
}
@@ -92,64 +210,6 @@ impl DiskCollector {
Ok((total_bytes, used_bytes, usage_percent as f32))
}
/// Get all mounted disks with their mount points and underlying devices
///
/// Runs `df -h --output=source,target,fstype,size,used,avail,pcent` and turns
/// each accepted output row into a `MountedDisk`.
///
/// # Errors
///
/// Returns an error if `df` cannot be run, exits unsuccessfully, prints
/// non-UTF-8 output, or if `get_physical_device` fails for any accepted row
/// (the `?` below aborts the whole scan in that case).
fn get_mounted_disks(&self) -> Result<Vec<MountedDisk>> {
let output = Command::new("df")
.arg("-h")
.arg("--output=source,target,fstype,size,used,avail,pcent")
.output()?;
if !output.status.success() {
return Err(anyhow::anyhow!("df command failed"));
}
let output_str = String::from_utf8(output.stdout)?;
let mut mounted_disks = Vec::new();
for line in output_str.lines().skip(1) { // Skip header
// Columns are split on whitespace; rows with fewer than 7 fields are ignored.
// NOTE(review): a mount point containing whitespace would shift the fields — confirm this cannot occur.
let fields: Vec<&str> = line.split_whitespace().collect();
if fields.len() >= 7 {
let source = fields[0];
let target = fields[1];
let fstype = fields[2];
let size = fields[3];
let used = fields[4];
let avail = fields[5];
let pcent_str = fields[6];
// Skip special filesystems
// Only `/dev/`-backed sources are kept; the `devtmpfs` test is already
// implied by the `tmpfs` substring test.
if source.starts_with("/dev/") &&
!fstype.contains("tmpfs") &&
!fstype.contains("devtmpfs") &&
!target.starts_with("/proc") &&
!target.starts_with("/sys") &&
!target.starts_with("/dev") {
// Extract percentage
// An unparsable percentage is reported as 0.0 rather than failing the row.
let usage_percent = pcent_str
.trim_end_matches('%')
.parse::<f32>()
.unwrap_or(0.0);
// Get underlying physical device
let physical_device = self.get_physical_device(source)?;
// size/used/available keep the human-readable strings printed by `df -h`.
mounted_disks.push(MountedDisk {
device: source.to_string(),
physical_device,
mount_point: target.to_string(),
filesystem: fstype.to_string(),
size: size.to_string(),
used: used.to_string(),
available: avail.to_string(),
usage_percent,
});
}
}
}
Ok(mounted_disks)
}
/// Get the physical device for a given device (resolves symlinks, gets parent device)
fn get_physical_device(&self, device: &str) -> Result<String> {
@@ -180,14 +240,14 @@ impl DiskCollector {
.arg("smartctl")
.arg("-H")
.arg(device)
.output()
.output()
{
if output.status.success() {
let output_str = String::from_utf8_lossy(&output.stdout);
let health_status = if output_str.contains("PASSED") {
"PASSED"
} else if output_str.contains("FAILED") {
"FAILED"
"FAILED"
} else {
"UNKNOWN"
};
@@ -197,7 +257,7 @@ impl DiskCollector {
.arg("smartctl")
.arg("-A")
.arg(device)
.output()
.output()
{
let temp_str = String::from_utf8_lossy(&temp_output.stdout);
// Look for temperature in SMART attributes
@@ -222,7 +282,6 @@ impl DiskCollector {
("UNKNOWN".to_string(), 0.0)
}
/// Calculate status based on usage percentage
fn calculate_usage_status(&self, used_bytes: u64, total_bytes: u64) -> Status {
if total_bytes == 0 {
@@ -251,8 +310,8 @@ impl DiskCollector {
// Extract numeric part and unit
let (num_str, unit) = if let Some(last_char) = size_str.chars().last() {
if last_char.is_alphabetic() {
let num_part = &size_str[..size_str.len()-1];
let unit_part = &size_str[size_str.len()-1..];
let num_part = &size_str[..size_str.len() - 1];
let unit_part = &size_str[size_str.len() - 1..];
(num_part, unit_part)
} else {
(size_str, "")
@@ -286,121 +345,142 @@ impl Collector for DiskCollector {
let mut metrics = Vec::new();
// Collect all mounted disks
match self.get_mounted_disks() {
Ok(mounted_disks) => {
debug!("Found {} mounted disks", mounted_disks.len());
// Use UUID-based configured filesystems
let mounted_disks = match self.get_configured_filesystems() {
Ok(configured) => {
debug!("Using UUID-based filesystems: {} found", configured.len());
configured
}
Err(e) => {
debug!("Failed to get configured filesystems: {}", e);
Vec::new()
}
};
// Group disks by physical device to avoid duplicate SMART checks
let mut physical_devices: std::collections::HashMap<String, Vec<&MountedDisk>> = std::collections::HashMap::new();
for disk in &mounted_disks {
physical_devices.entry(disk.physical_device.clone())
.or_insert_with(Vec::new)
.push(disk);
// Process discovered/configured disks
if !mounted_disks.is_empty() {
debug!("Found {} mounted disks", mounted_disks.len());
// Group disks by physical device to avoid duplicate SMART checks
let mut physical_devices: std::collections::HashMap<String, Vec<&MountedDisk>> =
std::collections::HashMap::new();
for disk in &mounted_disks {
physical_devices
.entry(disk.physical_device.clone())
.or_insert_with(Vec::new)
.push(disk);
}
// Generate metrics for each mounted disk
for (disk_index, disk) in mounted_disks.iter().enumerate() {
let timestamp = chrono::Utc::now().timestamp() as u64;
// Use config name if available, otherwise use index
let disk_name = disk.config_name.as_ref()
.map(|name| name.clone())
.unwrap_or_else(|| disk_index.to_string());
// Parse size strings to get actual values for calculations
let size_gb = self.parse_size_to_gb(&disk.size);
let used_gb = self.parse_size_to_gb(&disk.used);
let avail_gb = self.parse_size_to_gb(&disk.available);
// Calculate status based on configured thresholds
let status = if disk.usage_percent >= self.config.usage_critical_percent {
Status::Critical
} else if disk.usage_percent >= self.config.usage_warning_percent {
Status::Warning
} else {
Status::Ok
};
// Device and mount point info
metrics.push(Metric {
name: format!("disk_{}_device", disk_name),
value: MetricValue::String(disk.device.clone()),
unit: None,
description: Some(format!("Device: {}", disk.device)),
status: Status::Ok,
timestamp,
});
metrics.push(Metric {
name: format!("disk_{}_mount_point", disk_name),
value: MetricValue::String(disk.mount_point.clone()),
unit: None,
description: Some(format!("Mount: {}", disk.mount_point)),
status: Status::Ok,
timestamp,
});
metrics.push(Metric {
name: format!("disk_{}_filesystem", disk_name),
value: MetricValue::String(disk.filesystem.clone()),
unit: None,
description: Some(format!("FS: {}", disk.filesystem)),
status: Status::Ok,
timestamp,
});
// Size metrics
metrics.push(Metric {
name: format!("disk_{}_total_gb", disk_name),
value: MetricValue::Float(size_gb),
unit: Some("GB".to_string()),
description: Some(format!("Total: {}", disk.size)),
status: Status::Ok,
timestamp,
});
metrics.push(Metric {
name: format!("disk_{}_used_gb", disk_name),
value: MetricValue::Float(used_gb),
unit: Some("GB".to_string()),
description: Some(format!("Used: {}", disk.used)),
status,
timestamp,
});
metrics.push(Metric {
name: format!("disk_{}_available_gb", disk_name),
value: MetricValue::Float(avail_gb),
unit: Some("GB".to_string()),
description: Some(format!("Available: {}", disk.available)),
status: Status::Ok,
timestamp,
});
metrics.push(Metric {
name: format!("disk_{}_usage_percent", disk_name),
value: MetricValue::Float(disk.usage_percent),
unit: Some("%".to_string()),
description: Some(format!("Usage: {:.1}%", disk.usage_percent)),
status,
timestamp,
});
// Physical device name (for SMART health grouping)
let physical_device_name = disk
.physical_device
.strip_prefix("/dev/")
.unwrap_or(&disk.physical_device);
metrics.push(Metric {
name: format!("disk_{}_physical_device", disk_name),
value: MetricValue::String(physical_device_name.to_string()),
unit: None,
description: Some(format!("Physical: {}", physical_device_name)),
status: Status::Ok,
timestamp,
});
}
// Generate metrics for each mounted disk
for (disk_index, disk) in mounted_disks.iter().enumerate() {
let timestamp = chrono::Utc::now().timestamp() as u64;
// Parse size strings to get actual values for calculations
let size_gb = self.parse_size_to_gb(&disk.size);
let used_gb = self.parse_size_to_gb(&disk.used);
let avail_gb = self.parse_size_to_gb(&disk.available);
// Calculate status based on usage percentage
let status = if disk.usage_percent >= 95.0 {
Status::Critical
} else if disk.usage_percent >= 85.0 {
Status::Warning
} else {
Status::Ok
};
// Device and mount point info
metrics.push(Metric {
name: format!("disk_{}_device", disk_index),
value: MetricValue::String(disk.device.clone()),
unit: None,
description: Some(format!("Device: {}", disk.device)),
status: Status::Ok,
timestamp,
});
metrics.push(Metric {
name: format!("disk_{}_mount_point", disk_index),
value: MetricValue::String(disk.mount_point.clone()),
unit: None,
description: Some(format!("Mount: {}", disk.mount_point)),
status: Status::Ok,
timestamp,
});
metrics.push(Metric {
name: format!("disk_{}_filesystem", disk_index),
value: MetricValue::String(disk.filesystem.clone()),
unit: None,
description: Some(format!("FS: {}", disk.filesystem)),
status: Status::Ok,
timestamp,
});
// Size metrics
metrics.push(Metric {
name: format!("disk_{}_total_gb", disk_index),
value: MetricValue::Float(size_gb),
unit: Some("GB".to_string()),
description: Some(format!("Total: {}", disk.size)),
status: Status::Ok,
timestamp,
});
metrics.push(Metric {
name: format!("disk_{}_used_gb", disk_index),
value: MetricValue::Float(used_gb),
unit: Some("GB".to_string()),
description: Some(format!("Used: {}", disk.used)),
status,
timestamp,
});
metrics.push(Metric {
name: format!("disk_{}_available_gb", disk_index),
value: MetricValue::Float(avail_gb),
unit: Some("GB".to_string()),
description: Some(format!("Available: {}", disk.available)),
status: Status::Ok,
timestamp,
});
metrics.push(Metric {
name: format!("disk_{}_usage_percent", disk_index),
value: MetricValue::Float(disk.usage_percent),
unit: Some("%".to_string()),
description: Some(format!("Usage: {:.1}%", disk.usage_percent)),
status,
timestamp,
});
// Physical device name (for SMART health grouping)
let physical_device_name = disk.physical_device
.strip_prefix("/dev/")
.unwrap_or(&disk.physical_device);
metrics.push(Metric {
name: format!("disk_{}_physical_device", disk_index),
value: MetricValue::String(physical_device_name.to_string()),
unit: None,
description: Some(format!("Physical: {}", physical_device_name)),
status: Status::Ok,
timestamp,
});
}
// Add SMART health metrics for each unique physical device
for (physical_device, _disks) in physical_devices {
// Add SMART health metrics for each unique physical device
for (physical_device, _disks) in physical_devices {
let (health_status, temperature) = self.get_smart_health(&physical_device);
let device_name = physical_device.strip_prefix("/dev/").unwrap_or(&physical_device);
let device_name = physical_device
.strip_prefix("/dev/")
.unwrap_or(&physical_device);
let timestamp = chrono::Utc::now().timestamp() as u64;
let health_status_enum = match health_status.as_str() {
@@ -438,34 +518,32 @@ impl Collector for DiskCollector {
}
}
// Add disk count metric
metrics.push(Metric {
name: "disk_count".to_string(),
value: MetricValue::Integer(mounted_disks.len() as i64),
unit: None,
description: Some(format!("Total mounted disks: {}", mounted_disks.len())),
status: Status::Ok,
timestamp: chrono::Utc::now().timestamp() as u64,
});
}
Err(e) => {
debug!("Failed to get mounted disks: {}", e);
metrics.push(Metric {
name: "disk_count".to_string(),
value: MetricValue::Integer(0),
unit: None,
description: Some(format!("Error: {}", e)),
status: Status::Unknown,
timestamp: chrono::Utc::now().timestamp() as u64,
});
}
// Add disk count metric
metrics.push(Metric {
name: "disk_count".to_string(),
value: MetricValue::Integer(mounted_disks.len() as i64),
unit: None,
description: Some(format!("Total mounted disks: {}", mounted_disks.len())),
status: Status::Ok,
timestamp: chrono::Utc::now().timestamp() as u64,
});
} else {
// No disks configured - add zero count metric
metrics.push(Metric {
name: "disk_count".to_string(),
value: MetricValue::Integer(0),
unit: None,
description: Some("No disks configured for monitoring".to_string()),
status: Status::Warning,
timestamp: chrono::Utc::now().timestamp() as u64,
});
}
// Monitor /tmp directory size (keep existing functionality)
match self.get_directory_size("/tmp") {
Ok(tmp_size_bytes) => {
let tmp_size_mb = tmp_size_bytes as f64 / (1024.0 * 1024.0);
// Get /tmp filesystem info (usually tmpfs with 2GB limit)
let (total_bytes, _) = match self.get_filesystem_info("/tmp") {
Ok((total, used)) => (total, used),
@@ -520,8 +598,11 @@ impl Collector for DiskCollector {
}
let collection_time = start_time.elapsed();
debug!("Multi-disk collection completed in {:?} with {} metrics",
collection_time, metrics.len());
debug!(
"Multi-disk collection completed in {:?} with {} metrics",
collection_time,
metrics.len()
);
Ok(metrics)
}
@@ -529,4 +610,4 @@ impl Collector for DiskCollector {
fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
None // Performance tracking handled by cache system
}
}
}