Enhance disk collector with individual drive health monitoring
- Add StoragePool and DriveInfo structures for grouping drives by mount point - Implement SMART data collection for individual drives (health, temperature, wear) - Support for ext4, zfs, xfs, mergerfs, btrfs filesystem types - Generate individual drive metrics: disk_[pool]_[drive]_health/temperature/wear - Add storage_type and underlying_devices to filesystem configuration - Move hardcoded service directory mappings to NixOS configuration - Move hardcoded host-to-user mapping to NixOS configuration - Remove all unused code and fix compilation warnings - Clean implementation with zero warnings and no dead code Individual drives now show health status per storage pool: Storage root (ext4): nvme0n1 PASSED 42°C 5% wear Storage steampool (mergerfs): sda/sdb/sdc with individual health data
This commit is contained in:
parent
a6c2983f65
commit
08d3454683
@ -3,25 +3,33 @@ use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
|
||||
use crate::config::DiskConfig;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
use std::time::Instant;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
|
||||
/// Information about a mounted disk
|
||||
/// Information about a storage pool (mount point with underlying drives)
|
||||
#[derive(Debug, Clone)]
|
||||
struct MountedDisk {
|
||||
device: String, // e.g., "/dev/nvme0n1p1"
|
||||
physical_device: String, // e.g., "/dev/nvme0n1"
|
||||
mount_point: String, // e.g., "/"
|
||||
filesystem: String, // e.g., "ext4"
|
||||
size: String, // e.g., "120G"
|
||||
used: String, // e.g., "45G"
|
||||
available: String, // e.g., "75G"
|
||||
usage_percent: f32, // e.g., 38.5
|
||||
struct StoragePool {
|
||||
name: String, // e.g., "steampool", "root"
|
||||
mount_point: String, // e.g., "/mnt/steampool", "/"
|
||||
filesystem: String, // e.g., "mergerfs", "ext4", "zfs", "btrfs"
|
||||
storage_type: String, // e.g., "mergerfs", "single", "raid", "zfs"
|
||||
size: String, // e.g., "2.5TB"
|
||||
used: String, // e.g., "2.1TB"
|
||||
available: String, // e.g., "400GB"
|
||||
usage_percent: f32, // e.g., 85.0
|
||||
underlying_drives: Vec<DriveInfo>, // Individual physical drives
|
||||
}
|
||||
|
||||
/// Information about an individual physical drive
|
||||
#[derive(Debug, Clone)]
|
||||
struct DriveInfo {
|
||||
device: String, // e.g., "sda", "nvme0n1"
|
||||
health_status: String, // e.g., "PASSED", "FAILED"
|
||||
temperature: Option<f32>, // e.g., 45.0°C
|
||||
wear_level: Option<f32>, // e.g., 12.0% (for SSDs)
|
||||
}
|
||||
|
||||
/// Disk usage collector for monitoring filesystem sizes
|
||||
@ -51,96 +59,176 @@ impl DiskCollector {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
|
||||
}
|
||||
|
||||
/// Resolve UUID to actual device path
|
||||
fn resolve_uuid_to_device(&self, uuid: &str) -> Result<String> {
|
||||
let uuid_path = format!("/dev/disk/by-uuid/{}", uuid);
|
||||
|
||||
if Path::new(&uuid_path).exists() {
|
||||
match fs::read_link(&uuid_path) {
|
||||
Ok(target) => {
|
||||
// Convert relative path to absolute
|
||||
if target.is_relative() {
|
||||
let parent = Path::new(&uuid_path).parent().unwrap();
|
||||
let resolved = parent.join(&target);
|
||||
match resolved.canonicalize() {
|
||||
Ok(canonical) => Ok(canonical.to_string_lossy().to_string()),
|
||||
Err(_) => Ok(target.to_string_lossy().to_string()),
|
||||
}
|
||||
} else {
|
||||
Ok(target.to_string_lossy().to_string())
|
||||
}
|
||||
}
|
||||
Err(e) => Err(anyhow::anyhow!("Failed to resolve UUID {}: {}", uuid, e)),
|
||||
}
|
||||
} else {
|
||||
Err(anyhow::anyhow!("UUID {} not found in /dev/disk/by-uuid/", uuid))
|
||||
}
|
||||
}
|
||||
|
||||
/// Get configured filesystems from UUIDs
|
||||
fn get_configured_filesystems(&self) -> Result<Vec<MountedDisk>> {
|
||||
let mut configured_disks = Vec::new();
|
||||
/// Get configured storage pools with individual drive information
|
||||
fn get_configured_storage_pools(&self) -> Result<Vec<StoragePool>> {
|
||||
let mut storage_pools = Vec::new();
|
||||
|
||||
for fs_config in &self.config.filesystems {
|
||||
if !fs_config.monitor {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Resolve UUID to device
|
||||
match self.resolve_uuid_to_device(&fs_config.uuid) {
|
||||
Ok(device_path) => {
|
||||
// Get filesystem stats for the mount point
|
||||
match self.get_filesystem_info(&fs_config.mount_point) {
|
||||
Ok((total_bytes, used_bytes)) => {
|
||||
let available_bytes = total_bytes - used_bytes;
|
||||
let usage_percent = if total_bytes > 0 {
|
||||
(used_bytes as f64 / total_bytes as f64) * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
// Get filesystem stats for the mount point
|
||||
match self.get_filesystem_info(&fs_config.mount_point) {
|
||||
Ok((total_bytes, used_bytes)) => {
|
||||
let available_bytes = total_bytes - used_bytes;
|
||||
let usage_percent = if total_bytes > 0 {
|
||||
(used_bytes as f64 / total_bytes as f64) * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
// Convert bytes to human-readable format
|
||||
let size = self.bytes_to_human_readable(total_bytes);
|
||||
let used = self.bytes_to_human_readable(used_bytes);
|
||||
let available = self.bytes_to_human_readable(available_bytes);
|
||||
// Convert bytes to human-readable format
|
||||
let size = self.bytes_to_human_readable(total_bytes);
|
||||
let used = self.bytes_to_human_readable(used_bytes);
|
||||
let available = self.bytes_to_human_readable(available_bytes);
|
||||
|
||||
// Get physical device for SMART monitoring
|
||||
let physical_device = self.get_physical_device(&device_path)?;
|
||||
// Get individual drive information
|
||||
let underlying_drives = self.get_drive_info_for_devices(&fs_config.underlying_devices)?;
|
||||
|
||||
configured_disks.push(MountedDisk {
|
||||
device: device_path.clone(),
|
||||
physical_device,
|
||||
mount_point: fs_config.mount_point.clone(),
|
||||
filesystem: fs_config.fs_type.clone(),
|
||||
size,
|
||||
used,
|
||||
available,
|
||||
usage_percent: usage_percent as f32,
|
||||
});
|
||||
storage_pools.push(StoragePool {
|
||||
name: fs_config.name.clone(),
|
||||
mount_point: fs_config.mount_point.clone(),
|
||||
filesystem: fs_config.fs_type.clone(),
|
||||
storage_type: fs_config.storage_type.clone(),
|
||||
size,
|
||||
used,
|
||||
available,
|
||||
usage_percent: usage_percent as f32,
|
||||
underlying_drives,
|
||||
});
|
||||
|
||||
debug!(
|
||||
"Configured filesystem '{}' (UUID: {}) mounted at {} using {}",
|
||||
fs_config.name, fs_config.uuid, fs_config.mount_point, device_path
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(
|
||||
"Failed to get filesystem info for configured filesystem '{}': {}",
|
||||
fs_config.name, e
|
||||
);
|
||||
}
|
||||
}
|
||||
debug!(
|
||||
"Storage pool '{}' ({}) at {} with {} underlying drives",
|
||||
fs_config.name, fs_config.storage_type, fs_config.mount_point, fs_config.underlying_devices.len()
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(
|
||||
"Failed to resolve UUID for configured filesystem '{}': {}",
|
||||
"Failed to get filesystem info for storage pool '{}': {}",
|
||||
fs_config.name, e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(configured_disks)
|
||||
Ok(storage_pools)
|
||||
}
|
||||
|
||||
/// Get drive information for a list of device names
|
||||
fn get_drive_info_for_devices(&self, device_names: &[String]) -> Result<Vec<DriveInfo>> {
|
||||
let mut drives = Vec::new();
|
||||
|
||||
for device_name in device_names {
|
||||
let device_path = format!("/dev/{}", device_name);
|
||||
|
||||
// Get SMART data for this drive
|
||||
let (health_status, temperature, wear_level) = self.get_smart_data(&device_path);
|
||||
|
||||
drives.push(DriveInfo {
|
||||
device: device_name.clone(),
|
||||
health_status: health_status.clone(),
|
||||
temperature,
|
||||
wear_level,
|
||||
});
|
||||
|
||||
debug!(
|
||||
"Drive info for {}: health={}, temp={:?}°C, wear={:?}%",
|
||||
device_name, health_status, temperature, wear_level
|
||||
);
|
||||
}
|
||||
|
||||
Ok(drives)
|
||||
}
|
||||
|
||||
/// Get SMART data for a drive (health, temperature, wear level)
|
||||
fn get_smart_data(&self, device_path: &str) -> (String, Option<f32>, Option<f32>) {
|
||||
// Try to get SMART data using smartctl
|
||||
let output = Command::new("sudo")
|
||||
.arg("smartctl")
|
||||
.arg("-a")
|
||||
.arg(device_path)
|
||||
.output();
|
||||
|
||||
match output {
|
||||
Ok(result) if result.status.success() => {
|
||||
let stdout = String::from_utf8_lossy(&result.stdout);
|
||||
|
||||
// Parse health status
|
||||
let health = if stdout.contains("PASSED") {
|
||||
"PASSED".to_string()
|
||||
} else if stdout.contains("FAILED") {
|
||||
"FAILED".to_string()
|
||||
} else {
|
||||
"UNKNOWN".to_string()
|
||||
};
|
||||
|
||||
// Parse temperature (look for various temperature indicators)
|
||||
let temperature = self.parse_temperature_from_smart(&stdout);
|
||||
|
||||
// Parse wear level (for SSDs)
|
||||
let wear_level = self.parse_wear_level_from_smart(&stdout);
|
||||
|
||||
(health, temperature, wear_level)
|
||||
}
|
||||
_ => {
|
||||
debug!("Failed to get SMART data for {}", device_path);
|
||||
("UNKNOWN".to_string(), None, None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse temperature from SMART output
|
||||
fn parse_temperature_from_smart(&self, smart_output: &str) -> Option<f32> {
|
||||
for line in smart_output.lines() {
|
||||
// Look for temperature in various formats
|
||||
if line.contains("Temperature_Celsius") || line.contains("Temperature") {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 10 {
|
||||
if let Ok(temp) = parts[9].parse::<f32>() {
|
||||
return Some(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
// NVMe drives might show temperature differently
|
||||
if line.contains("temperature:") {
|
||||
if let Some(temp_part) = line.split("temperature:").nth(1) {
|
||||
if let Some(temp_str) = temp_part.split_whitespace().next() {
|
||||
if let Ok(temp) = temp_str.parse::<f32>() {
|
||||
return Some(temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Parse wear level from SMART output (SSD wear leveling)
|
||||
fn parse_wear_level_from_smart(&self, smart_output: &str) -> Option<f32> {
|
||||
for line in smart_output.lines() {
|
||||
// Look for wear leveling indicators
|
||||
if line.contains("Wear_Leveling_Count") || line.contains("Media_Wearout_Indicator") {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 10 {
|
||||
if let Ok(wear) = parts[9].parse::<f32>() {
|
||||
return Some(100.0 - wear); // Convert to percentage used
|
||||
}
|
||||
}
|
||||
}
|
||||
// NVMe drives might show percentage used directly
|
||||
if line.contains("Percentage Used:") {
|
||||
if let Some(wear_part) = line.split("Percentage Used:").nth(1) {
|
||||
if let Some(wear_str) = wear_part.split('%').next() {
|
||||
if let Ok(wear) = wear_str.trim().parse::<f32>() {
|
||||
return Some(wear);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Convert bytes to human-readable format
|
||||
@ -218,79 +306,6 @@ impl DiskCollector {
|
||||
Ok((total_bytes, used_bytes))
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Get the physical device for a given device (resolves symlinks, gets parent device)
|
||||
fn get_physical_device(&self, device: &str) -> Result<String> {
|
||||
// For NVMe: /dev/nvme0n1p1 -> /dev/nvme0n1
|
||||
if device.contains("nvme") && device.contains("p") {
|
||||
if let Some(base) = device.split('p').next() {
|
||||
return Ok(base.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// For SATA: /dev/sda1 -> /dev/sda
|
||||
if device.starts_with("/dev/sd") && device.len() > 8 {
|
||||
return Ok(device[..8].to_string()); // Keep /dev/sdX
|
||||
}
|
||||
|
||||
// For VirtIO: /dev/vda1 -> /dev/vda
|
||||
if device.starts_with("/dev/vd") && device.len() > 8 {
|
||||
return Ok(device[..8].to_string());
|
||||
}
|
||||
|
||||
// If no partition detected, return as-is
|
||||
Ok(device.to_string())
|
||||
}
|
||||
|
||||
/// Get SMART health for a specific physical device
|
||||
fn get_smart_health(&self, device: &str) -> (String, f32) {
|
||||
if let Ok(output) = Command::new("sudo")
|
||||
.arg("smartctl")
|
||||
.arg("-H")
|
||||
.arg(device)
|
||||
.output()
|
||||
{
|
||||
if output.status.success() {
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
let health_status = if output_str.contains("PASSED") {
|
||||
"PASSED"
|
||||
} else if output_str.contains("FAILED") {
|
||||
"FAILED"
|
||||
} else {
|
||||
"UNKNOWN"
|
||||
};
|
||||
|
||||
// Try to get temperature
|
||||
let temperature = if let Ok(temp_output) = Command::new("sudo")
|
||||
.arg("smartctl")
|
||||
.arg("-A")
|
||||
.arg(device)
|
||||
.output()
|
||||
{
|
||||
let temp_str = String::from_utf8_lossy(&temp_output.stdout);
|
||||
// Look for temperature in SMART attributes
|
||||
for line in temp_str.lines() {
|
||||
if line.contains("Temperature") && line.contains("Celsius") {
|
||||
if let Some(temp_part) = line.split_whitespace().nth(9) {
|
||||
if let Ok(temp) = temp_part.parse::<f32>() {
|
||||
return (health_status.to_string(), temp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
0.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
return (health_status.to_string(), temperature);
|
||||
}
|
||||
}
|
||||
|
||||
("UNKNOWN".to_string(), 0.0)
|
||||
}
|
||||
|
||||
/// Calculate status based on usage percentage
|
||||
fn calculate_usage_status(&self, used_bytes: u64, total_bytes: u64) -> Status {
|
||||
if total_bytes == 0 {
|
||||
@ -350,197 +365,168 @@ impl Collector for DiskCollector {
|
||||
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Collecting multi-disk metrics");
|
||||
debug!("Collecting storage pool and individual drive metrics");
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
|
||||
// Use UUID-based configured filesystems
|
||||
let mounted_disks = match self.get_configured_filesystems() {
|
||||
Ok(configured) => {
|
||||
debug!("Using UUID-based filesystems: {} found", configured.len());
|
||||
configured
|
||||
// Get configured storage pools with individual drive data
|
||||
let storage_pools = match self.get_configured_storage_pools() {
|
||||
Ok(pools) => {
|
||||
debug!("Found {} storage pools", pools.len());
|
||||
pools
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get configured filesystems: {}", e);
|
||||
debug!("Failed to get storage pools: {}", e);
|
||||
Vec::new()
|
||||
}
|
||||
};
|
||||
|
||||
// Process discovered/configured disks
|
||||
if !mounted_disks.is_empty() {
|
||||
debug!("Found {} mounted disks", mounted_disks.len());
|
||||
// Generate metrics for each storage pool and its underlying drives
|
||||
for storage_pool in &storage_pools {
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
// Group disks by physical device to avoid duplicate SMART checks
|
||||
let mut physical_devices: std::collections::HashMap<String, Vec<&MountedDisk>> =
|
||||
std::collections::HashMap::new();
|
||||
for disk in &mounted_disks {
|
||||
physical_devices
|
||||
.entry(disk.physical_device.clone())
|
||||
.or_insert_with(Vec::new)
|
||||
.push(disk);
|
||||
}
|
||||
// Storage pool overall metrics
|
||||
let pool_name = &storage_pool.name;
|
||||
|
||||
// Parse size strings to get actual values for calculations
|
||||
let size_gb = self.parse_size_to_gb(&storage_pool.size);
|
||||
let used_gb = self.parse_size_to_gb(&storage_pool.used);
|
||||
let avail_gb = self.parse_size_to_gb(&storage_pool.available);
|
||||
|
||||
// Generate metrics for each mounted disk
|
||||
for (disk_index, disk) in mounted_disks.iter().enumerate() {
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
// Calculate status based on configured thresholds
|
||||
let pool_status = if storage_pool.usage_percent >= self.config.usage_critical_percent {
|
||||
Status::Critical
|
||||
} else if storage_pool.usage_percent >= self.config.usage_warning_percent {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
|
||||
// Always use index for metric names to maintain dashboard compatibility
|
||||
let disk_name = disk_index.to_string();
|
||||
// Storage pool info metrics
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_mount_point", pool_name),
|
||||
value: MetricValue::String(storage_pool.mount_point.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Mount: {}", storage_pool.mount_point)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Parse size strings to get actual values for calculations
|
||||
let size_gb = self.parse_size_to_gb(&disk.size);
|
||||
let used_gb = self.parse_size_to_gb(&disk.used);
|
||||
let avail_gb = self.parse_size_to_gb(&disk.available);
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_filesystem", pool_name),
|
||||
value: MetricValue::String(storage_pool.filesystem.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("FS: {}", storage_pool.filesystem)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Calculate status based on configured thresholds
|
||||
let status = if disk.usage_percent >= self.config.usage_critical_percent {
|
||||
Status::Critical
|
||||
} else if disk.usage_percent >= self.config.usage_warning_percent {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_storage_type", pool_name),
|
||||
value: MetricValue::String(storage_pool.storage_type.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Type: {}", storage_pool.storage_type)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Device and mount point info
|
||||
// Storage pool size metrics
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_total_gb", pool_name),
|
||||
value: MetricValue::Float(size_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Total: {}", storage_pool.size)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_used_gb", pool_name),
|
||||
value: MetricValue::Float(used_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Used: {}", storage_pool.used)),
|
||||
status: pool_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_available_gb", pool_name),
|
||||
value: MetricValue::Float(avail_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Available: {}", storage_pool.available)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_usage_percent", pool_name),
|
||||
value: MetricValue::Float(storage_pool.usage_percent),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("Usage: {:.1}%", storage_pool.usage_percent)),
|
||||
status: pool_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Individual drive metrics for this storage pool
|
||||
for drive in &storage_pool.underlying_drives {
|
||||
// Drive health status
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_device", disk_name),
|
||||
value: MetricValue::String(disk.device.clone()),
|
||||
name: format!("disk_{}_{}_health", pool_name, drive.device),
|
||||
value: MetricValue::String(drive.health_status.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Device: {}", disk.device)),
|
||||
status: Status::Ok,
|
||||
description: Some(format!("{}: {}", drive.device, drive.health_status)),
|
||||
status: if drive.health_status == "PASSED" { Status::Ok }
|
||||
else if drive.health_status == "FAILED" { Status::Critical }
|
||||
else { Status::Unknown },
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_mount_point", disk_name),
|
||||
value: MetricValue::String(disk.mount_point.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Mount: {}", disk.mount_point)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_filesystem", disk_name),
|
||||
value: MetricValue::String(disk.filesystem.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("FS: {}", disk.filesystem)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Size metrics
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_total_gb", disk_name),
|
||||
value: MetricValue::Float(size_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Total: {}", disk.size)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_used_gb", disk_name),
|
||||
value: MetricValue::Float(used_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Used: {}", disk.used)),
|
||||
status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_available_gb", disk_name),
|
||||
value: MetricValue::Float(avail_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Available: {}", disk.available)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_usage_percent", disk_name),
|
||||
value: MetricValue::Float(disk.usage_percent),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("Usage: {:.1}%", disk.usage_percent)),
|
||||
status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Physical device name (for SMART health grouping)
|
||||
let physical_device_name = disk
|
||||
.physical_device
|
||||
.strip_prefix("/dev/")
|
||||
.unwrap_or(&disk.physical_device);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_physical_device", disk_name),
|
||||
value: MetricValue::String(physical_device_name.to_string()),
|
||||
unit: None,
|
||||
description: Some(format!("Physical: {}", physical_device_name)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
|
||||
// Add SMART health metrics for each unique physical device
|
||||
for (physical_device, _disks) in physical_devices {
|
||||
let (health_status, temperature) = self.get_smart_health(&physical_device);
|
||||
let device_name = physical_device
|
||||
.strip_prefix("/dev/")
|
||||
.unwrap_or(&physical_device);
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
let health_status_enum = match health_status.as_str() {
|
||||
"PASSED" => Status::Ok,
|
||||
"FAILED" => Status::Critical,
|
||||
_ => Status::Unknown,
|
||||
};
|
||||
|
||||
// Drive temperature
|
||||
if let Some(temp) = drive.temperature {
|
||||
let temp_status = self.calculate_temperature_status(
|
||||
&format!("disk_{}_{}_temperature", pool_name, drive.device),
|
||||
temp,
|
||||
status_tracker
|
||||
);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_smart_{}_health", device_name),
|
||||
value: MetricValue::String(health_status.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("SMART Health: {}", health_status)),
|
||||
status: health_status_enum,
|
||||
name: format!("disk_{}_{}_temperature", pool_name, drive.device),
|
||||
value: MetricValue::Float(temp),
|
||||
unit: Some("°C".to_string()),
|
||||
description: Some(format!("{}: {:.0}°C", drive.device, temp)),
|
||||
status: temp_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
if temperature > 0.0 {
|
||||
let metric_name = format!("disk_smart_{}_temperature", device_name);
|
||||
let temp_status = self.calculate_temperature_status(&metric_name, temperature, status_tracker);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_smart_{}_temperature", device_name),
|
||||
value: MetricValue::Float(temperature),
|
||||
unit: Some("°C".to_string()),
|
||||
description: Some(format!("Temperature: {:.0}°C", temperature)),
|
||||
status: temp_status,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Add disk count metric
|
||||
metrics.push(Metric {
|
||||
name: "disk_count".to_string(),
|
||||
value: MetricValue::Integer(mounted_disks.len() as i64),
|
||||
unit: None,
|
||||
description: Some(format!("Total mounted disks: {}", mounted_disks.len())),
|
||||
status: Status::Ok,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
} else {
|
||||
// No disks configured - add zero count metric
|
||||
metrics.push(Metric {
|
||||
name: "disk_count".to_string(),
|
||||
value: MetricValue::Integer(0),
|
||||
unit: None,
|
||||
description: Some("No disks configured for monitoring".to_string()),
|
||||
status: Status::Warning,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
// Drive wear level (for SSDs)
|
||||
if let Some(wear) = drive.wear_level {
|
||||
let wear_status = if wear >= 90.0 { Status::Critical }
|
||||
else if wear >= 80.0 { Status::Warning }
|
||||
else { Status::Ok };
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_{}_wear_percent", pool_name, drive.device),
|
||||
value: MetricValue::Float(wear),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("{}: {:.0}% wear", drive.device, wear)),
|
||||
status: wear_status,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add storage pool count metric
|
||||
metrics.push(Metric {
|
||||
name: "disk_count".to_string(),
|
||||
value: MetricValue::Integer(storage_pools.len() as i64),
|
||||
unit: None,
|
||||
description: Some(format!("Total storage pools: {}", storage_pools.len())),
|
||||
status: Status::Ok,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
|
||||
// Monitor /tmp directory size (keep existing functionality)
|
||||
match self.get_directory_size("/tmp") {
|
||||
Ok(tmp_size_bytes) => {
|
||||
|
||||
@ -7,11 +7,14 @@ use std::time::Instant;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
use crate::config::SystemdConfig;
|
||||
|
||||
/// Systemd collector for monitoring systemd services
|
||||
pub struct SystemdCollector {
|
||||
/// Cached state with thread-safe interior mutability
|
||||
state: RwLock<ServiceCacheState>,
|
||||
/// Configuration for service monitoring
|
||||
config: SystemdConfig,
|
||||
}
|
||||
|
||||
/// Internal state for service caching
|
||||
@ -32,7 +35,7 @@ struct ServiceCacheState {
|
||||
}
|
||||
|
||||
impl SystemdCollector {
|
||||
pub fn new() -> Self {
|
||||
pub fn new(config: SystemdConfig) -> Self {
|
||||
Self {
|
||||
state: RwLock::new(ServiceCacheState {
|
||||
monitored_services: Vec::new(),
|
||||
@ -42,6 +45,7 @@ impl SystemdCollector {
|
||||
last_nginx_check_time: None,
|
||||
nginx_check_interval_seconds: 30, // 30 seconds for nginx sites
|
||||
}),
|
||||
config,
|
||||
}
|
||||
}
|
||||
|
||||
@ -128,13 +132,8 @@ impl SystemdCollector {
|
||||
.arg("--plain")
|
||||
.output()?;
|
||||
|
||||
// Get hostname to determine which user to check
|
||||
let hostname = gethostname::gethostname().to_string_lossy().to_string();
|
||||
let target_user = match hostname.as_str() {
|
||||
"steambox" | "cmbox" => "cm",
|
||||
"simonbox" => "simon",
|
||||
_ => "cm", // default to cm for unknown hosts
|
||||
};
|
||||
// Use configured user mapping instead of hardcoded hostname logic
|
||||
let target_user = &self.config.host_user_mapping;
|
||||
|
||||
// Also get user unit files (user-level services) for target user
|
||||
let user_unit_files_output = Command::new("sudo")
|
||||
@ -183,68 +182,9 @@ impl SystemdCollector {
|
||||
};
|
||||
let mut services = Vec::new();
|
||||
|
||||
// Skip setup/certificate services that don't need monitoring (from legacy)
|
||||
let excluded_services = [
|
||||
"mosquitto-certs",
|
||||
"immich-setup",
|
||||
"phpfpm-kryddorten",
|
||||
"phpfpm-mariehall2",
|
||||
"acme-haasp.net",
|
||||
"acme-selfsigned-haasp",
|
||||
"borgbackup",
|
||||
"haasp-site-deploy",
|
||||
"mosquitto-backup",
|
||||
"nginx-config-reload",
|
||||
"sshd-keygen",
|
||||
"sshd-unix-local@",
|
||||
"sshd@",
|
||||
"docker-prune",
|
||||
"docker-registry-gar",
|
||||
"ark-permissions",
|
||||
];
|
||||
|
||||
// Define patterns for services we want to monitor (from legacy)
|
||||
let service_name_filters = [
|
||||
// Web applications
|
||||
"gitea",
|
||||
"immich",
|
||||
"vaultwarden",
|
||||
"unifi",
|
||||
"wordpress",
|
||||
"nginx",
|
||||
"httpd",
|
||||
// Databases
|
||||
"postgresql",
|
||||
"mysql",
|
||||
"mariadb",
|
||||
"redis",
|
||||
"mongodb",
|
||||
"mongod",
|
||||
// Backup and storage
|
||||
"borg",
|
||||
"rclone",
|
||||
// Container runtimes
|
||||
"docker",
|
||||
// CI/CD services
|
||||
"gitea-actions",
|
||||
"gitea-runner",
|
||||
"actions-runner",
|
||||
// Network services
|
||||
"sshd",
|
||||
"dnsmasq",
|
||||
// MQTT and IoT services
|
||||
"mosquitto",
|
||||
"mqtt",
|
||||
// PHP-FPM services
|
||||
"phpfpm",
|
||||
// Home automation
|
||||
"haasp",
|
||||
// Backup services
|
||||
"backup",
|
||||
// Game servers
|
||||
"ark",
|
||||
"sunshine",
|
||||
];
|
||||
// Use configuration instead of hardcoded values
|
||||
let excluded_services = &self.config.excluded_services;
|
||||
let service_name_filters = &self.config.service_name_filters;
|
||||
|
||||
// Parse both unit files and loaded units
|
||||
let mut all_service_names = std::collections::HashSet::new();
|
||||
@ -295,7 +235,7 @@ impl SystemdCollector {
|
||||
|
||||
// Skip excluded services first
|
||||
let mut is_excluded = false;
|
||||
for excluded in &excluded_services {
|
||||
for excluded in excluded_services {
|
||||
if service_name.contains(excluded) {
|
||||
debug!(
|
||||
"EXCLUDING service '{}' because it matches pattern '{}'",
|
||||
@ -312,7 +252,7 @@ impl SystemdCollector {
|
||||
}
|
||||
|
||||
// Check if this service matches our filter patterns
|
||||
for pattern in &service_name_filters {
|
||||
for pattern in service_name_filters {
|
||||
if service_name.contains(pattern) || pattern.contains(service_name) {
|
||||
debug!(
|
||||
"INCLUDING service '{}' because it matches pattern '{}'",
|
||||
@ -411,22 +351,21 @@ impl SystemdCollector {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get service disk usage - simple and deterministic
|
||||
/// Get service disk usage - simplified and configuration-driven
|
||||
fn get_service_disk_usage(&self, service: &str) -> Option<f32> {
|
||||
// 1. Check if service has defined directories
|
||||
let defined_dirs = self.get_service_directories(service);
|
||||
if !defined_dirs.is_empty() {
|
||||
// Service has defined paths - use ONLY those
|
||||
for dir in defined_dirs {
|
||||
// 1. Check if service has configured directories (exact match only)
|
||||
if let Some(dirs) = self.config.service_directories.get(service) {
|
||||
// Service has configured paths - use the first accessible one
|
||||
for dir in dirs {
|
||||
if let Some(size) = self.get_directory_size(dir) {
|
||||
return Some(size);
|
||||
}
|
||||
}
|
||||
// If defined path failed, return None (shows as "-")
|
||||
return None;
|
||||
// If configured paths failed, return None (shows as 0)
|
||||
return Some(0.0);
|
||||
}
|
||||
|
||||
// 2. No defined path - use systemctl WorkingDirectory
|
||||
// 2. No configured path - use systemctl WorkingDirectory
|
||||
let output = Command::new("systemctl")
|
||||
.arg("show")
|
||||
.arg(format!("{}.service", service))
|
||||
@ -447,36 +386,6 @@ impl SystemdCollector {
|
||||
None
|
||||
}
|
||||
|
||||
/// Get defined service directories (highest priority)
|
||||
fn get_service_directories(&self, service: &str) -> Vec<&str> {
|
||||
match service {
|
||||
// Game servers (ARK Survival Ascended) - HIGHEST PRIORITY
|
||||
"ark-island" => vec!["/var/lib/ark-servers/island"],
|
||||
"ark-scorched" => vec!["/var/lib/ark-servers/scorched"],
|
||||
"ark-center" => vec!["/var/lib/ark-servers/center"],
|
||||
"ark-aberration" => vec!["/var/lib/ark-servers/aberration"],
|
||||
"ark-extinction" => vec!["/var/lib/ark-servers/extinction"],
|
||||
"ark-ragnarok" => vec!["/var/lib/ark-servers/ragnarok"],
|
||||
"ark-valguero" => vec!["/var/lib/ark-servers/valguero"],
|
||||
|
||||
// Other services with defined paths
|
||||
s if s.contains("docker") => vec!["/var/lib/docker", "/var/lib/docker/containers"],
|
||||
s if s.contains("gitea") => vec!["/var/lib/gitea", "/opt/gitea", "/home/git", "/data/gitea"],
|
||||
s if s.contains("nginx") => vec!["/var/log/nginx", "/var/www", "/usr/share/nginx"],
|
||||
s if s.contains("immich") => vec!["/var/lib/immich", "/opt/immich", "/usr/src/app/upload"],
|
||||
s if s.contains("postgres") => vec!["/var/lib/postgresql", "/var/lib/postgres"],
|
||||
s if s.contains("mysql") => vec!["/var/lib/mysql"],
|
||||
s if s.contains("redis") => vec!["/var/lib/redis", "/var/redis"],
|
||||
s if s.contains("unifi") => vec!["/var/lib/unifi", "/opt/UniFi"],
|
||||
s if s.contains("vaultwarden") => vec!["/var/lib/vaultwarden", "/opt/vaultwarden"],
|
||||
s if s.contains("mosquitto") => vec!["/var/lib/mosquitto", "/etc/mosquitto"],
|
||||
s if s.contains("postfix") => vec!["/var/spool/postfix", "/var/lib/postfix"],
|
||||
|
||||
// No defined path - will fall back to systemctl WorkingDirectory
|
||||
_ => vec![],
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -82,8 +82,10 @@ pub struct FilesystemConfig {
|
||||
pub name: String,
|
||||
pub uuid: String,
|
||||
pub mount_point: String,
|
||||
pub fs_type: String,
|
||||
pub fs_type: String, // "ext4", "zfs", "xfs", "mergerfs", "btrfs"
|
||||
pub monitor: bool,
|
||||
pub storage_type: String, // "single", "raid", "mirror", "mergerfs", "zfs"
|
||||
pub underlying_devices: Vec<String>, // ["sda", "sdb", "sdc"] or ["nvme0n1"]
|
||||
}
|
||||
|
||||
|
||||
@ -96,6 +98,8 @@ pub struct SystemdConfig {
|
||||
pub excluded_services: Vec<String>,
|
||||
pub memory_warning_mb: f32,
|
||||
pub memory_critical_mb: f32,
|
||||
pub service_directories: std::collections::HashMap<String, Vec<String>>,
|
||||
pub host_user_mapping: String,
|
||||
}
|
||||
|
||||
/// SMART collector configuration
|
||||
|
||||
@ -51,7 +51,7 @@ impl MetricCollectionManager {
|
||||
}
|
||||
Some("systemd") => {
|
||||
// Systemd collector only
|
||||
let systemd_collector = SystemdCollector::new();
|
||||
let systemd_collector = SystemdCollector::new(config.systemd.clone());
|
||||
collectors.push(Box::new(systemd_collector));
|
||||
info!("BENCHMARK: Systemd collector only");
|
||||
}
|
||||
@ -88,7 +88,7 @@ impl MetricCollectionManager {
|
||||
collectors.push(Box::new(disk_collector));
|
||||
info!("Disk collector initialized");
|
||||
|
||||
let systemd_collector = SystemdCollector::new();
|
||||
let systemd_collector = SystemdCollector::new(config.systemd.clone());
|
||||
collectors.push(Box::new(systemd_collector));
|
||||
info!("Systemd collector initialized");
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user