Add comprehensive hysteresis support to prevent status oscillation near threshold boundaries while maintaining responsive alerting.

Key Features:
- HysteresisThresholds with configurable upper/lower limits
- StatusTracker for per-metric status history
- Default gaps: CPU load 10%, memory 5%, disk temp 5°C

Updated Components:
- CPU load collector (5-minute average with hysteresis)
- Memory usage collector (percentage-based thresholds)
- Disk temperature collector (SMART data monitoring)
- All collectors updated to support StatusTracker interface

Cache Interval Adjustments:
- Service status: 60s → 10s (faster response)
- Disk usage: 300s → 60s (more frequent checks)
- Backup status: 900s → 60s (quicker updates)
- SMART data: moved to 600s tier (10 minutes)

Architecture:
- Individual metric status calculation in collectors
- Centralized StatusTracker in MetricCollectionManager
- Status aggregation preserved in dashboard widgets
618 lines
24 KiB
Rust
618 lines
24 KiB
Rust
use anyhow::Result;
|
|
use async_trait::async_trait;
|
|
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
|
|
|
use crate::config::DiskConfig;
|
|
use std::fs;
|
|
use std::path::Path;
|
|
use std::process::Command;
|
|
use std::time::Instant;
|
|
use tracing::debug;
|
|
|
|
use super::{Collector, CollectorError, PerformanceMetrics};
|
|
|
|
/// A monitored filesystem together with the usage figures gathered for it.
#[derive(Debug, Clone)]
struct MountedDisk {
    /// Partition device node, e.g. "/dev/nvme0n1p1".
    device: String,
    /// Whole-disk device behind the partition, e.g. "/dev/nvme0n1".
    physical_device: String,
    /// Where the filesystem is mounted, e.g. "/".
    mount_point: String,
    /// Filesystem type, e.g. "ext4".
    filesystem: String,
    /// Human-readable total size, e.g. "120G".
    size: String,
    /// Human-readable used space, e.g. "45G".
    used: String,
    /// Human-readable free space, e.g. "75G".
    available: String,
    /// Used space as a percentage of the total, e.g. 38.5.
    usage_percent: f32,
    /// Name from config if UUID-based.
    config_name: Option<String>,
}
|
|
|
|
/// Disk usage collector for monitoring filesystem sizes
|
|
pub struct DiskCollector {
|
|
config: DiskConfig,
|
|
temperature_thresholds: HysteresisThresholds,
|
|
}
|
|
|
|
impl DiskCollector {
|
|
pub fn new(config: DiskConfig) -> Self {
|
|
// Create hysteresis thresholds for disk temperature
|
|
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
|
|
60.0, // warning at 60°C
|
|
5.0, // 5°C gap for recovery
|
|
70.0, // critical at 70°C
|
|
5.0, // 5°C gap for recovery
|
|
);
|
|
|
|
Self {
|
|
config,
|
|
temperature_thresholds,
|
|
}
|
|
}
|
|
|
|
/// Calculate disk temperature status using hysteresis thresholds
|
|
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
|
|
status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
|
|
}
|
|
|
|
/// Resolve UUID to actual device path
|
|
fn resolve_uuid_to_device(&self, uuid: &str) -> Result<String> {
|
|
let uuid_path = format!("/dev/disk/by-uuid/{}", uuid);
|
|
|
|
if Path::new(&uuid_path).exists() {
|
|
match fs::read_link(&uuid_path) {
|
|
Ok(target) => {
|
|
// Convert relative path to absolute
|
|
if target.is_relative() {
|
|
let parent = Path::new(&uuid_path).parent().unwrap();
|
|
let resolved = parent.join(&target);
|
|
match resolved.canonicalize() {
|
|
Ok(canonical) => Ok(canonical.to_string_lossy().to_string()),
|
|
Err(_) => Ok(target.to_string_lossy().to_string()),
|
|
}
|
|
} else {
|
|
Ok(target.to_string_lossy().to_string())
|
|
}
|
|
}
|
|
Err(e) => Err(anyhow::anyhow!("Failed to resolve UUID {}: {}", uuid, e)),
|
|
}
|
|
} else {
|
|
Err(anyhow::anyhow!("UUID {} not found in /dev/disk/by-uuid/", uuid))
|
|
}
|
|
}
|
|
|
|
/// Get configured filesystems from UUIDs
|
|
fn get_configured_filesystems(&self) -> Result<Vec<MountedDisk>> {
|
|
let mut configured_disks = Vec::new();
|
|
|
|
for fs_config in &self.config.filesystems {
|
|
if !fs_config.monitor {
|
|
continue;
|
|
}
|
|
|
|
// Resolve UUID to device
|
|
match self.resolve_uuid_to_device(&fs_config.uuid) {
|
|
Ok(device_path) => {
|
|
// Get filesystem stats for the mount point
|
|
match self.get_filesystem_info(&fs_config.mount_point) {
|
|
Ok((total_bytes, used_bytes)) => {
|
|
let available_bytes = total_bytes - used_bytes;
|
|
let usage_percent = if total_bytes > 0 {
|
|
(used_bytes as f64 / total_bytes as f64) * 100.0
|
|
} else {
|
|
0.0
|
|
};
|
|
|
|
// Convert bytes to human-readable format
|
|
let size = self.bytes_to_human_readable(total_bytes);
|
|
let used = self.bytes_to_human_readable(used_bytes);
|
|
let available = self.bytes_to_human_readable(available_bytes);
|
|
|
|
// Get physical device for SMART monitoring
|
|
let physical_device = self.get_physical_device(&device_path)?;
|
|
|
|
configured_disks.push(MountedDisk {
|
|
device: device_path.clone(),
|
|
physical_device,
|
|
mount_point: fs_config.mount_point.clone(),
|
|
filesystem: fs_config.fs_type.clone(),
|
|
size,
|
|
used,
|
|
available,
|
|
usage_percent: usage_percent as f32,
|
|
config_name: Some(fs_config.name.clone()),
|
|
});
|
|
|
|
debug!(
|
|
"Configured filesystem '{}' (UUID: {}) mounted at {} using {}",
|
|
fs_config.name, fs_config.uuid, fs_config.mount_point, device_path
|
|
);
|
|
}
|
|
Err(e) => {
|
|
debug!(
|
|
"Failed to get filesystem info for configured filesystem '{}': {}",
|
|
fs_config.name, e
|
|
);
|
|
}
|
|
}
|
|
}
|
|
Err(e) => {
|
|
debug!(
|
|
"Failed to resolve UUID for configured filesystem '{}': {}",
|
|
fs_config.name, e
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(configured_disks)
|
|
}
|
|
|
|
/// Convert bytes to human-readable format
|
|
fn bytes_to_human_readable(&self, bytes: u64) -> String {
|
|
const UNITS: &[&str] = &["B", "K", "M", "G", "T"];
|
|
let mut size = bytes as f64;
|
|
let mut unit_index = 0;
|
|
|
|
while size >= 1024.0 && unit_index < UNITS.len() - 1 {
|
|
size /= 1024.0;
|
|
unit_index += 1;
|
|
}
|
|
|
|
if unit_index == 0 {
|
|
format!("{:.0}{}", size, UNITS[unit_index])
|
|
} else {
|
|
format!("{:.1}{}", size, UNITS[unit_index])
|
|
}
|
|
}
|
|
|
|
/// Get directory size using du command (efficient for single directory)
|
|
fn get_directory_size(&self, path: &str) -> Result<u64> {
|
|
let output = Command::new("du")
|
|
.arg("-s")
|
|
.arg("--block-size=1")
|
|
.arg(path)
|
|
.output()?;
|
|
|
|
// du returns success even with permission denied warnings in stderr
|
|
// We only care if the command completely failed or produced no stdout
|
|
let output_str = String::from_utf8(output.stdout)?;
|
|
|
|
if output_str.trim().is_empty() {
|
|
return Err(anyhow::anyhow!(
|
|
"du command produced no output for {}",
|
|
path
|
|
));
|
|
}
|
|
|
|
let size_str = output_str
|
|
.split_whitespace()
|
|
.next()
|
|
.ok_or_else(|| anyhow::anyhow!("Failed to parse du output"))?;
|
|
|
|
let size_bytes = size_str.parse::<u64>()?;
|
|
Ok(size_bytes)
|
|
}
|
|
|
|
/// Get filesystem info using df command
|
|
fn get_filesystem_info(&self, path: &str) -> Result<(u64, u64)> {
|
|
let output = Command::new("df")
|
|
.arg("--block-size=1")
|
|
.arg(path)
|
|
.output()?;
|
|
|
|
if !output.status.success() {
|
|
return Err(anyhow::anyhow!("df command failed for {}", path));
|
|
}
|
|
|
|
let output_str = String::from_utf8(output.stdout)?;
|
|
let lines: Vec<&str> = output_str.lines().collect();
|
|
|
|
if lines.len() < 2 {
|
|
return Err(anyhow::anyhow!("Unexpected df output format"));
|
|
}
|
|
|
|
let fields: Vec<&str> = lines[1].split_whitespace().collect();
|
|
if fields.len() < 4 {
|
|
return Err(anyhow::anyhow!("Unexpected df fields count"));
|
|
}
|
|
|
|
let total_bytes = fields[1].parse::<u64>()?;
|
|
let used_bytes = fields[2].parse::<u64>()?;
|
|
|
|
Ok((total_bytes, used_bytes))
|
|
}
|
|
|
|
|
|
|
|
/// Get the physical device for a given device (resolves symlinks, gets parent device)
|
|
fn get_physical_device(&self, device: &str) -> Result<String> {
|
|
// For NVMe: /dev/nvme0n1p1 -> /dev/nvme0n1
|
|
if device.contains("nvme") && device.contains("p") {
|
|
if let Some(base) = device.split('p').next() {
|
|
return Ok(base.to_string());
|
|
}
|
|
}
|
|
|
|
// For SATA: /dev/sda1 -> /dev/sda
|
|
if device.starts_with("/dev/sd") && device.len() > 8 {
|
|
return Ok(device[..8].to_string()); // Keep /dev/sdX
|
|
}
|
|
|
|
// For VirtIO: /dev/vda1 -> /dev/vda
|
|
if device.starts_with("/dev/vd") && device.len() > 8 {
|
|
return Ok(device[..8].to_string());
|
|
}
|
|
|
|
// If no partition detected, return as-is
|
|
Ok(device.to_string())
|
|
}
|
|
|
|
/// Get SMART health for a specific physical device
|
|
fn get_smart_health(&self, device: &str) -> (String, f32) {
|
|
if let Ok(output) = Command::new("sudo")
|
|
.arg("smartctl")
|
|
.arg("-H")
|
|
.arg(device)
|
|
.output()
|
|
{
|
|
if output.status.success() {
|
|
let output_str = String::from_utf8_lossy(&output.stdout);
|
|
let health_status = if output_str.contains("PASSED") {
|
|
"PASSED"
|
|
} else if output_str.contains("FAILED") {
|
|
"FAILED"
|
|
} else {
|
|
"UNKNOWN"
|
|
};
|
|
|
|
// Try to get temperature
|
|
let temperature = if let Ok(temp_output) = Command::new("sudo")
|
|
.arg("smartctl")
|
|
.arg("-A")
|
|
.arg(device)
|
|
.output()
|
|
{
|
|
let temp_str = String::from_utf8_lossy(&temp_output.stdout);
|
|
// Look for temperature in SMART attributes
|
|
for line in temp_str.lines() {
|
|
if line.contains("Temperature") && line.contains("Celsius") {
|
|
if let Some(temp_part) = line.split_whitespace().nth(9) {
|
|
if let Ok(temp) = temp_part.parse::<f32>() {
|
|
return (health_status.to_string(), temp);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
0.0
|
|
} else {
|
|
0.0
|
|
};
|
|
|
|
return (health_status.to_string(), temperature);
|
|
}
|
|
}
|
|
|
|
("UNKNOWN".to_string(), 0.0)
|
|
}
|
|
|
|
/// Calculate status based on usage percentage
|
|
fn calculate_usage_status(&self, used_bytes: u64, total_bytes: u64) -> Status {
|
|
if total_bytes == 0 {
|
|
return Status::Unknown;
|
|
}
|
|
|
|
let usage_percent = (used_bytes as f64 / total_bytes as f64) * 100.0;
|
|
|
|
// Thresholds for disk usage
|
|
if usage_percent >= 95.0 {
|
|
Status::Critical
|
|
} else if usage_percent >= 85.0 {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
}
|
|
}
|
|
|
|
/// Parse size string (e.g., "120G", "45M") to GB value
|
|
fn parse_size_to_gb(&self, size_str: &str) -> f32 {
|
|
let size_str = size_str.trim();
|
|
if size_str.is_empty() || size_str == "-" {
|
|
return 0.0;
|
|
}
|
|
|
|
// Extract numeric part and unit
|
|
let (num_str, unit) = if let Some(last_char) = size_str.chars().last() {
|
|
if last_char.is_alphabetic() {
|
|
let num_part = &size_str[..size_str.len() - 1];
|
|
let unit_part = &size_str[size_str.len() - 1..];
|
|
(num_part, unit_part)
|
|
} else {
|
|
(size_str, "")
|
|
}
|
|
} else {
|
|
(size_str, "")
|
|
};
|
|
|
|
let number: f32 = num_str.parse().unwrap_or(0.0);
|
|
|
|
match unit.to_uppercase().as_str() {
|
|
"T" | "TB" => number * 1024.0,
|
|
"G" | "GB" => number,
|
|
"M" | "MB" => number / 1024.0,
|
|
"K" | "KB" => number / (1024.0 * 1024.0),
|
|
"B" | "" => number / (1024.0 * 1024.0 * 1024.0),
|
|
_ => number, // Assume GB if unknown unit
|
|
}
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Collector for DiskCollector {
|
|
fn name(&self) -> &str {
|
|
"disk"
|
|
}
|
|
|
|
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
|
let start_time = Instant::now();
|
|
debug!("Collecting multi-disk metrics");
|
|
|
|
let mut metrics = Vec::new();
|
|
|
|
// Use UUID-based configured filesystems
|
|
let mounted_disks = match self.get_configured_filesystems() {
|
|
Ok(configured) => {
|
|
debug!("Using UUID-based filesystems: {} found", configured.len());
|
|
configured
|
|
}
|
|
Err(e) => {
|
|
debug!("Failed to get configured filesystems: {}", e);
|
|
Vec::new()
|
|
}
|
|
};
|
|
|
|
// Process discovered/configured disks
|
|
if !mounted_disks.is_empty() {
|
|
debug!("Found {} mounted disks", mounted_disks.len());
|
|
|
|
// Group disks by physical device to avoid duplicate SMART checks
|
|
let mut physical_devices: std::collections::HashMap<String, Vec<&MountedDisk>> =
|
|
std::collections::HashMap::new();
|
|
for disk in &mounted_disks {
|
|
physical_devices
|
|
.entry(disk.physical_device.clone())
|
|
.or_insert_with(Vec::new)
|
|
.push(disk);
|
|
}
|
|
|
|
// Generate metrics for each mounted disk
|
|
for (disk_index, disk) in mounted_disks.iter().enumerate() {
|
|
let timestamp = chrono::Utc::now().timestamp() as u64;
|
|
|
|
// Always use index for metric names to maintain dashboard compatibility
|
|
let disk_name = disk_index.to_string();
|
|
|
|
// Parse size strings to get actual values for calculations
|
|
let size_gb = self.parse_size_to_gb(&disk.size);
|
|
let used_gb = self.parse_size_to_gb(&disk.used);
|
|
let avail_gb = self.parse_size_to_gb(&disk.available);
|
|
|
|
// Calculate status based on configured thresholds
|
|
let status = if disk.usage_percent >= self.config.usage_critical_percent {
|
|
Status::Critical
|
|
} else if disk.usage_percent >= self.config.usage_warning_percent {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
};
|
|
|
|
// Device and mount point info
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_device", disk_name),
|
|
value: MetricValue::String(disk.device.clone()),
|
|
unit: None,
|
|
description: Some(format!("Device: {}", disk.device)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_mount_point", disk_name),
|
|
value: MetricValue::String(disk.mount_point.clone()),
|
|
unit: None,
|
|
description: Some(format!("Mount: {}", disk.mount_point)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_filesystem", disk_name),
|
|
value: MetricValue::String(disk.filesystem.clone()),
|
|
unit: None,
|
|
description: Some(format!("FS: {}", disk.filesystem)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
|
|
// Size metrics
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_total_gb", disk_name),
|
|
value: MetricValue::Float(size_gb),
|
|
unit: Some("GB".to_string()),
|
|
description: Some(format!("Total: {}", disk.size)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_used_gb", disk_name),
|
|
value: MetricValue::Float(used_gb),
|
|
unit: Some("GB".to_string()),
|
|
description: Some(format!("Used: {}", disk.used)),
|
|
status,
|
|
timestamp,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_available_gb", disk_name),
|
|
value: MetricValue::Float(avail_gb),
|
|
unit: Some("GB".to_string()),
|
|
description: Some(format!("Available: {}", disk.available)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_usage_percent", disk_name),
|
|
value: MetricValue::Float(disk.usage_percent),
|
|
unit: Some("%".to_string()),
|
|
description: Some(format!("Usage: {:.1}%", disk.usage_percent)),
|
|
status,
|
|
timestamp,
|
|
});
|
|
|
|
// Physical device name (for SMART health grouping)
|
|
let physical_device_name = disk
|
|
.physical_device
|
|
.strip_prefix("/dev/")
|
|
.unwrap_or(&disk.physical_device);
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_{}_physical_device", disk_name),
|
|
value: MetricValue::String(physical_device_name.to_string()),
|
|
unit: None,
|
|
description: Some(format!("Physical: {}", physical_device_name)),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
});
|
|
}
|
|
|
|
// Add SMART health metrics for each unique physical device
|
|
for (physical_device, _disks) in physical_devices {
|
|
let (health_status, temperature) = self.get_smart_health(&physical_device);
|
|
let device_name = physical_device
|
|
.strip_prefix("/dev/")
|
|
.unwrap_or(&physical_device);
|
|
let timestamp = chrono::Utc::now().timestamp() as u64;
|
|
|
|
let health_status_enum = match health_status.as_str() {
|
|
"PASSED" => Status::Ok,
|
|
"FAILED" => Status::Critical,
|
|
_ => Status::Unknown,
|
|
};
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_smart_{}_health", device_name),
|
|
value: MetricValue::String(health_status.clone()),
|
|
unit: None,
|
|
description: Some(format!("SMART Health: {}", health_status)),
|
|
status: health_status_enum,
|
|
timestamp,
|
|
});
|
|
|
|
if temperature > 0.0 {
|
|
let metric_name = format!("disk_smart_{}_temperature", device_name);
|
|
let temp_status = self.calculate_temperature_status(&metric_name, temperature, status_tracker);
|
|
|
|
metrics.push(Metric {
|
|
name: format!("disk_smart_{}_temperature", device_name),
|
|
value: MetricValue::Float(temperature),
|
|
unit: Some("°C".to_string()),
|
|
description: Some(format!("Temperature: {:.0}°C", temperature)),
|
|
status: temp_status,
|
|
timestamp,
|
|
});
|
|
}
|
|
}
|
|
|
|
// Add disk count metric
|
|
metrics.push(Metric {
|
|
name: "disk_count".to_string(),
|
|
value: MetricValue::Integer(mounted_disks.len() as i64),
|
|
unit: None,
|
|
description: Some(format!("Total mounted disks: {}", mounted_disks.len())),
|
|
status: Status::Ok,
|
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
});
|
|
} else {
|
|
// No disks configured - add zero count metric
|
|
metrics.push(Metric {
|
|
name: "disk_count".to_string(),
|
|
value: MetricValue::Integer(0),
|
|
unit: None,
|
|
description: Some("No disks configured for monitoring".to_string()),
|
|
status: Status::Warning,
|
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
});
|
|
}
|
|
|
|
// Monitor /tmp directory size (keep existing functionality)
|
|
match self.get_directory_size("/tmp") {
|
|
Ok(tmp_size_bytes) => {
|
|
let tmp_size_mb = tmp_size_bytes as f64 / (1024.0 * 1024.0);
|
|
|
|
// Get /tmp filesystem info (usually tmpfs with 2GB limit)
|
|
let (total_bytes, _) = match self.get_filesystem_info("/tmp") {
|
|
Ok((total, used)) => (total, used),
|
|
Err(_) => {
|
|
// Fallback: assume 2GB limit for tmpfs
|
|
(2 * 1024 * 1024 * 1024, tmp_size_bytes)
|
|
}
|
|
};
|
|
|
|
let total_mb = total_bytes as f64 / (1024.0 * 1024.0);
|
|
let usage_percent = (tmp_size_bytes as f64 / total_bytes as f64) * 100.0;
|
|
let status = self.calculate_usage_status(tmp_size_bytes, total_bytes);
|
|
|
|
metrics.push(Metric {
|
|
name: "disk_tmp_size_mb".to_string(),
|
|
value: MetricValue::Float(tmp_size_mb as f32),
|
|
unit: Some("MB".to_string()),
|
|
description: Some(format!("Used: {:.1} MB", tmp_size_mb)),
|
|
status,
|
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: "disk_tmp_total_mb".to_string(),
|
|
value: MetricValue::Float(total_mb as f32),
|
|
unit: Some("MB".to_string()),
|
|
description: Some(format!("Total: {:.1} MB", total_mb)),
|
|
status: Status::Ok,
|
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: "disk_tmp_usage_percent".to_string(),
|
|
value: MetricValue::Float(usage_percent as f32),
|
|
unit: Some("%".to_string()),
|
|
description: Some(format!("Usage: {:.1}%", usage_percent)),
|
|
status,
|
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
});
|
|
}
|
|
Err(e) => {
|
|
debug!("Failed to get /tmp size: {}", e);
|
|
metrics.push(Metric {
|
|
name: "disk_tmp_size_mb".to_string(),
|
|
value: MetricValue::String("error".to_string()),
|
|
unit: Some("MB".to_string()),
|
|
description: Some(format!("Error: {}", e)),
|
|
status: Status::Unknown,
|
|
timestamp: chrono::Utc::now().timestamp() as u64,
|
|
});
|
|
}
|
|
}
|
|
|
|
let collection_time = start_time.elapsed();
|
|
debug!(
|
|
"Multi-disk collection completed in {:?} with {} metrics",
|
|
collection_time,
|
|
metrics.len()
|
|
);
|
|
|
|
Ok(metrics)
|
|
}
|
|
|
|
/// This collector reports no performance metrics of its own.
fn get_performance_metrics(&self) -> Option<PerformanceMetrics> {
    // Performance tracking handled by cache system
    None
}
|
|
}
|