- Storage widget: Restructure with Name/Temp/Wear/Usage columns, SMART details as descriptions - Host navigation: Only cycle through connected hosts, no disconnected hosts - Auto-discovery: Skip config files, use predefined CMTEC host list - Maintenance mode: Suppress notifications during backup via /tmp/cm-maintenance file - CPU thresholds: Update to warning ≥9.0, critical ≥10.0 for production use - Agent-dashboard separation: Agent provides descriptions, dashboard displays only
484 lines
16 KiB
Rust
484 lines
16 KiB
Rust
use async_trait::async_trait;
|
|
use chrono::Utc;
|
|
use serde::{Deserialize, Serialize};
|
|
use serde_json::json;
|
|
use std::io::ErrorKind;
|
|
use std::process::Stdio;
|
|
use std::time::Duration;
|
|
use tokio::process::Command;
|
|
use tokio::time::timeout;
|
|
|
|
use super::{AgentType, Collector, CollectorError, CollectorOutput};
|
|
|
|
/// Collector that reports SMART health, capacity and usage for a configured
/// list of block devices.
#[derive(Debug, Clone)]
pub struct SmartCollector {
    /// Value handed back by `collect_interval`.
    pub interval: Duration,
    /// Device names without the `/dev/` prefix (the prefix is added when the
    /// external commands are built), e.g. `nvme0n1` or `sda`.
    pub devices: Vec<String>,
    /// Maximum time, in milliseconds, a single `smartctl` run may take.
    pub timeout_ms: u64,
}
|
|
|
|
impl SmartCollector {
|
|
pub fn new(_enabled: bool, interval_ms: u64, devices: Vec<String>) -> Self {
|
|
Self {
|
|
interval: Duration::from_millis(interval_ms),
|
|
devices,
|
|
timeout_ms: 30000, // 30 second timeout for smartctl
|
|
}
|
|
}
|
|
|
|
async fn is_device_mounted(&self, device: &str) -> bool {
|
|
// Check if device is mounted by looking in /proc/mounts
|
|
if let Ok(mounts) = tokio::fs::read_to_string("/proc/mounts").await {
|
|
for line in mounts.lines() {
|
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
|
if parts.len() >= 2 {
|
|
// Check if this mount point references our device
|
|
// Handle both /dev/nvme0n1p1 style and /dev/sda1 style
|
|
if parts[0].starts_with(&format!("/dev/{}", device)) {
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
false
|
|
}
|
|
|
|
async fn get_smart_data(&self, device: &str) -> Result<SmartDeviceData, CollectorError> {
|
|
let timeout_duration = Duration::from_millis(self.timeout_ms);
|
|
|
|
let command_result = timeout(
|
|
timeout_duration,
|
|
Command::new("smartctl")
|
|
.args(["-a", "-j", &format!("/dev/{}", device)])
|
|
.stdout(Stdio::piped())
|
|
.stderr(Stdio::piped())
|
|
.output(),
|
|
)
|
|
.await
|
|
.map_err(|_| CollectorError::Timeout {
|
|
duration_ms: self.timeout_ms,
|
|
})?;
|
|
|
|
let output = command_result.map_err(|e| match e.kind() {
|
|
ErrorKind::NotFound => CollectorError::ExternalDependency {
|
|
dependency: "smartctl".to_string(),
|
|
message: e.to_string(),
|
|
},
|
|
ErrorKind::PermissionDenied => CollectorError::PermissionDenied {
|
|
message: e.to_string(),
|
|
},
|
|
_ => CollectorError::CommandFailed {
|
|
command: format!("smartctl -a -j /dev/{}", device),
|
|
message: e.to_string(),
|
|
},
|
|
})?;
|
|
|
|
if !output.status.success() {
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
let stderr_lower = stderr.to_lowercase();
|
|
|
|
if stderr_lower.contains("permission denied") {
|
|
return Err(CollectorError::PermissionDenied {
|
|
message: stderr.to_string(),
|
|
});
|
|
}
|
|
|
|
if stderr_lower.contains("no such device") || stderr_lower.contains("cannot open") {
|
|
return Err(CollectorError::DeviceNotFound {
|
|
device: device.to_string(),
|
|
});
|
|
}
|
|
|
|
return Err(CollectorError::CommandFailed {
|
|
command: format!("smartctl -a -j /dev/{}", device),
|
|
message: stderr.to_string(),
|
|
});
|
|
}
|
|
|
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
let smart_output: SmartCtlOutput =
|
|
serde_json::from_str(&stdout).map_err(|e| CollectorError::ParseError {
|
|
message: format!("Failed to parse smartctl output for {}: {}", device, e),
|
|
})?;
|
|
|
|
Ok(SmartDeviceData::from_smartctl_output(device, smart_output))
|
|
}
|
|
|
|
async fn get_drive_usage(
|
|
&self,
|
|
device: &str,
|
|
) -> Result<(Option<f32>, Option<f32>), CollectorError> {
|
|
// Get capacity first
|
|
let capacity = match self.get_drive_capacity(device).await {
|
|
Ok(cap) => Some(cap),
|
|
Err(_) => None,
|
|
};
|
|
|
|
// Try to get usage information
|
|
// For simplicity, we'll use the root filesystem usage for now
|
|
// In the future, this could be enhanced to map drives to specific mount points
|
|
let usage = if device.contains("nvme0n1") || device.contains("sda") {
|
|
// This is likely the main system drive, use root filesystem usage
|
|
match self.get_disk_usage().await {
|
|
Ok(disk_usage) => Some(disk_usage.used_gb),
|
|
Err(_) => None,
|
|
}
|
|
} else {
|
|
// For other drives, we don't have usage info yet
|
|
None
|
|
};
|
|
|
|
Ok((capacity, usage))
|
|
}
|
|
|
|
async fn get_drive_capacity(&self, device: &str) -> Result<f32, CollectorError> {
|
|
let output = Command::new("lsblk")
|
|
.args(["-J", "-o", "NAME,SIZE", &format!("/dev/{}", device)])
|
|
.stdout(Stdio::piped())
|
|
.stderr(Stdio::piped())
|
|
.output()
|
|
.await
|
|
.map_err(|e| CollectorError::CommandFailed {
|
|
command: format!("lsblk -J -o NAME,SIZE /dev/{}", device),
|
|
message: e.to_string(),
|
|
})?;
|
|
|
|
if !output.status.success() {
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
return Err(CollectorError::CommandFailed {
|
|
command: format!("lsblk -J -o NAME,SIZE /dev/{}", device),
|
|
message: stderr.to_string(),
|
|
});
|
|
}
|
|
|
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
let lsblk_output: serde_json::Value =
|
|
serde_json::from_str(&stdout).map_err(|e| CollectorError::ParseError {
|
|
message: format!("Failed to parse lsblk JSON: {}", e),
|
|
})?;
|
|
|
|
// Extract size from the first blockdevice
|
|
if let Some(blockdevices) = lsblk_output["blockdevices"].as_array() {
|
|
if let Some(device_info) = blockdevices.first() {
|
|
if let Some(size_str) = device_info["size"].as_str() {
|
|
return self.parse_lsblk_size(size_str);
|
|
}
|
|
}
|
|
}
|
|
|
|
Err(CollectorError::ParseError {
|
|
message: format!("No size information found for device {}", device),
|
|
})
|
|
}
|
|
|
|
fn parse_lsblk_size(&self, size_str: &str) -> Result<f32, CollectorError> {
|
|
// Parse sizes like "953,9G", "1T", "512M"
|
|
let size_str = size_str.replace(',', "."); // Handle European decimal separator
|
|
|
|
if let Some(pos) = size_str.find(|c: char| c.is_alphabetic()) {
|
|
let (number_part, unit_part) = size_str.split_at(pos);
|
|
let number: f32 = number_part
|
|
.parse()
|
|
.map_err(|e| CollectorError::ParseError {
|
|
message: format!("Failed to parse size number '{}': {}", number_part, e),
|
|
})?;
|
|
|
|
let multiplier = match unit_part.to_uppercase().as_str() {
|
|
"T" | "TB" => 1024.0,
|
|
"G" | "GB" => 1.0,
|
|
"M" | "MB" => 1.0 / 1024.0,
|
|
"K" | "KB" => 1.0 / (1024.0 * 1024.0),
|
|
_ => {
|
|
return Err(CollectorError::ParseError {
|
|
message: format!("Unknown size unit: {}", unit_part),
|
|
})
|
|
}
|
|
};
|
|
|
|
Ok(number * multiplier)
|
|
} else {
|
|
Err(CollectorError::ParseError {
|
|
message: format!("Invalid size format: {}", size_str),
|
|
})
|
|
}
|
|
}
|
|
|
|
async fn get_disk_usage(&self) -> Result<DiskUsage, CollectorError> {
|
|
let output = Command::new("df")
|
|
.args(["-BG", "--output=size,used,avail", "/"])
|
|
.stdout(Stdio::piped())
|
|
.stderr(Stdio::piped())
|
|
.output()
|
|
.await
|
|
.map_err(|e| CollectorError::CommandFailed {
|
|
command: "df -BG --output=size,used,avail /".to_string(),
|
|
message: e.to_string(),
|
|
})?;
|
|
|
|
if !output.status.success() {
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
return Err(CollectorError::CommandFailed {
|
|
command: "df -BG --output=size,used,avail /".to_string(),
|
|
message: stderr.to_string(),
|
|
});
|
|
}
|
|
|
|
let stdout = String::from_utf8_lossy(&output.stdout);
|
|
let lines: Vec<&str> = stdout.lines().collect();
|
|
|
|
if lines.len() < 2 {
|
|
return Err(CollectorError::ParseError {
|
|
message: "Unexpected df output format".to_string(),
|
|
});
|
|
}
|
|
|
|
// Skip header line, parse data line
|
|
let data_line = lines[1].trim();
|
|
let parts: Vec<&str> = data_line.split_whitespace().collect();
|
|
|
|
if parts.len() < 3 {
|
|
return Err(CollectorError::ParseError {
|
|
message: format!("Unexpected df data format: {}", data_line),
|
|
});
|
|
}
|
|
|
|
let parse_size = |s: &str| -> Result<f32, CollectorError> {
|
|
s.trim_end_matches('G')
|
|
.parse::<f32>()
|
|
.map_err(|e| CollectorError::ParseError {
|
|
message: format!("Failed to parse disk size '{}': {}", s, e),
|
|
})
|
|
};
|
|
|
|
Ok(DiskUsage {
|
|
total_gb: parse_size(parts[0])?,
|
|
used_gb: parse_size(parts[1])?,
|
|
available_gb: parse_size(parts[2])?,
|
|
})
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Collector for SmartCollector {
|
|
fn name(&self) -> &str {
|
|
"smart"
|
|
}
|
|
|
|
fn agent_type(&self) -> AgentType {
|
|
AgentType::Smart
|
|
}
|
|
|
|
fn collect_interval(&self) -> Duration {
|
|
self.interval
|
|
}
|
|
|
|
|
|
async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
|
|
let mut drives = Vec::new();
|
|
let mut issues = Vec::new();
|
|
let mut healthy = 0;
|
|
let mut warning = 0;
|
|
let mut critical = 0;
|
|
|
|
// Collect data from all configured devices
|
|
for device in &self.devices {
|
|
// Skip unmounted devices
|
|
if !self.is_device_mounted(device).await {
|
|
continue;
|
|
}
|
|
|
|
match self.get_smart_data(device).await {
|
|
Ok(mut drive_data) => {
|
|
// Try to get capacity and usage for this drive
|
|
if let Ok((capacity, usage)) = self.get_drive_usage(device).await {
|
|
drive_data.capacity_gb = capacity;
|
|
drive_data.used_gb = usage;
|
|
}
|
|
match drive_data.health_status.as_str() {
|
|
"PASSED" => healthy += 1,
|
|
"FAILED" => {
|
|
critical += 1;
|
|
issues.push(format!("{}: SMART status FAILED", device));
|
|
}
|
|
_ => {
|
|
warning += 1;
|
|
issues.push(format!("{}: Unknown SMART status", device));
|
|
}
|
|
}
|
|
drives.push(drive_data);
|
|
}
|
|
Err(e) => {
|
|
warning += 1;
|
|
issues.push(format!("{}: {}", device, e));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Get disk usage information
|
|
let disk_usage = self.get_disk_usage().await?;
|
|
|
|
let status = if critical > 0 {
|
|
"critical"
|
|
} else if warning > 0 {
|
|
"warning"
|
|
} else {
|
|
"ok"
|
|
};
|
|
|
|
let smart_metrics = json!({
|
|
"status": status,
|
|
"drives": drives,
|
|
"summary": {
|
|
"healthy": healthy,
|
|
"warning": warning,
|
|
"critical": critical,
|
|
"capacity_total_gb": disk_usage.total_gb,
|
|
"capacity_used_gb": disk_usage.used_gb,
|
|
"capacity_available_gb": disk_usage.available_gb
|
|
},
|
|
"issues": issues,
|
|
"timestamp": Utc::now()
|
|
});
|
|
|
|
Ok(CollectorOutput {
|
|
agent_type: AgentType::Smart,
|
|
data: smart_metrics,
|
|
})
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize)]
|
|
struct SmartDeviceData {
|
|
name: String,
|
|
temperature_c: f32,
|
|
wear_level: f32,
|
|
power_on_hours: u64,
|
|
available_spare: f32,
|
|
health_status: String,
|
|
capacity_gb: Option<f32>,
|
|
used_gb: Option<f32>,
|
|
#[serde(default)]
|
|
description: Option<Vec<String>>,
|
|
}
|
|
|
|
impl SmartDeviceData {
|
|
fn from_smartctl_output(device: &str, output: SmartCtlOutput) -> Self {
|
|
let temperature_c = output.temperature.and_then(|t| t.current).unwrap_or(0.0);
|
|
|
|
let wear_level = output
|
|
.nvme_smart_health_information_log
|
|
.as_ref()
|
|
.and_then(|nvme| nvme.percentage_used)
|
|
.unwrap_or(0.0);
|
|
|
|
let power_on_hours = output.power_on_time.and_then(|p| p.hours).unwrap_or(0);
|
|
|
|
let available_spare = output
|
|
.nvme_smart_health_information_log
|
|
.as_ref()
|
|
.and_then(|nvme| nvme.available_spare)
|
|
.unwrap_or(100.0);
|
|
|
|
let health_status = output
|
|
.smart_status
|
|
.and_then(|s| s.passed)
|
|
.map(|passed| {
|
|
if passed {
|
|
"PASSED".to_string()
|
|
} else {
|
|
"FAILED".to_string()
|
|
}
|
|
})
|
|
.unwrap_or_else(|| "UNKNOWN".to_string());
|
|
|
|
// Build SMART description with key metrics
|
|
let mut smart_details = Vec::new();
|
|
if available_spare > 0.0 {
|
|
smart_details.push(format!("Spare: {}%", available_spare as u32));
|
|
}
|
|
if power_on_hours > 0 {
|
|
smart_details.push(format!("Hours: {}", power_on_hours));
|
|
}
|
|
|
|
let description = if smart_details.is_empty() {
|
|
None
|
|
} else {
|
|
Some(vec![smart_details.join(", ")])
|
|
};
|
|
|
|
Self {
|
|
name: device.to_string(),
|
|
temperature_c,
|
|
wear_level,
|
|
power_on_hours,
|
|
available_spare,
|
|
health_status,
|
|
capacity_gb: None, // Will be set later by the collector
|
|
used_gb: None, // Will be set later by the collector
|
|
description,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Root filesystem usage as parsed from `df -BG --output=size,used,avail /`.
#[derive(Debug, Clone)]
struct DiskUsage {
    /// `size` column of the `df` output.
    total_gb: f32,
    /// `used` column of the `df` output.
    used_gb: f32,
    /// `avail` column of the `df` output.
    available_gb: f32,
}
|
|
|
|
// Minimal smartctl JSON output structure - only the fields we need
|
|
#[derive(Debug, Deserialize)]
|
|
struct SmartCtlOutput {
|
|
temperature: Option<Temperature>,
|
|
power_on_time: Option<PowerOnTime>,
|
|
smart_status: Option<SmartStatus>,
|
|
nvme_smart_health_information_log: Option<NvmeSmartLog>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
struct Temperature {
|
|
current: Option<f32>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
struct PowerOnTime {
|
|
hours: Option<u64>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
struct SmartStatus {
|
|
passed: Option<bool>,
|
|
}
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
struct NvmeSmartLog {
|
|
percentage_used: Option<f32>,
|
|
available_spare: Option<f32>,
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `parse_lsblk_size` for gigabyte, terabyte and megabyte inputs
    /// (including the comma decimal separator) and for rejected inputs.
    #[test]
    fn test_parse_lsblk_size() {
        let collector = SmartCollector::new(true, 5000, vec![]);

        // (input, expected size in gigabytes)
        let accepted: [(&str, f32); 5] = [
            ("953,9G", 953.9),
            ("1G", 1.0),
            ("1T", 1024.0),
            ("2,5T", 2560.0),
            ("512M", 0.5),
        ];
        for &(input, expected_gb) in accepted.iter() {
            let parsed_gb = collector.parse_lsblk_size(input).unwrap();
            assert!((parsed_gb - expected_gb).abs() < 0.1);
        }

        // Missing number and unknown unit must both be rejected.
        let rejected = ["invalid", "1X"];
        for &input in rejected.iter() {
            assert!(collector.parse_lsblk_size(input).is_err());
        }
    }
}
|