diff --git a/agent/src/collectors/mod.rs b/agent/src/collectors/mod.rs
index 839525e..06b56d0 100644
--- a/agent/src/collectors/mod.rs
+++ b/agent/src/collectors/mod.rs
@@ -8,6 +8,7 @@ pub mod disk;
 pub mod error;
 pub mod memory;
 pub mod nixos;
+pub mod smart;
 pub mod systemd;
 
 pub use error::CollectorError;
diff --git a/agent/src/collectors/smart.rs b/agent/src/collectors/smart.rs
new file mode 100644
index 0000000..6e0de94
--- /dev/null
+++ b/agent/src/collectors/smart.rs
@@ -0,0 +1,285 @@
+//! SMART health and temperature collector.
+//!
+//! Discovers disks with `lsblk`, queries each one with `sudo smartctl -H -A`
+//! and turns the report into one health metric and, when a reading is
+//! present, one temperature metric per device.
+
+use async_trait::async_trait;
+use cm_dashboard_shared::{Metric, MetricStatus, MetricValue};
+use std::process::Stdio;
+use tokio::process::Command;
+use tracing::{debug, warn};
+
+use super::{Collector, CollectorError};
+
+/// Temperature in degrees Celsius.
+///
+/// NOTE(review): the generic arguments were missing from the reviewed diff;
+/// `f32` is assumed to be the payload of `MetricValue::Float` - confirm.
+type Celsius = f32;
+
+/// `smartctl` exit-status bits meaning that no usable report was produced:
+/// bit 0 (command line did not parse) and bit 1 (device open failed).
+/// The remaining bits describe the drive and are set for unhealthy disks.
+const SMARTCTL_FATAL_BITS: i32 = 0b0000_0011;
+
+/// Temperature at or above which the metric status becomes `Warning`.
+const TEMP_WARNING: Celsius = 60.0;
+
+/// Temperature at or above which the metric status becomes `Critical`.
+const TEMP_CRITICAL: Celsius = 70.0;
+
+/// Zero-based index of the `RAW_VALUE` column in an ATA attribute row.
+const RAW_VALUE_COLUMN: usize = 9;
+
+/// Collects SMART health and temperature metrics for local disks.
+pub struct SmartCollector {
+    hostname: String,
+}
+
+impl SmartCollector {
+    /// Creates a collector whose metrics are attributed to `hostname`.
+    pub fn new(hostname: String) -> Self {
+        Self { hostname }
+    }
+
+    /// Lists the `/dev/...` paths of the disks to monitor.
+    ///
+    /// Only whole disks (`TYPE == "disk"`) named `nvme*` or `sd*` are kept.
+    /// A failing `lsblk` run is logged and yields an empty list; an error is
+    /// returned only when `lsblk` cannot be spawned.
+    async fn get_devices(&self) -> Result<Vec<String>, CollectorError> {
+        let output = Command::new("lsblk")
+            .args(["-d", "-n", "-o", "NAME,TYPE"])
+            .stdout(Stdio::piped())
+            .stderr(Stdio::null())
+            .output()
+            .await
+            .map_err(|e| CollectorError::Collection(e.to_string()))?;
+
+        if !output.status.success() {
+            warn!("lsblk failed ({}), no devices to monitor", output.status);
+            return Ok(Vec::new());
+        }
+
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        let devices = stdout
+            .lines()
+            .filter_map(|line| {
+                let mut columns = line.split_whitespace();
+                Some((columns.next()?, columns.next()?))
+            })
+            .filter(|(name, kind)| {
+                *kind == "disk" && (name.starts_with("nvme") || name.starts_with("sd"))
+            })
+            .map(|(name, _)| format!("/dev/{}", name))
+            .collect();
+
+        Ok(devices)
+    }
+
+    /// Runs `smartctl` for `device` and parses its report into metrics.
+    ///
+    /// The exit status of `smartctl` is a bit mask, so a non-zero status does
+    /// not by itself mean the command failed: a failing disk sets bit 3 while
+    /// still printing a complete report. The output is discarded (logged,
+    /// empty result) only when there is no exit code or one of the
+    /// `SMARTCTL_FATAL_BITS` is set.
+    async fn collect_device_smart(&self, device: &str) -> Result<Vec<Metric>, CollectorError> {
+        debug!("Collecting SMART data for device: {}", device);
+
+        let output = Command::new("sudo")
+            .args(["smartctl", "-H", "-A", device]) // Health and attributes only
+            .stdout(Stdio::piped())
+            .stderr(Stdio::null())
+            .output()
+            .await
+            .map_err(|e| CollectorError::Collection(e.to_string()))?;
+
+        match output.status.code() {
+            Some(code) if (code & SMARTCTL_FATAL_BITS) == 0 => {
+                if code != 0 {
+                    debug!("smartctl exit status {} for device: {}", code, device);
+                }
+            }
+            code => {
+                warn!("smartctl failed for device: {} (exit code: {:?})", device, code);
+                return Ok(Vec::new());
+            }
+        }
+
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        self.parse_smart_output(device, &stdout)
+    }
+
+    /// Builds metrics from the text printed by `smartctl -H -A`.
+    ///
+    /// The health metric is emitted only when the report contains a verdict
+    /// (ATA/NVMe "SMART overall-health self-assessment" or SCSI "SMART Health
+    /// Status"), so a report without one is never presented as `PASSED`.
+    /// The temperature metric uses the first line that yields a reading.
+    fn parse_smart_output(
+        &self,
+        device: &str,
+        output: &str,
+    ) -> Result<Vec<Metric>, CollectorError> {
+        let device_name = device.trim_start_matches("/dev/");
+
+        let mut health_ok: Option<bool> = None;
+        let mut temperature: Option<Celsius> = None;
+
+        for line in output.lines().map(str::trim) {
+            let verdict = if line.contains("SMART overall-health self-assessment") {
+                Some(!line.contains("FAILED"))
+            } else {
+                line.strip_prefix("SMART Health Status:")
+                    .map(|status| status.trim() == "OK")
+            };
+            if let Some(passed) = verdict {
+                // A failed verdict is sticky: later lines cannot clear it.
+                health_ok = Some(passed && health_ok.unwrap_or(true));
+            }
+
+            // "Airflow_Temperature_Cel" rows are matched by this test as well.
+            if temperature.is_none() && line.contains("Temperature") {
+                temperature = self.extract_temperature(line);
+            }
+        }
+
+        let timestamp = chrono::Utc::now();
+        let mut metrics = Vec::with_capacity(2);
+
+        if let Some(passed) = health_ok {
+            let (text, status) = if passed {
+                ("PASSED", MetricStatus::Ok)
+            } else {
+                ("FAILED", MetricStatus::Critical)
+            };
+
+            metrics.push(Metric {
+                hostname: self.hostname.clone(),
+                metric_name: format!("smart_health_{}", device_name),
+                metric_value: MetricValue::String(text.to_string()),
+                status,
+                timestamp,
+                tags: vec![
+                    ("device".to_string(), device_name.to_string()),
+                    ("type".to_string(), "health".to_string()),
+                ],
+            });
+        } else {
+            warn!("No SMART health verdict in smartctl output for {}", device);
+        }
+
+        if let Some(temp) = temperature {
+            let status = if temp >= TEMP_CRITICAL {
+                MetricStatus::Critical
+            } else if temp >= TEMP_WARNING {
+                MetricStatus::Warning
+            } else {
+                MetricStatus::Ok
+            };
+
+            metrics.push(Metric {
+                hostname: self.hostname.clone(),
+                metric_name: format!("smart_temperature_{}", device_name),
+                metric_value: MetricValue::Float(temp),
+                status,
+                timestamp,
+                tags: vec![
+                    ("device".to_string(), device_name.to_string()),
+                    ("type".to_string(), "temperature".to_string()),
+                    ("unit".to_string(), "celsius".to_string()),
+                ],
+            });
+        }
+
+        debug!("Collected {} SMART metrics for {}", metrics.len(), device);
+        Ok(metrics)
+    }
+
+    /// Extracts a temperature reading from one line of `smartctl` output.
+    ///
+    /// Two layouts are understood:
+    /// * ATA attribute rows such as
+    ///   `194 Temperature_Celsius 0x0022 067 055 000 Old_age Always - 33`:
+    ///   the reading is the first token of `RAW_VALUE`; the `VALUE`, `WORST`
+    ///   and `THRESH` columns are normalized scores, not degrees.
+    /// * Labelled lines such as `Temperature: 35 Celsius` or
+    ///   `Current Drive Temperature: 35 C`: the number preceding the unit.
+    ///
+    /// Returns `None` unless a value strictly between 0 and 150 is found.
+    fn extract_temperature(&self, line: &str) -> Option<Celsius> {
+        let is_plausible = |temp: &Celsius| *temp > 0.0 && *temp < 150.0;
+        let parts: Vec<&str> = line.split_whitespace().collect();
+
+        // Attribute rows start with the numeric attribute ID (1-255).
+        let is_attribute_row = parts.len() > RAW_VALUE_COLUMN && parts[0].parse::<u8>().is_ok();
+        if is_attribute_row {
+            return parts[RAW_VALUE_COLUMN]
+                .parse::<Celsius>()
+                .ok()
+                .filter(is_plausible);
+        }
+
+        parts.windows(2).find_map(|pair| {
+            let temp = pair[0].parse::<Celsius>().ok().filter(is_plausible)?;
+            let unit = pair[1].to_lowercase();
+            if unit.contains("celsius") || unit.contains("°c") || unit == "c" {
+                Some(temp)
+            } else {
+                None
+            }
+        })
+    }
+}
+
+#[async_trait]
+impl Collector for SmartCollector {
+    /// Collects SMART metrics for every monitored disk.
+    ///
+    /// A failure on one device is logged and does not abort the others.
+    async fn collect(&mut self) -> Result<Vec<Metric>, CollectorError> {
+        debug!("Starting SMART data collection");
+
+        let devices = self.get_devices().await?;
+        let mut all_metrics = Vec::new();
+
+        for device in devices {
+            match self.collect_device_smart(&device).await {
+                Ok(metrics) => all_metrics.extend(metrics),
+                Err(e) => {
+                    // Continue with other devices
+                    warn!("Failed to collect SMART data for {}: {}", device, e);
+                }
+            }
+        }
+
+        debug!("Collected {} total SMART metrics", all_metrics.len());
+        Ok(all_metrics)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn collector() -> SmartCollector {
+        SmartCollector::new("test-host".to_string())
+    }
+
+    #[test]
+    fn ata_attribute_row_uses_raw_value() {
+        let line =
+            "194 Temperature_Celsius 0x0022 067 055 000 Old_age Always - 33 (Min/Max 20/45)";
+        assert_eq!(collector().extract_temperature(line), Some(33.0));
+    }
+
+    #[test]
+    fn labelled_line_uses_number_before_unit() {
+        let smart = collector();
+        assert_eq!(smart.extract_temperature("Temperature: 35 Celsius"), Some(35.0));
+        assert_eq!(smart.extract_temperature("Current Drive Temperature: 41 C"), Some(41.0));
+        assert_eq!(smart.extract_temperature("Warning Temperature Time: 0"), None);
+    }
+}