Restore missing smart collector implementation
Some checks failed
Build and Release / build-and-release (push) Failing after 1m24s

- Rewrite smart collector to match current architecture
- Add back to mod.rs exports
- Fixes infinite smartctl loop issue
- Uses simple health and temperature monitoring
This commit is contained in:
Christoffer Martinsson 2025-10-25 16:59:09 +02:00
parent b310206f1f
commit 83cb43bcf1
2 changed files with 192 additions and 0 deletions

View File

@@ -8,6 +8,7 @@ pub mod disk;
pub mod error;
pub mod memory;
pub mod nixos;
pub mod smart;
pub mod systemd;
pub use error::CollectorError;

View File

@@ -0,0 +1,191 @@
use async_trait::async_trait;
use cm_dashboard_shared::{Metric, MetricStatus, MetricValue};
use std::process::Stdio;
use tokio::process::Command;
use tracing::{debug, warn};
use super::{Collector, CollectorError};
/// Collector that reports SMART health and temperature metrics for the
/// storage devices of the local machine.
///
/// The stored hostname is copied into every `Metric` the collector emits.
#[derive(Debug)]
pub struct SmartCollector {
    /// Host name attached to each emitted metric.
    hostname: String,
}
impl SmartCollector {
pub fn new(hostname: String) -> Self {
Self { hostname }
}
/// Get list of storage devices to monitor
async fn get_devices(&self) -> Result<Vec<String>, CollectorError> {
let output = Command::new("lsblk")
.args(["-d", "-n", "-o", "NAME,TYPE"])
.stdout(Stdio::piped())
.stderr(Stdio::null())
.output()
.await
.map_err(|e| CollectorError::Collection(e.to_string()))?;
if !output.status.success() {
return Ok(Vec::new()); // Return empty if lsblk fails
}
let stdout = String::from_utf8_lossy(&output.stdout);
let mut devices = Vec::new();
for line in stdout.lines() {
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() >= 2 && parts[1] == "disk" {
let device_name = parts[0];
if device_name.starts_with("nvme") || device_name.starts_with("sd") {
devices.push(format!("/dev/{}", device_name));
}
}
}
Ok(devices)
}
/// Collect SMART data for a single device.
///
/// Runs `sudo smartctl -H -A <device>` (health verdict and attributes only)
/// and passes its stdout to `parse_smart_output`.
///
/// smartctl's exit status is a bit mask rather than a plain success flag.
/// Only the low three bits mean that no usable report was produced; the
/// higher bits describe the disk itself (failing health check, attributes
/// at or below threshold, logged errors). Those higher bits must not stop
/// parsing, otherwise a failing disk would never be reported as failed.
///
/// # Errors
///
/// Returns `CollectorError::Collection` when the command cannot be run.
/// A fatal smartctl exit status (or termination by signal) is logged and
/// yields an empty metric list instead of an error.
async fn collect_device_smart(&self, device: &str) -> Result<Vec<Metric>, CollectorError> {
    // bit 0: command line did not parse, bit 1: device could not be opened,
    // bit 2: a SMART/ATA command failed or a checksum error was detected.
    const FATAL_EXIT_BITS: i32 = 0b0000_0111;

    debug!("Collecting SMART data for device: {}", device);

    let output = Command::new("sudo")
        .args(["smartctl", "-H", "-A", device]) // Health and attributes only
        .stdout(Stdio::piped())
        .stderr(Stdio::null())
        .output()
        .await
        .map_err(|e| CollectorError::Collection(e.to_string()))?;

    match output.status.code() {
        // Report is usable; disk-state bits (3..=7) may still be set.
        Some(code) if (code & FATAL_EXIT_BITS) == 0 => {
            let stdout = String::from_utf8_lossy(&output.stdout);
            self.parse_smart_output(device, &stdout)
        }
        // Fatal bit set, or no exit code at all (killed by a signal).
        _ => {
            warn!("smartctl failed for device: {}", device);
            Ok(Vec::new())
        }
    }
}
/// Parse smartctl output and create metrics.
///
/// Scans `output` line by line for the overall health verdict and for the
/// first line that yields a temperature, then builds:
///
/// * `smart_health_<device>` — always emitted; `PASSED` with status `Ok`
///   unless a health line reports a failure, in which case `FAILED` with
///   status `Critical`.
/// * `smart_temperature_<device>` — emitted only when a temperature was
///   found; `Critical` at 70 and above, `Warning` at 60 and above,
///   otherwise `Ok`.
///
/// `<device>` is `device` with any leading `/dev/` removed. This function
/// currently never returns `Err`.
///
/// NOTE(review): when `output` contains no recognised health line the
/// health metric still reads `PASSED` — confirm that this default is wanted.
fn parse_smart_output(&self, device: &str, output: &str) -> Result<Vec<Metric>, CollectorError> {
    let device_name = device.trim_start_matches("/dev/");
    let mut health_ok = true;
    let mut temperature: Option<f64> = None;

    for line in output.lines().map(str::trim) {
        // Parse health status
        if line.contains("SMART overall-health self-assessment") {
            // ATA and NVMe devices: "... test result: PASSED" / "FAILED!".
            if line.contains("FAILED") {
                health_ok = false;
            }
        } else if let Some(verdict) = line.strip_prefix("SMART Health Status:") {
            // SCSI/SAS devices report "SMART Health Status: OK"; any other
            // text after the colon is a failure prediction.
            if !verdict.trim_start().starts_with("OK") {
                health_ok = false;
            }
        }

        // Parse temperature from various formats; the first hit wins.
        // "Airflow_Temperature" lines are covered by the same substring.
        if temperature.is_none() && line.contains("Temperature") {
            temperature = self.extract_temperature(line);
        }
    }

    let mut metrics = Vec::with_capacity(2);

    // Create health metric
    let (health_text, health_status) = if health_ok {
        ("PASSED", MetricStatus::Ok)
    } else {
        ("FAILED", MetricStatus::Critical)
    };
    metrics.push(Metric {
        hostname: self.hostname.clone(),
        metric_name: format!("smart_health_{}", device_name),
        metric_value: MetricValue::String(health_text.to_string()),
        status: health_status,
        timestamp: chrono::Utc::now(),
        tags: vec![
            ("device".to_string(), device_name.to_string()),
            ("type".to_string(), "health".to_string()),
        ],
    });

    // Create temperature metric if available
    if let Some(temp) = temperature {
        let temp_status = if temp >= 70.0 {
            MetricStatus::Critical
        } else if temp >= 60.0 {
            MetricStatus::Warning
        } else {
            MetricStatus::Ok
        };
        metrics.push(Metric {
            hostname: self.hostname.clone(),
            metric_name: format!("smart_temperature_{}", device_name),
            metric_value: MetricValue::Float(temp),
            status: temp_status,
            timestamp: chrono::Utc::now(),
            tags: vec![
                ("device".to_string(), device_name.to_string()),
                ("type".to_string(), "temperature".to_string()),
                ("unit".to_string(), "celsius".to_string()),
            ],
        });
    }

    debug!("Collected {} SMART metrics for {}", metrics.len(), device);
    Ok(metrics)
}
/// Extract temperature value from smartctl output line.
///
/// Two line shapes are understood, tried in this order:
///
/// 1. A number directly followed by a unit token (`Celsius`, `°C` or `C`,
///    case-insensitive), e.g. `Temperature:   35 Celsius`.
/// 2. An attribute-table row: at least ten whitespace-separated columns,
///    a numeric attribute ID in the first column and `Temperature` in the
///    line. The reading is taken from the tenth column (the raw value),
///    not from the normalized value/worst/threshold columns before it.
///
/// Returns `None` when neither shape matches or the number is not strictly
/// between 0 and 150.
fn extract_temperature(&self, line: &str) -> Option<f64> {
    // Anything outside this open range is not accepted as a temperature.
    let plausible = |value: &f64| *value > 0.0 && *value < 150.0;

    let parts: Vec<&str> = line.split_whitespace().collect();

    // Shape 1: "<number> <unit>" anywhere in the line.
    let with_unit = parts.windows(2).find_map(|pair| {
        let value = pair[0].parse::<f64>().ok().filter(plausible)?;
        let unit = pair[1].to_lowercase();
        let is_unit = unit.contains("celsius") || unit.contains("°c") || unit == "c";
        is_unit.then(|| value)
    });
    if with_unit.is_some() {
        return with_unit;
    }

    // Shape 2: attribute row; the raw value is the tenth column. Taking the
    // first in-range number instead would return the normalized value.
    let is_attribute_row = parts.len() >= 10
        && parts[0].parse::<u32>().is_ok()
        && line.contains("Temperature");
    if is_attribute_row {
        return parts[9].parse::<f64>().ok().filter(plausible);
    }

    None
}
}
#[async_trait]
impl Collector for SmartCollector {
    /// Gather SMART metrics for every monitored device, one device at a time.
    ///
    /// Fails only if the device list cannot be obtained. A failure on an
    /// individual device is logged and that device is skipped, so the
    /// remaining devices are still reported.
    async fn collect(&mut self) -> Result<Vec<Metric>, CollectorError> {
        debug!("Starting SMART data collection");

        let device_paths = self.get_devices().await?;
        let mut collected: Vec<Metric> = Vec::new();

        for path in device_paths {
            match self.collect_device_smart(&path).await {
                Ok(device_metrics) => collected.extend(device_metrics),
                // Keep going: one unreadable device must not hide the others.
                Err(err) => warn!("Failed to collect SMART data for {}: {}", path, err),
            }
        }

        debug!("Collected {} total SMART metrics", collected.len());
        Ok(collected)
    }
}