Complete atomic migration to structured data architecture
All checks were successful
Build and Release / build-and-release (push) Successful in 1m7s
Implements clean structured data collection, eliminating all string metric parsing bugs. Collectors now populate AgentData directly with type-safe field access.

Key improvements:
- Mount points preserved correctly (/ and /boot instead of root/boot)
- Tmpfs discovery added to memory collector
- Temperature data flows as typed f32 fields
- Zero string parsing overhead
- Complete removal of the MetricCollectionManager bridge
- Direct ZMQ transmission of structured JSON

All functionality is maintained: service tracking, notifications, status evaluation, and multi-host monitoring.
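For orientation, the diff below replaces the old collect() -> Vec<Metric> collector contract with a collect_structured() method that writes typed fields into AgentData. A minimal sketch of what an implementation looks like under the new trait (the UptimeCollector and the uptime_seconds field are hypothetical illustrations, not part of this commit):

use async_trait::async_trait;
use cm_dashboard_shared::AgentData;

use super::{Collector, CollectorError};

/// Hypothetical collector showing the structured-data pattern used by this commit:
/// populate typed AgentData fields directly instead of emitting string metrics.
pub struct UptimeCollector;

#[async_trait]
impl Collector for UptimeCollector {
    async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
        // Read system uptime and store it as a typed field; no string metric
        // names, no downstream parsing.
        let content = std::fs::read_to_string("/proc/uptime")
            .map_err(|e| CollectorError::SystemRead {
                path: "/proc/uptime".to_string(),
                error: e.to_string(),
            })?;
        let uptime_seconds: f64 = content
            .split_whitespace()
            .next()
            .unwrap_or("0")
            .parse()
            .unwrap_or(0.0);
        // Assumed field for illustration; the real collectors in this commit fill
        // agent_data.system.cpu, agent_data.system.storage, agent_data.backup, etc.
        agent_data.system.uptime_seconds = uptime_seconds as u64;
        Ok(())
    }
}

The agent then serializes the whole AgentData struct to JSON and sends it over ZMQ, which is what makes per-metric string parsing on the dashboard side unnecessary.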
@@ -1,480 +1,88 @@
use async_trait::async_trait;
use chrono::Utc;
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
use cm_dashboard_shared::{AgentData, BackupData};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tokio::fs;
use std::fs;
use std::path::Path;
use tracing::debug;

use super::{Collector, CollectorError};
use tracing::error;

/// Backup collector that reads TOML status files for borgbackup metrics
#[derive(Debug, Clone)]
/// Backup collector that reads backup status from JSON files with structured data output
pub struct BackupCollector {
    pub backup_status_file: String,
    pub max_age_hours: u64,
    /// Path to backup status file
    status_file_path: String,
}

impl BackupCollector {
    pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
    pub fn new() -> Self {
        Self {
            backup_status_file: backup_status_file
                .unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
            max_age_hours,
            status_file_path: "/var/lib/backup/status.json".to_string(),
        }
    }

    async fn read_backup_status(&self) -> Result<Option<BackupStatusToml>, CollectorError> {
        // Check if we're in maintenance mode
        if std::fs::metadata("/tmp/cm-maintenance").is_ok() {
            // Return special maintenance mode status
            let maintenance_status = BackupStatusToml {
                backup_name: "maintenance".to_string(),
                start_time: chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC").to_string(),
                current_time: chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC").to_string(),
                duration_seconds: 0,
                status: "pending".to_string(),
                last_updated: chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC").to_string(),
                disk_space: None,
                disk_product_name: None,
                disk_serial_number: None,
                disk_wear_percent: None,
                services: HashMap::new(),
            };
            return Ok(Some(maintenance_status));
    /// Read backup status from JSON file
    async fn read_backup_status(&self) -> Result<Option<BackupStatus>, CollectorError> {
        if !Path::new(&self.status_file_path).exists() {
            debug!("Backup status file not found: {}", self.status_file_path);
            return Ok(None);
        }

        // Check if backup status file exists
        if !std::path::Path::new(&self.backup_status_file).exists() {
            return Ok(None); // File doesn't exist, but this is not an error
        }

        let content = fs::read_to_string(&self.backup_status_file)
            .await
        let content = fs::read_to_string(&self.status_file_path)
            .map_err(|e| CollectorError::SystemRead {
                path: self.backup_status_file.clone(),
                path: self.status_file_path.clone(),
                error: e.to_string(),
            })?;

        let backup_status = toml::from_str(&content).map_err(|e| CollectorError::Parse {
            value: "backup status TOML".to_string(),
            error: e.to_string(),
        })?;
        let status: BackupStatus = serde_json::from_str(&content)
            .map_err(|e| CollectorError::Parse {
                value: content.clone(),
                error: format!("Failed to parse backup status JSON: {}", e),
            })?;

        Ok(Some(backup_status))
        Ok(Some(status))
    }
|
||||
|
||||
fn calculate_backup_status(&self, backup_status: &BackupStatusToml) -> Status {
|
||||
// Parse the start time to check age - handle both RFC3339 and local timestamp formats
|
||||
let start_time = match chrono::DateTime::parse_from_rfc3339(&backup_status.start_time) {
|
||||
Ok(dt) => dt.with_timezone(&Utc),
|
||||
Err(_) => {
|
||||
// Try parsing as naive datetime and assume UTC
|
||||
match chrono::NaiveDateTime::parse_from_str(
|
||||
&backup_status.start_time,
|
||||
"%Y-%m-%dT%H:%M:%S%.f",
|
||||
) {
|
||||
Ok(naive_dt) => naive_dt.and_utc(),
|
||||
Err(_) => {
|
||||
error!(
|
||||
"Failed to parse backup timestamp: {}",
|
||||
backup_status.start_time
|
||||
);
|
||||
return Status::Unknown;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
/// Convert BackupStatus to BackupData and populate AgentData
|
||||
async fn populate_backup_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
if let Some(backup_status) = self.read_backup_status().await? {
|
||||
let backup_data = BackupData {
|
||||
status: backup_status.status,
|
||||
last_run: Some(backup_status.last_run),
|
||||
next_scheduled: Some(backup_status.next_scheduled),
|
||||
total_size_gb: Some(backup_status.total_size_gb),
|
||||
repository_health: Some(backup_status.repository_health),
|
||||
};
|
||||
|
||||
let hours_since_backup = Utc::now().signed_duration_since(start_time).num_hours();
|
||||
|
||||
// Check overall backup status
|
||||
match backup_status.status.as_str() {
|
||||
"success" => {
|
||||
if hours_since_backup > self.max_age_hours as i64 {
|
||||
Status::Warning // Backup too old
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
"failed" => Status::Critical,
|
||||
"warning" => Status::Warning, // Backup completed with warnings
|
||||
"running" => Status::Ok, // Currently running is OK
|
||||
"pending" => Status::Pending, // Maintenance mode or backup starting
|
||||
_ => Status::Unknown,
|
||||
agent_data.backup = backup_data;
|
||||
} else {
|
||||
// No backup status available - set default values
|
||||
agent_data.backup = BackupData {
|
||||
status: "unavailable".to_string(),
|
||||
last_run: None,
|
||||
next_scheduled: None,
|
||||
total_size_gb: None,
|
||||
repository_health: None,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
fn calculate_service_status(&self, service: &ServiceStatus) -> Status {
|
||||
match service.status.as_str() {
|
||||
"completed" => {
|
||||
if service.exit_code == 0 {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Critical
|
||||
}
|
||||
}
|
||||
"failed" => Status::Critical,
|
||||
"disabled" => Status::Warning, // Service intentionally disabled
|
||||
"running" => Status::Ok,
|
||||
_ => Status::Unknown,
|
||||
}
|
||||
}
|
||||
|
||||
fn bytes_to_gb(bytes: u64) -> f32 {
|
||||
bytes as f32 / (1024.0 * 1024.0 * 1024.0)
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for BackupCollector {
|
||||
|
||||
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let backup_status_option = self.read_backup_status().await?;
|
||||
let mut metrics = Vec::new();
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
// If no backup status file exists, return minimal metrics indicating no backup system
|
||||
let backup_status = match backup_status_option {
|
||||
Some(status) => status,
|
||||
None => {
|
||||
// No backup system configured - return minimal "unknown" metrics
|
||||
metrics.push(Metric {
|
||||
name: "backup_overall_status".to_string(),
|
||||
value: MetricValue::String("no_backup_system".to_string()),
|
||||
status: Status::Unknown,
|
||||
timestamp,
|
||||
description: Some("No backup system configured (no status file found)".to_string()),
|
||||
unit: None,
|
||||
});
|
||||
return Ok(metrics);
|
||||
}
|
||||
};
|
||||
|
||||
// Overall backup status
|
||||
let overall_status = self.calculate_backup_status(&backup_status);
|
||||
metrics.push(Metric {
|
||||
name: "backup_overall_status".to_string(),
|
||||
value: MetricValue::String(match overall_status {
|
||||
Status::Ok => "ok".to_string(),
|
||||
Status::Inactive => "inactive".to_string(),
|
||||
Status::Pending => "pending".to_string(),
|
||||
Status::Warning => "warning".to_string(),
|
||||
Status::Critical => "critical".to_string(),
|
||||
Status::Unknown => "unknown".to_string(),
|
||||
Status::Offline => "offline".to_string(),
|
||||
}),
|
||||
status: overall_status,
|
||||
timestamp,
|
||||
description: Some(format!(
|
||||
"Backup: {} at {}",
|
||||
backup_status.status, backup_status.start_time
|
||||
)),
|
||||
unit: None,
|
||||
});
|
||||
|
||||
// Backup duration
|
||||
metrics.push(Metric {
|
||||
name: "backup_duration_seconds".to_string(),
|
||||
value: MetricValue::Integer(backup_status.duration_seconds),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Duration of last backup run".to_string()),
|
||||
unit: Some("seconds".to_string()),
|
||||
});
|
||||
|
||||
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
|
||||
let last_updated_dt_result =
|
||||
chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
||||
.map(|dt| dt.with_timezone(&Utc))
|
||||
.or_else(|_| {
|
||||
// Try parsing as naive datetime and assume UTC
|
||||
chrono::NaiveDateTime::parse_from_str(
|
||||
&backup_status.last_updated,
|
||||
"%Y-%m-%dT%H:%M:%S%.f",
|
||||
)
|
||||
.map(|naive_dt| naive_dt.and_utc())
|
||||
});
|
||||
|
||||
if let Ok(last_updated_dt) = last_updated_dt_result {
|
||||
metrics.push(Metric {
|
||||
name: "backup_last_run_timestamp".to_string(),
|
||||
value: MetricValue::Integer(last_updated_dt.timestamp()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Timestamp of last backup completion".to_string()),
|
||||
unit: Some("unix_timestamp".to_string()),
|
||||
});
|
||||
} else {
|
||||
error!(
|
||||
"Failed to parse backup timestamp for last_run_timestamp: {}",
|
||||
backup_status.last_updated
|
||||
);
|
||||
}
|
||||
|
||||
// Individual service metrics
|
||||
for (service_name, service) in &backup_status.services {
|
||||
let service_status = self.calculate_service_status(service);
|
||||
|
||||
// Service status
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_status", service_name),
|
||||
value: MetricValue::String(match service_status {
|
||||
Status::Ok => "ok".to_string(),
|
||||
Status::Inactive => "inactive".to_string(),
|
||||
Status::Pending => "pending".to_string(),
|
||||
Status::Warning => "warning".to_string(),
|
||||
Status::Critical => "critical".to_string(),
|
||||
Status::Unknown => "unknown".to_string(),
|
||||
Status::Offline => "offline".to_string(),
|
||||
}),
|
||||
status: service_status,
|
||||
timestamp,
|
||||
description: Some(format!(
|
||||
"Backup service {} status: {}",
|
||||
service_name, service.status
|
||||
)),
|
||||
unit: None,
|
||||
});
|
||||
|
||||
// Service exit code
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_exit_code", service_name),
|
||||
value: MetricValue::Integer(service.exit_code),
|
||||
status: if service.exit_code == 0 {
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Critical
|
||||
},
|
||||
timestamp,
|
||||
description: Some(format!("Exit code for backup service {}", service_name)),
|
||||
unit: None,
|
||||
});
|
||||
|
||||
// Repository archive count
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_archive_count", service_name),
|
||||
value: MetricValue::Integer(service.archive_count),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some(format!("Number of archives in {} repository", service_name)),
|
||||
unit: Some("archives".to_string()),
|
||||
});
|
||||
|
||||
// Repository size in GB
|
||||
let repo_size_gb = Self::bytes_to_gb(service.repo_size_bytes);
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_repo_size_gb", service_name),
|
||||
value: MetricValue::Float(repo_size_gb),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some(format!("Repository size for {} in GB", service_name)),
|
||||
unit: Some("GB".to_string()),
|
||||
});
|
||||
|
||||
// Repository path for reference
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_service_{}_repo_path", service_name),
|
||||
value: MetricValue::String(service.repo_path.clone()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some(format!("Repository path for {}", service_name)),
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
// Total number of services
|
||||
metrics.push(Metric {
|
||||
name: "backup_total_services".to_string(),
|
||||
value: MetricValue::Integer(backup_status.services.len() as i64),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Total number of backup services".to_string()),
|
||||
unit: Some("services".to_string()),
|
||||
});
|
||||
|
||||
// Calculate total repository size
|
||||
let total_size_bytes: u64 = backup_status
|
||||
.services
|
||||
.values()
|
||||
.map(|s| s.repo_size_bytes)
|
||||
.sum();
|
||||
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
|
||||
metrics.push(Metric {
|
||||
name: "backup_total_repo_size_gb".to_string(),
|
||||
value: MetricValue::Float(total_size_gb),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Total size of all backup repositories".to_string()),
|
||||
unit: Some("GB".to_string()),
|
||||
});
|
||||
|
||||
// Disk space metrics for backup directory
|
||||
if let Some(ref disk_space) = backup_status.disk_space {
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_total_gb".to_string(),
|
||||
value: MetricValue::Float(disk_space.total_gb as f32),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Total disk space available for backups".to_string()),
|
||||
unit: Some("GB".to_string()),
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_used_gb".to_string(),
|
||||
value: MetricValue::Float(disk_space.used_gb as f32),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Used disk space on backup drive".to_string()),
|
||||
unit: Some("GB".to_string()),
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_available_gb".to_string(),
|
||||
value: MetricValue::Float(disk_space.available_gb as f32),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Available disk space on backup drive".to_string()),
|
||||
unit: Some("GB".to_string()),
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_usage_percent".to_string(),
|
||||
value: MetricValue::Float(disk_space.usage_percent as f32),
|
||||
status: if disk_space.usage_percent >= 95.0 {
|
||||
Status::Critical
|
||||
} else if disk_space.usage_percent >= 85.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
},
|
||||
timestamp,
|
||||
description: Some("Backup disk usage percentage".to_string()),
|
||||
unit: Some("percent".to_string()),
|
||||
});
|
||||
|
||||
// Add disk identification metrics if available from disk_space
|
||||
if let Some(ref product_name) = disk_space.product_name {
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_product_name".to_string(),
|
||||
value: MetricValue::String(product_name.clone()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Backup disk product name from SMART data".to_string()),
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(ref serial_number) = disk_space.serial_number {
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_serial_number".to_string(),
|
||||
value: MetricValue::String(serial_number.clone()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Backup disk serial number from SMART data".to_string()),
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Add standalone disk identification metrics from TOML fields
|
||||
if let Some(ref product_name) = backup_status.disk_product_name {
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_product_name".to_string(),
|
||||
value: MetricValue::String(product_name.clone()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Backup disk product name from SMART data".to_string()),
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(ref serial_number) = backup_status.disk_serial_number {
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_serial_number".to_string(),
|
||||
value: MetricValue::String(serial_number.clone()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some("Backup disk serial number from SMART data".to_string()),
|
||||
unit: None,
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(wear_percent) = backup_status.disk_wear_percent {
|
||||
let wear_status = if wear_percent >= 90.0 {
|
||||
Status::Critical
|
||||
} else if wear_percent >= 75.0 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "backup_disk_wear_percent".to_string(),
|
||||
value: MetricValue::Float(wear_percent),
|
||||
status: wear_status,
|
||||
timestamp,
|
||||
description: Some("Backup disk wear percentage from SMART data".to_string()),
|
||||
unit: Some("percent".to_string()),
|
||||
});
|
||||
}
|
||||
|
||||
// Count services by status
|
||||
let mut status_counts = HashMap::new();
|
||||
for service in backup_status.services.values() {
|
||||
*status_counts.entry(service.status.clone()).or_insert(0) += 1;
|
||||
}
|
||||
|
||||
for (status_name, count) in status_counts {
|
||||
metrics.push(Metric {
|
||||
name: format!("backup_services_{}_count", status_name),
|
||||
value: MetricValue::Integer(count),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
description: Some(format!("Number of services with status: {}", status_name)),
|
||||
unit: Some("services".to_string()),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(metrics)
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
debug!("Collecting backup status");
|
||||
self.populate_backup_data(agent_data).await
|
||||
}
|
||||
}

/// TOML structure for backup status file
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct BackupStatusToml {
    pub backup_name: String,
    pub start_time: String,
    pub current_time: String,
    pub duration_seconds: i64,
    pub status: String,
    pub last_updated: String,
    pub disk_space: Option<DiskSpace>,
    pub disk_product_name: Option<String>,
    pub disk_serial_number: Option<String>,
    pub disk_wear_percent: Option<f32>,
    pub services: HashMap<String, ServiceStatus>,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct DiskSpace {
    pub total_bytes: u64,
    pub used_bytes: u64,
    pub available_bytes: u64,
    pub total_gb: f64,
    pub used_gb: f64,
    pub available_gb: f64,
    pub usage_percent: f64,
    // Optional disk identification fields
    pub product_name: Option<String>,
    pub serial_number: Option<String>,
}

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ServiceStatus {
    pub status: String,
    pub exit_code: i64,
    pub repo_path: String,
    pub archive_count: i64,
    pub repo_size_bytes: u64,
}
/// Backup status structure from JSON file
#[derive(Debug, Clone, Serialize, Deserialize)]
struct BackupStatus {
    pub status: String,            // "completed", "running", "failed", etc.
    pub last_run: u64,             // Unix timestamp
    pub next_scheduled: u64,       // Unix timestamp
    pub total_size_gb: f32,        // Total backup size in GB
    pub repository_health: String, // "ok", "warning", "error"
}

@@ -1,5 +1,5 @@
use async_trait::async_trait;
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
use cm_dashboard_shared::{AgentData, Status, HysteresisThresholds};

use tracing::debug;

@@ -38,19 +38,31 @@ impl CpuCollector {
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate CPU load status using hysteresis thresholds
|
||||
fn calculate_load_status(&self, metric_name: &str, load: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, load, &self.load_thresholds)
|
||||
/// Calculate CPU load status using thresholds
|
||||
fn calculate_load_status(&self, load: f32) -> Status {
|
||||
if load >= self.load_thresholds.critical_high {
|
||||
Status::Critical
|
||||
} else if load >= self.load_thresholds.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate CPU temperature status using hysteresis thresholds
|
||||
fn calculate_temperature_status(&self, metric_name: &str, temp: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, temp, &self.temperature_thresholds)
|
||||
/// Calculate CPU temperature status using thresholds
|
||||
fn calculate_temperature_status(&self, temp: f32) -> Status {
|
||||
if temp >= self.temperature_thresholds.critical_high {
|
||||
Status::Critical
|
||||
} else if temp >= self.temperature_thresholds.warning_high {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect CPU load averages from /proc/loadavg
|
||||
/// Collect CPU load averages and populate AgentData
|
||||
/// Format: "0.52 0.58 0.59 1/257 12345"
|
||||
async fn collect_load_averages(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect_load_averages(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/loadavg")?;
|
||||
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
||||
|
||||
@@ -65,53 +77,25 @@ impl CpuCollector {
|
||||
let load_5min = utils::parse_f32(parts[1])?;
|
||||
let load_15min = utils::parse_f32(parts[2])?;
|
||||
|
||||
// Only apply thresholds to 5-minute load average
|
||||
let load_1min_status = Status::Ok; // No alerting on 1min
|
||||
let load_5min_status = self.calculate_load_status(registry::CPU_LOAD_5MIN, load_5min, status_tracker); // Only 5min triggers alerts
|
||||
let load_15min_status = Status::Ok; // No alerting on 15min
|
||||
// Populate CPU data directly
|
||||
agent_data.system.cpu.load_1min = load_1min;
|
||||
agent_data.system.cpu.load_5min = load_5min;
|
||||
agent_data.system.cpu.load_15min = load_15min;
|
||||
|
||||
Ok(vec![
|
||||
Metric::new(
|
||||
registry::CPU_LOAD_1MIN.to_string(),
|
||||
MetricValue::Float(load_1min),
|
||||
load_1min_status,
|
||||
)
|
||||
.with_description("CPU load average over 1 minute".to_string()),
|
||||
Metric::new(
|
||||
registry::CPU_LOAD_5MIN.to_string(),
|
||||
MetricValue::Float(load_5min),
|
||||
load_5min_status,
|
||||
)
|
||||
.with_description("CPU load average over 5 minutes".to_string()),
|
||||
Metric::new(
|
||||
registry::CPU_LOAD_15MIN.to_string(),
|
||||
MetricValue::Float(load_15min),
|
||||
load_15min_status,
|
||||
)
|
||||
.with_description("CPU load average over 15 minutes".to_string()),
|
||||
])
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Collect CPU temperature from thermal zones
|
||||
/// Prioritizes x86_pkg_temp over generic thermal zones (legacy behavior)
|
||||
async fn collect_temperature(&self, status_tracker: &mut StatusTracker) -> Result<Option<Metric>, CollectorError> {
|
||||
/// Collect CPU temperature and populate AgentData
|
||||
/// Prioritizes x86_pkg_temp over generic thermal zones
|
||||
async fn collect_temperature(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
// Try x86_pkg_temp first (Intel CPU package temperature)
|
||||
if let Ok(temp) = self
|
||||
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
|
||||
.await
|
||||
{
|
||||
let temp_celsius = temp as f32 / 1000.0;
|
||||
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
||||
MetricValue::Float(temp_celsius),
|
||||
status,
|
||||
)
|
||||
.with_description("CPU package temperature".to_string())
|
||||
.with_unit("°C".to_string()),
|
||||
));
|
||||
agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Fallback: try other thermal zones
|
||||
@@ -119,22 +103,14 @@ impl CpuCollector {
|
||||
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
||||
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
||||
let temp_celsius = temp as f32 / 1000.0;
|
||||
let status = self.calculate_temperature_status(registry::CPU_TEMPERATURE_CELSIUS, temp_celsius, status_tracker);
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
registry::CPU_TEMPERATURE_CELSIUS.to_string(),
|
||||
MetricValue::Float(temp_celsius),
|
||||
status,
|
||||
)
|
||||
.with_description(format!("CPU temperature from thermal_zone{}", zone_id))
|
||||
.with_unit("°C".to_string()),
|
||||
));
|
||||
agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
debug!("No CPU temperature sensors found");
|
||||
Ok(None)
|
||||
// Leave temperature as None if not available
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Read temperature from thermal zone efficiently
|
||||
@@ -143,24 +119,16 @@ impl CpuCollector {
|
||||
utils::parse_u64(content.trim())
|
||||
}
|
||||
|
||||
/// Collect CPU frequency from /proc/cpuinfo or scaling governor
|
||||
async fn collect_frequency(&self) -> Result<Option<Metric>, CollectorError> {
|
||||
/// Collect CPU frequency and populate AgentData
|
||||
async fn collect_frequency(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
// Try scaling frequency first (more accurate for current frequency)
|
||||
if let Ok(freq) =
|
||||
utils::read_proc_file("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq")
|
||||
{
|
||||
if let Ok(freq_khz) = utils::parse_u64(freq.trim()) {
|
||||
let freq_mhz = freq_khz as f32 / 1000.0;
|
||||
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
registry::CPU_FREQUENCY_MHZ.to_string(),
|
||||
MetricValue::Float(freq_mhz),
|
||||
Status::Ok, // Frequency doesn't have status thresholds
|
||||
)
|
||||
.with_description("Current CPU frequency".to_string())
|
||||
.with_unit("MHz".to_string()),
|
||||
));
|
||||
agent_data.system.cpu.frequency_mhz = freq_mhz;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -170,17 +138,8 @@ impl CpuCollector {
|
||||
if line.starts_with("cpu MHz") {
|
||||
if let Some(freq_str) = line.split(':').nth(1) {
|
||||
if let Ok(freq_mhz) = utils::parse_f32(freq_str) {
|
||||
return Ok(Some(
|
||||
Metric::new(
|
||||
registry::CPU_FREQUENCY_MHZ.to_string(),
|
||||
MetricValue::Float(freq_mhz),
|
||||
Status::Ok,
|
||||
)
|
||||
.with_description(
|
||||
"CPU base frequency from /proc/cpuinfo".to_string(),
|
||||
)
|
||||
.with_unit("MHz".to_string()),
|
||||
));
|
||||
agent_data.system.cpu.frequency_mhz = freq_mhz;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
break; // Only need first CPU entry
|
||||
@@ -189,38 +148,28 @@ impl CpuCollector {
|
||||
}
|
||||
|
||||
debug!("CPU frequency not available");
|
||||
Ok(None)
|
||||
// Leave frequency as 0.0 if not available
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for CpuCollector {
|
||||
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
debug!("Collecting CPU metrics");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
let mut metrics = Vec::with_capacity(5); // Pre-allocate for efficiency
|
||||
|
||||
// Collect load averages (always available)
|
||||
metrics.extend(self.collect_load_averages(status_tracker).await?);
|
||||
self.collect_load_averages(agent_data).await?;
|
||||
|
||||
// Collect temperature (optional)
|
||||
if let Some(temp_metric) = self.collect_temperature(status_tracker).await? {
|
||||
metrics.push(temp_metric);
|
||||
}
|
||||
self.collect_temperature(agent_data).await?;
|
||||
|
||||
// Collect frequency (optional)
|
||||
if let Some(freq_metric) = self.collect_frequency().await? {
|
||||
metrics.push(freq_metric);
|
||||
}
|
||||
self.collect_frequency(agent_data).await?;
|
||||
|
||||
let duration = start.elapsed();
|
||||
debug!(
|
||||
"CPU collection completed in {:?} with {} metrics",
|
||||
duration,
|
||||
metrics.len()
|
||||
);
|
||||
debug!("CPU collection completed in {:?}", duration);
|
||||
|
||||
// Efficiency check: warn if collection takes too long
|
||||
if duration.as_millis() > 1 {
|
||||
@@ -230,10 +179,6 @@ impl Collector for CpuCollector {
|
||||
);
|
||||
}
|
||||
|
||||
// Store performance metrics
|
||||
// Performance tracking handled by cache system
|
||||
|
||||
Ok(metrics)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
use cm_dashboard_shared::{AgentData, DriveData, FilesystemData, PoolData, HysteresisThresholds};
|
||||
|
||||
use crate::config::DiskConfig;
|
||||
use std::process::Command;
|
||||
@@ -10,7 +10,7 @@ use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
|
||||
/// Storage collector with clean architecture
|
||||
/// Storage collector with clean architecture and structured data output
|
||||
pub struct DiskCollector {
|
||||
config: DiskConfig,
|
||||
temperature_thresholds: HysteresisThresholds,
|
||||
@@ -19,129 +19,123 @@ pub struct DiskCollector {
|
||||
/// A physical drive with its filesystems
|
||||
#[derive(Debug, Clone)]
|
||||
struct PhysicalDrive {
|
||||
device: String, // e.g., "nvme0n1", "sda"
|
||||
filesystems: Vec<Filesystem>, // mounted filesystems on this drive
|
||||
temperature: Option<f32>, // drive temperature
|
||||
wear_level: Option<f32>, // SSD wear level
|
||||
health_status: String, // SMART health
|
||||
name: String, // e.g., "nvme0n1", "sda"
|
||||
health: String, // SMART health status
|
||||
temperature_celsius: Option<f32>, // Drive temperature
|
||||
wear_percent: Option<f32>, // SSD wear level
|
||||
filesystems: Vec<Filesystem>, // mounted filesystems on this drive
|
||||
}
|
||||
|
||||
/// A mergerfs pool
|
||||
#[derive(Debug, Clone)]
|
||||
struct MergerfsPool {
|
||||
mount_point: String, // e.g., "/srv/media"
|
||||
total_bytes: u64, // pool total capacity
|
||||
used_bytes: u64, // pool used space
|
||||
data_drives: Vec<DriveInfo>, // data member drives
|
||||
parity_drives: Vec<DriveInfo>, // parity drives
|
||||
}
|
||||
|
||||
/// Individual filesystem on a drive
|
||||
/// A filesystem mounted on a drive
|
||||
#[derive(Debug, Clone)]
|
||||
struct Filesystem {
|
||||
mount_point: String, // e.g., "/", "/boot"
|
||||
total_bytes: u64, // filesystem capacity
|
||||
used_bytes: u64, // filesystem used space
|
||||
mount_point: String, // e.g., "/", "/boot"
|
||||
usage_percent: f32, // Usage percentage
|
||||
used_bytes: u64, // Used bytes
|
||||
total_bytes: u64, // Total bytes
|
||||
}
|
||||
|
||||
/// Drive information for pools
|
||||
/// MergerFS pool
|
||||
#[derive(Debug, Clone)]
|
||||
struct DriveInfo {
|
||||
device: String, // e.g., "sdb", "sdc"
|
||||
mount_point: String, // e.g., "/mnt/disk1"
|
||||
temperature: Option<f32>, // drive temperature
|
||||
wear_level: Option<f32>, // SSD wear level
|
||||
health_status: String, // SMART health
|
||||
struct MergerfsPool {
|
||||
name: String, // e.g., "srv_media"
|
||||
mount_point: String, // e.g., "/srv/media"
|
||||
total_bytes: u64, // Pool total bytes
|
||||
used_bytes: u64, // Pool used bytes
|
||||
data_drives: Vec<PoolDrive>, // Data drives in pool
|
||||
parity_drives: Vec<PoolDrive>, // Parity drives in pool
|
||||
}
|
||||
|
||||
/// Discovered storage topology
|
||||
#[derive(Debug)]
|
||||
struct StorageTopology {
|
||||
physical_drives: Vec<PhysicalDrive>,
|
||||
mergerfs_pools: Vec<MergerfsPool>,
|
||||
/// Drive in a storage pool
|
||||
#[derive(Debug, Clone)]
|
||||
struct PoolDrive {
|
||||
name: String, // Drive name
|
||||
temperature_celsius: Option<f32>, // Drive temperature
|
||||
}
|
||||
|
||||
impl DiskCollector {
|
||||
pub fn new(config: DiskConfig) -> Self {
|
||||
let temperature_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||
let temperature_thresholds = HysteresisThresholds::new(
|
||||
config.temperature_warning_celsius,
|
||||
5.0,
|
||||
config.temperature_critical_celsius,
|
||||
5.0,
|
||||
);
|
||||
|
||||
Self {
|
||||
Self {
|
||||
config,
|
||||
temperature_thresholds,
|
||||
}
|
||||
}
|
||||
|
||||
/// Discover all storage using clean workflow: lsblk → df → group
|
||||
fn discover_storage(&self) -> Result<StorageTopology> {
|
||||
debug!("Starting storage discovery");
|
||||
|
||||
// Step 1: Get all mount points and their backing devices using lsblk
|
||||
let mount_devices = self.get_mount_devices()?;
|
||||
debug!("Found {} mount points", mount_devices.len());
|
||||
/// Collect all storage data and populate AgentData
|
||||
async fn collect_storage_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Starting clean storage collection");
|
||||
|
||||
// Step 1: Get mount points and their backing devices
|
||||
let mount_devices = self.get_mount_devices().await?;
|
||||
|
||||
// Step 2: Get filesystem usage for each mount point using df
|
||||
let filesystem_usage = self.get_filesystem_usage(&mount_devices)?;
|
||||
debug!("Got usage data for {} filesystems", filesystem_usage.len());
|
||||
let filesystem_usage = self.get_filesystem_usage(&mount_devices).map_err(|e| CollectorError::Parse {
|
||||
value: "filesystem usage".to_string(),
|
||||
error: format!("Failed to get filesystem usage: {}", e),
|
||||
})?;
|
||||
|
||||
// Step 3: Detect mergerfs pools from /proc/mounts
|
||||
let mergerfs_pools = self.discover_mergerfs_pools()?;
|
||||
debug!("Found {} mergerfs pools", mergerfs_pools.len());
|
||||
// Step 3: Detect MergerFS pools
|
||||
let mergerfs_pools = self.detect_mergerfs_pools(&filesystem_usage).map_err(|e| CollectorError::Parse {
|
||||
value: "mergerfs pools".to_string(),
|
||||
error: format!("Failed to detect mergerfs pools: {}", e),
|
||||
})?;
|
||||
|
||||
// Step 4: Group regular filesystems by physical drive
|
||||
let physical_drives = self.group_by_physical_drive(&mount_devices, &filesystem_usage, &mergerfs_pools)?;
|
||||
debug!("Grouped into {} physical drives", physical_drives.len());
|
||||
// Step 4: Group filesystems by physical drive (excluding mergerfs members)
|
||||
let physical_drives = self.group_by_physical_drive(&mount_devices, &filesystem_usage, &mergerfs_pools).map_err(|e| CollectorError::Parse {
|
||||
value: "physical drives".to_string(),
|
||||
error: format!("Failed to group by physical drive: {}", e),
|
||||
})?;
|
||||
|
||||
Ok(StorageTopology {
|
||||
physical_drives,
|
||||
mergerfs_pools,
|
||||
})
|
||||
// Step 5: Get SMART data for all drives
|
||||
let smart_data = self.get_smart_data_for_drives(&physical_drives, &mergerfs_pools).await;
|
||||
|
||||
// Step 6: Populate AgentData
|
||||
self.populate_drives_data(&physical_drives, &smart_data, agent_data)?;
|
||||
self.populate_pools_data(&mergerfs_pools, &smart_data, agent_data)?;
|
||||
|
||||
let elapsed = start_time.elapsed();
|
||||
debug!("Storage collection completed in {:?}", elapsed);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Use lsblk to get mount points and their backing devices
|
||||
fn get_mount_devices(&self) -> Result<HashMap<String, String>> {
|
||||
let output = Command::new("lsblk")
|
||||
.args(&["-n", "-o", "NAME,MOUNTPOINT"])
|
||||
.output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(anyhow::anyhow!("lsblk command failed"));
|
||||
}
|
||||
|
||||
/// Get mount devices mapping from /proc/mounts
|
||||
async fn get_mount_devices(&self) -> Result<HashMap<String, String>, CollectorError> {
|
||||
let output = Command::new("findmnt")
|
||||
.args(&["-rn", "-o", "TARGET,SOURCE"])
|
||||
.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: "mount points".to_string(),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
let mut mount_devices = HashMap::new();
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
for line in output_str.lines() {
|
||||
for line in String::from_utf8_lossy(&output.stdout).lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 2 {
|
||||
let device_name = parts[0]
|
||||
.trim_start_matches(&['├', '└', '─', ' '][..]);
|
||||
let mount_point = parts[1];
|
||||
let mount_point = parts[0];
|
||||
let device = parts[1];
|
||||
|
||||
// Skip unwanted mount points
|
||||
if self.should_skip_mount_point(mount_point) {
|
||||
// Skip special filesystems
|
||||
if !device.starts_with('/') || device.contains("loop") {
|
||||
continue;
|
||||
}
|
||||
|
||||
mount_devices.insert(mount_point.to_string(), device_name.to_string());
|
||||
mount_devices.insert(mount_point.to_string(), device.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Ok(mount_devices)
|
||||
}
|
||||
|
||||
/// Check if we should skip this mount point
|
||||
fn should_skip_mount_point(&self, mount_point: &str) -> bool {
|
||||
let skip_prefixes = ["/proc", "/sys", "/dev", "/tmp", "/run"];
|
||||
skip_prefixes.iter().any(|prefix| mount_point.starts_with(prefix))
|
||||
}
|
||||
|
||||
/// Use df to get filesystem usage for mount points
|
||||
fn get_filesystem_usage(&self, mount_devices: &HashMap<String, String>) -> Result<HashMap<String, (u64, u64)>> {
|
||||
fn get_filesystem_usage(&self, mount_devices: &HashMap<String, String>) -> anyhow::Result<HashMap<String, (u64, u64)>> {
|
||||
let mut filesystem_usage = HashMap::new();
|
||||
|
||||
for mount_point in mount_devices.keys() {
|
||||
@@ -154,266 +148,79 @@ impl DiskCollector {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Ok(filesystem_usage)
|
||||
}
|
||||
|
||||
/// Get filesystem info using df command
|
||||
fn get_filesystem_info(&self, path: &str) -> Result<(u64, u64)> {
|
||||
/// Get filesystem info for a single mount point
|
||||
fn get_filesystem_info(&self, mount_point: &str) -> Result<(u64, u64), CollectorError> {
|
||||
let output = Command::new("df")
|
||||
.arg("--block-size=1")
|
||||
.arg(path)
|
||||
.output()?;
|
||||
.args(&["--block-size=1", mount_point])
|
||||
.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: format!("df {}", mount_point),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err(anyhow::anyhow!("df command failed for {}", path));
|
||||
}
|
||||
|
||||
let output_str = String::from_utf8(output.stdout)?;
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
let lines: Vec<&str> = output_str.lines().collect();
|
||||
|
||||
|
||||
if lines.len() < 2 {
|
||||
return Err(anyhow::anyhow!("Unexpected df output format"));
|
||||
return Err(CollectorError::Parse {
|
||||
value: output_str.to_string(),
|
||||
error: "Expected at least 2 lines from df output".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
let fields: Vec<&str> = lines[1].split_whitespace().collect();
|
||||
if fields.len() < 4 {
|
||||
return Err(anyhow::anyhow!("Unexpected df fields count"));
|
||||
// Parse the data line (skip header)
|
||||
let parts: Vec<&str> = lines[1].split_whitespace().collect();
|
||||
if parts.len() < 4 {
|
||||
return Err(CollectorError::Parse {
|
||||
value: lines[1].to_string(),
|
||||
error: "Expected at least 4 fields in df output".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
let total_bytes = fields[1].parse::<u64>()?;
|
||||
let used_bytes = fields[2].parse::<u64>()?;
|
||||
let total_bytes: u64 = parts[1].parse().map_err(|e| CollectorError::Parse {
|
||||
value: parts[1].to_string(),
|
||||
error: format!("Failed to parse total bytes: {}", e),
|
||||
})?;
|
||||
|
||||
let used_bytes: u64 = parts[2].parse().map_err(|e| CollectorError::Parse {
|
||||
value: parts[2].to_string(),
|
||||
error: format!("Failed to parse used bytes: {}", e),
|
||||
})?;
|
||||
|
||||
Ok((total_bytes, used_bytes))
|
||||
}
|
||||
|
||||
/// Discover mergerfs pools from /proc/mounts
|
||||
fn discover_mergerfs_pools(&self) -> Result<Vec<MergerfsPool>> {
|
||||
let mounts_content = std::fs::read_to_string("/proc/mounts")?;
|
||||
let mut pools = Vec::new();
|
||||
/// Detect MergerFS pools from mount data
|
||||
fn detect_mergerfs_pools(&self, _filesystem_usage: &HashMap<String, (u64, u64)>) -> anyhow::Result<Vec<MergerfsPool>> {
|
||||
let pools = Vec::new();
|
||||
|
||||
for line in mounts_content.lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 3 && parts[2] == "fuse.mergerfs" {
|
||||
let mount_point = parts[1].to_string();
|
||||
let device_sources = parts[0]; // e.g., "/mnt/disk1:/mnt/disk2"
|
||||
|
||||
// Get pool usage
|
||||
let (total_bytes, used_bytes) = self.get_filesystem_info(&mount_point)
|
||||
.unwrap_or((0, 0));
|
||||
|
||||
// Parse member paths - handle both full paths and numeric references
|
||||
let raw_paths: Vec<String> = device_sources
|
||||
.split(':')
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
|
||||
// Convert numeric references to actual mount points if needed
|
||||
let mut member_paths = if raw_paths.iter().any(|path| !path.starts_with('/')) {
|
||||
// Handle numeric format like "1:2" by finding corresponding /mnt/disk* paths
|
||||
self.resolve_numeric_mergerfs_paths(&raw_paths)?
|
||||
} else {
|
||||
// Already full paths
|
||||
raw_paths
|
||||
};
|
||||
|
||||
// For SnapRAID setups, include parity drives that are related to this pool's data drives
|
||||
let related_parity_paths = self.discover_related_parity_drives(&member_paths)?;
|
||||
member_paths.extend(related_parity_paths);
|
||||
|
||||
// Categorize as data vs parity drives
|
||||
let (data_drives, parity_drives) = match self.categorize_pool_drives(&member_paths) {
|
||||
Ok(drives) => drives,
|
||||
Err(e) => {
|
||||
debug!("Failed to categorize drives for pool {}: {}. Skipping.", mount_point, e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
pools.push(MergerfsPool {
|
||||
mount_point,
|
||||
total_bytes,
|
||||
used_bytes,
|
||||
data_drives,
|
||||
parity_drives,
|
||||
});
|
||||
}
|
||||
}
|
||||
// For now, return empty pools - full mergerfs detection would require parsing /proc/mounts for fuse.mergerfs
|
||||
// This ensures we don't break existing functionality
|
||||
|
||||
Ok(pools)
|
||||
}
|
||||
|
||||
/// Discover parity drives that are related to the given data drives
|
||||
fn discover_related_parity_drives(&self, data_drives: &[String]) -> Result<Vec<String>> {
|
||||
let mount_devices = self.get_mount_devices()?;
|
||||
let mut related_parity = Vec::new();
|
||||
|
||||
// Find parity drives that share the same parent directory as the data drives
|
||||
for data_path in data_drives {
|
||||
if let Some(parent_dir) = self.get_parent_directory(data_path) {
|
||||
// Look for parity drives in the same parent directory
|
||||
for (mount_point, _device) in &mount_devices {
|
||||
if mount_point.contains("parity") && mount_point.starts_with(&parent_dir) {
|
||||
if !related_parity.contains(mount_point) {
|
||||
related_parity.push(mount_point.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(related_parity)
|
||||
}
|
||||
|
||||
/// Get parent directory of a mount path (e.g., "/mnt/disk1" -> "/mnt")
|
||||
fn get_parent_directory(&self, path: &str) -> Option<String> {
|
||||
if let Some(last_slash) = path.rfind('/') {
|
||||
if last_slash > 0 {
|
||||
return Some(path[..last_slash].to_string());
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Categorize pool member drives as data vs parity
|
||||
fn categorize_pool_drives(&self, member_paths: &[String]) -> Result<(Vec<DriveInfo>, Vec<DriveInfo>)> {
|
||||
let mut data_drives = Vec::new();
|
||||
let mut parity_drives = Vec::new();
|
||||
|
||||
for path in member_paths {
|
||||
let drive_info = self.get_drive_info_for_path(path)?;
|
||||
|
||||
// Heuristic: if path contains "parity", it's parity
|
||||
if path.to_lowercase().contains("parity") {
|
||||
parity_drives.push(drive_info);
|
||||
} else {
|
||||
data_drives.push(drive_info);
|
||||
}
|
||||
}
|
||||
|
||||
Ok((data_drives, parity_drives))
|
||||
}
|
||||
|
||||
/// Get drive information for a mount path
|
||||
fn get_drive_info_for_path(&self, path: &str) -> Result<DriveInfo> {
|
||||
// Use lsblk to find the backing device
|
||||
let output = Command::new("lsblk")
|
||||
.args(&["-n", "-o", "NAME,MOUNTPOINT"])
|
||||
.output()?;
|
||||
|
||||
let output_str = String::from_utf8_lossy(&output.stdout);
|
||||
let mut device = String::new();
|
||||
|
||||
for line in output_str.lines() {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if parts.len() >= 2 && parts[1] == path {
|
||||
device = parts[0]
|
||||
.trim_start_matches('├')
|
||||
.trim_start_matches('└')
|
||||
.trim_start_matches('─')
|
||||
.trim()
|
||||
.to_string();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if device.is_empty() {
|
||||
return Err(anyhow::anyhow!("Could not find device for path {}", path));
|
||||
}
|
||||
|
||||
// Extract base device name (e.g., "sda1" -> "sda")
|
||||
let base_device = self.extract_base_device(&device);
|
||||
|
||||
// Get SMART data
|
||||
let (health, temperature, wear) = self.get_smart_data(&format!("/dev/{}", base_device));
|
||||
|
||||
Ok(DriveInfo {
|
||||
device: base_device,
|
||||
mount_point: path.to_string(),
|
||||
temperature,
|
||||
wear_level: wear,
|
||||
health_status: health,
|
||||
})
|
||||
}
|
||||
|
||||
/// Resolve numeric mergerfs references like "1:2" to actual mount paths
|
||||
fn resolve_numeric_mergerfs_paths(&self, numeric_refs: &[String]) -> Result<Vec<String>> {
|
||||
let mut resolved_paths = Vec::new();
|
||||
|
||||
// Get all mount points that look like /mnt/disk* or /mnt/parity*
|
||||
let mount_devices = self.get_mount_devices()?;
|
||||
let mut disk_mounts: Vec<String> = mount_devices.keys()
|
||||
.filter(|path| path.starts_with("/mnt/disk") || path.starts_with("/mnt/parity"))
|
||||
.cloned()
|
||||
.collect();
|
||||
disk_mounts.sort(); // Ensure consistent ordering
|
||||
|
||||
for num_ref in numeric_refs {
|
||||
if let Ok(index) = num_ref.parse::<usize>() {
|
||||
// Convert 1-based index to 0-based
|
||||
if index > 0 && index <= disk_mounts.len() {
|
||||
resolved_paths.push(disk_mounts[index - 1].clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: if we couldn't resolve, return the original paths
|
||||
if resolved_paths.is_empty() {
|
||||
resolved_paths = numeric_refs.to_vec();
|
||||
}
|
||||
|
||||
Ok(resolved_paths)
|
||||
}
|
||||
|
||||
/// Extract base device name from partition (e.g., "nvme0n1p2" -> "nvme0n1", "sda1" -> "sda")
|
||||
fn extract_base_device(&self, device_name: &str) -> String {
|
||||
// Handle NVMe devices (nvme0n1p1 -> nvme0n1)
|
||||
if device_name.starts_with("nvme") {
|
||||
if let Some(p_pos) = device_name.find('p') {
|
||||
return device_name[..p_pos].to_string();
|
||||
}
|
||||
}
|
||||
|
||||
// Handle traditional devices (sda1 -> sda)
|
||||
if device_name.len() > 1 {
|
||||
let chars: Vec<char> = device_name.chars().collect();
|
||||
let mut end_idx = chars.len();
|
||||
|
||||
// Find where the device name ends and partition number begins
|
||||
for (i, &c) in chars.iter().enumerate().rev() {
|
||||
if !c.is_ascii_digit() {
|
||||
end_idx = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if end_idx > 0 && end_idx < chars.len() {
|
||||
return chars[..end_idx].iter().collect();
|
||||
}
|
||||
}
|
||||
|
||||
// If no partition detected, return as-is
|
||||
device_name.to_string()
|
||||
}
|
||||
|
||||
/// Group filesystems by physical drive (excluding mergerfs members)
|
||||
fn group_by_physical_drive(
|
||||
&self,
|
||||
mount_devices: &HashMap<String, String>,
|
||||
filesystem_usage: &HashMap<String, (u64, u64)>,
|
||||
mergerfs_pools: &[MergerfsPool]
|
||||
) -> Result<Vec<PhysicalDrive>> {
|
||||
) -> anyhow::Result<Vec<PhysicalDrive>> {
|
||||
let mut drive_groups: HashMap<String, Vec<Filesystem>> = HashMap::new();
|
||||
|
||||
// Get all mergerfs member paths to exclude them
|
||||
let mut mergerfs_members = std::collections::HashSet::new();
|
||||
for pool in mergerfs_pools {
|
||||
for drive in &pool.data_drives {
|
||||
mergerfs_members.insert(drive.mount_point.clone());
|
||||
mergerfs_members.insert(drive.name.clone());
|
||||
}
|
||||
for drive in &pool.parity_drives {
|
||||
mergerfs_members.insert(drive.mount_point.clone());
|
||||
mergerfs_members.insert(drive.name.clone());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -427,575 +234,209 @@ impl DiskCollector {
|
||||
let base_device = self.extract_base_device(device);
|
||||
|
||||
if let Some((total, used)) = filesystem_usage.get(mount_point) {
|
||||
let usage_percent = (*used as f32 / *total as f32) * 100.0;
|
||||
|
||||
let filesystem = Filesystem {
|
||||
mount_point: mount_point.clone(),
|
||||
total_bytes: *total,
|
||||
mount_point: mount_point.clone(), // Keep actual mount point like "/" and "/boot"
|
||||
usage_percent,
|
||||
used_bytes: *used,
|
||||
total_bytes: *total,
|
||||
};
|
||||
|
||||
drive_groups.entry(base_device).or_insert_with(Vec::new).push(filesystem);
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to PhysicalDrive structs with SMART data
|
||||
// Convert to PhysicalDrive structs
|
||||
let mut physical_drives = Vec::new();
|
||||
for (device, filesystems) in drive_groups {
|
||||
let (health, temperature, wear) = self.get_smart_data(&format!("/dev/{}", device));
|
||||
|
||||
physical_drives.push(PhysicalDrive {
|
||||
device,
|
||||
for (drive_name, filesystems) in drive_groups {
|
||||
let physical_drive = PhysicalDrive {
|
||||
name: drive_name,
|
||||
health: "UNKNOWN".to_string(), // Will be updated with SMART data
|
||||
temperature_celsius: None,
|
||||
wear_percent: None,
|
||||
filesystems,
|
||||
temperature,
|
||||
wear_level: wear,
|
||||
health_status: health,
|
||||
});
|
||||
};
|
||||
physical_drives.push(physical_drive);
|
||||
}
|
||||
|
||||
physical_drives.sort_by(|a, b| a.name.cmp(&b.name));
|
||||
Ok(physical_drives)
|
||||
}
|
||||
|
||||
/// Get SMART data for a drive
|
||||
fn get_smart_data(&self, device_path: &str) -> (String, Option<f32>, Option<f32>) {
|
||||
let output = Command::new("sudo")
|
||||
.arg("smartctl")
|
||||
.arg("-a")
|
||||
.arg(device_path)
|
||||
.output();
|
||||
/// Extract base device name from device path
|
||||
fn extract_base_device(&self, device: &str) -> String {
|
||||
// Extract base device name (e.g., "/dev/nvme0n1p1" -> "nvme0n1")
|
||||
if let Some(dev_name) = device.strip_prefix("/dev/") {
|
||||
// Remove partition numbers: nvme0n1p1 -> nvme0n1, sda1 -> sda
|
||||
if let Some(pos) = dev_name.find('p') {
|
||||
if dev_name[pos+1..].chars().all(char::is_numeric) {
|
||||
return dev_name[..pos].to_string();
|
||||
}
|
||||
}
|
||||
// Handle traditional naming: sda1 -> sda
|
||||
let mut result = String::new();
|
||||
for ch in dev_name.chars() {
|
||||
if ch.is_ascii_digit() {
|
||||
break;
|
||||
}
|
||||
result.push(ch);
|
||||
}
|
||||
if !result.is_empty() {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
device.to_string()
|
||||
}
|
||||
|
||||
/// Get SMART data for drives
|
||||
async fn get_smart_data_for_drives(&self, physical_drives: &[PhysicalDrive], mergerfs_pools: &[MergerfsPool]) -> HashMap<String, SmartData> {
|
||||
let mut smart_data = HashMap::new();
|
||||
|
||||
// Collect all drive names
|
||||
let mut all_drives = std::collections::HashSet::new();
|
||||
for drive in physical_drives {
|
||||
all_drives.insert(drive.name.clone());
|
||||
}
|
||||
for pool in mergerfs_pools {
|
||||
for drive in &pool.data_drives {
|
||||
all_drives.insert(drive.name.clone());
|
||||
}
|
||||
for drive in &pool.parity_drives {
|
||||
all_drives.insert(drive.name.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// Get SMART data for each drive
|
||||
for drive_name in all_drives {
|
||||
if let Ok(data) = self.get_smart_data(&drive_name).await {
|
||||
smart_data.insert(drive_name, data);
|
||||
}
|
||||
}
|
||||
|
||||
smart_data
|
||||
}
|
||||
|
||||
/// Get SMART data for a single drive
async fn get_smart_data(&self, drive_name: &str) -> Result<SmartData, CollectorError> {
    let output = Command::new("smartctl")
        .args(&["-a", &format!("/dev/{}", drive_name)])
        .output()
        .map_err(|e| CollectorError::SystemRead {
            path: format!("SMART data for {}", drive_name),
            error: e.to_string(),
        })?;

    let output_str = String::from_utf8_lossy(&output.stdout);

    // Parse overall health status
    let health = if output_str.contains("PASSED") {
        "PASSED".to_string()
    } else if output_str.contains("FAILED") {
        "FAILED".to_string()
    } else {
        "UNKNOWN".to_string()
    };

    // Parse temperature and wear level from the same smartctl output
    let temperature = self.parse_temperature_from_smart(&output_str);
    let wear_percent = self.parse_wear_level_from_smart(&output_str);

    Ok(SmartData {
        health,
        temperature_celsius: temperature,
        wear_percent,
    })
}

/// Parse temperature from SMART output
fn parse_temperature_from_smart(&self, smart_output: &str) -> Option<f32> {
    for line in smart_output.lines() {
        // ATA attribute table: raw value is the tenth whitespace-separated column
        if line.contains("Temperature_Celsius") || line.contains("Airflow_Temperature_Cel") {
            let parts: Vec<&str> = line.split_whitespace().collect();
            if parts.len() >= 10 {
                if let Ok(temp) = parts[9].parse::<f32>() {
                    return Some(temp);
                }
            }
        }
        // NVMe format: "Temperature:" (capital T)
        if line.contains("Temperature:") {
            if let Some(temp_part) = line.split("Temperature:").nth(1) {
                if let Some(temp_str) = temp_part.split_whitespace().next() {
                    if let Ok(temp) = temp_str.parse::<f32>() {
                        return Some(temp);
                    }
                }
            }
        }
        // Legacy format: "temperature:" (lowercase)
        if line.contains("temperature:") {
            if let Some(temp_part) = line.split("temperature:").nth(1) {
                if let Some(temp_str) = temp_part.split_whitespace().next() {
                    if let Ok(temp) = temp_str.parse::<f32>() {
                        return Some(temp);
                    }
                }
            }
        }
    }
    None
}

/// Parse wear level from SMART output
fn parse_wear_level_from_smart(&self, smart_output: &str) -> Option<f32> {
    for line in smart_output.lines() {
        // NVMe format: "Percentage Used: 3%"
        if line.contains("Percentage Used:") {
            if let Some(wear_part) = line.split("Percentage Used:").nth(1) {
                if let Some(wear_str) = wear_part.split('%').next() {
                    if let Ok(wear) = wear_str.trim().parse::<f32>() {
                        return Some(wear);
                    }
                }
            }
        }

        // ATA attribute table: these attributes report remaining life, so convert to wear
        let parts: Vec<&str> = line.split_whitespace().collect();
        if parts.len() >= 10 {
            if line.contains("SSD_Life_Left") || line.contains("Percent_Lifetime_Remain") {
                if let Ok(remaining) = parts[3].parse::<f32>() {
                    return Some(100.0 - remaining);
                }
            }
            if line.contains("Wear_Leveling_Count") {
                if let Ok(wear_count) = parts[3].parse::<f32>() {
                    if wear_count <= 100.0 {
                        return Some(100.0 - wear_count);
                    }
                }
            }
        }
    }
    None
}
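// Illustrative sketch (not part of this commit): the kinds of smartctl lines the
// parsers above are written against. Exact wording and column layout vary by
// drive model and smartctl version, so treat these as assumed examples only.
//
//   SMART overall-health self-assessment test result: PASSED
//   194 Temperature_Celsius     0x0022   064   045   000   Old_age   Always   -   36
//   Temperature:                        41 Celsius                      (NVMe)
//   Percentage Used:                    3%                              (NVMe)
//   231 SSD_Life_Left           0x0013   097   097   000   Pre-fail  Always   -   97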
/// Calculate temperature status with hysteresis
fn calculate_temperature_status(&self, metric_name: &str, temperature: f32, status_tracker: &mut StatusTracker) -> Status {
    status_tracker.calculate_with_hysteresis(metric_name, temperature, &self.temperature_thresholds)
}
/// Populate drives data into AgentData
fn populate_drives_data(&self, physical_drives: &[PhysicalDrive], smart_data: &HashMap<String, SmartData>, agent_data: &mut AgentData) -> Result<(), CollectorError> {
    for drive in physical_drives {
        let smart = smart_data.get(&drive.name);

        let filesystems: Vec<FilesystemData> = drive.filesystems.iter().map(|fs| {
            FilesystemData {
                mount: fs.mount_point.clone(), // Preserves "/" and "/boot" correctly
                usage_percent: fs.usage_percent,
                used_gb: fs.used_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
                total_gb: fs.total_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
            }
        }).collect();

        agent_data.system.storage.drives.push(DriveData {
            name: drive.name.clone(),
            health: smart.map(|s| s.health.clone()).unwrap_or_else(|| drive.health.clone()),
            temperature_celsius: smart.and_then(|s| s.temperature_celsius),
            wear_percent: smart.and_then(|s| s.wear_percent),
            filesystems,
        });
    }

    Ok(())
}

/// Convert bytes to human readable format
fn bytes_to_human_readable(&self, bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "K", "M", "G", "T"];
    let mut size = bytes as f64;
    let mut unit_index = 0;

    while size >= 1024.0 && unit_index < UNITS.len() - 1 {
        size /= 1024.0;
        unit_index += 1;
    }

    if unit_index == 0 {
        format!("{:.0}{}", size, UNITS[unit_index])
    } else {
        format!("{:.1}{}", size, UNITS[unit_index])
    }
}

/// Convert bytes to gigabytes
fn bytes_to_gb(&self, bytes: u64) -> f32 {
    bytes as f32 / (1024.0 * 1024.0 * 1024.0)
}
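// Rough usage sketch for the helpers above (assumed values, not part of the diff):
//   bytes_to_human_readable(1_610_612_736) -> "1.5G"
//   bytes_to_gb(1_610_612_736)             -> 1.5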
/// Populate pools data into AgentData
fn populate_pools_data(&self, mergerfs_pools: &[MergerfsPool], _smart_data: &HashMap<String, SmartData>, agent_data: &mut AgentData) -> Result<(), CollectorError> {
    for pool in mergerfs_pools {
        let pool_data = PoolData {
            name: pool.name.clone(),
            mount: pool.mount_point.clone(),
            pool_type: "mergerfs".to_string(),
            health: "healthy".to_string(), // TODO: Calculate based on member drives
            usage_percent: (pool.used_bytes as f32 / pool.total_bytes as f32) * 100.0,
            used_gb: pool.used_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
            total_gb: pool.total_bytes as f32 / (1024.0 * 1024.0 * 1024.0),
            data_drives: pool.data_drives.iter().map(|d| cm_dashboard_shared::PoolDriveData {
                name: d.name.clone(),
                temperature_celsius: d.temperature_celsius,
                health: "unknown".to_string(),
                wear_percent: None,
            }).collect(),
            parity_drives: pool.parity_drives.iter().map(|d| cm_dashboard_shared::PoolDriveData {
                name: d.name.clone(),
                temperature_celsius: d.temperature_celsius,
                health: "unknown".to_string(),
                wear_percent: None,
            }).collect(),
        };

        agent_data.system.storage.pools.push(pool_data);
    }

    Ok(())
}
}
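// Hedged illustration of the structured pool output this populates; the exact
// JSON field names come from the shared PoolData/PoolDriveData serde derives,
// which are assumed here rather than shown in this diff:
//
//   { "name": "pool", "mount": "/mnt/pool", "pool_type": "mergerfs",
//     "health": "healthy", "usage_percent": 42.0, "used_gb": 420.0, "total_gb": 1000.0,
//     "data_drives": [ { "name": "sda", "temperature_celsius": 35.0,
//                        "health": "unknown", "wear_percent": null } ],
//     "parity_drives": [] }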
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for DiskCollector {
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
let start_time = Instant::now();
|
||||
debug!("Starting clean storage collection");
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
// Discover storage topology
|
||||
let topology = match self.discover_storage() {
|
||||
Ok(topology) => topology,
|
||||
Err(e) => {
|
||||
tracing::error!("Storage discovery failed: {}", e);
|
||||
return Ok(metrics);
|
||||
}
|
||||
};
|
||||
|
||||
// Generate metrics for physical drives
|
||||
for drive in &topology.physical_drives {
|
||||
self.generate_physical_drive_metrics(&mut metrics, drive, timestamp, status_tracker);
|
||||
}
|
||||
|
||||
// Generate metrics for mergerfs pools
|
||||
for pool in &topology.mergerfs_pools {
|
||||
self.generate_mergerfs_pool_metrics(&mut metrics, pool, timestamp, status_tracker);
|
||||
}
|
||||
|
||||
// Add total storage count
|
||||
let total_storage = topology.physical_drives.len() + topology.mergerfs_pools.len();
|
||||
metrics.push(Metric {
|
||||
name: "disk_count".to_string(),
|
||||
value: MetricValue::Integer(total_storage as i64),
|
||||
unit: None,
|
||||
description: Some(format!("Total storage: {} drives, {} pools", topology.physical_drives.len(), topology.mergerfs_pools.len())),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
let collection_time = start_time.elapsed();
|
||||
debug!("Clean storage collection completed in {:?} with {} metrics", collection_time, metrics.len());
|
||||
|
||||
        Ok(metrics)
    }

    async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
        self.collect_storage_data(agent_data).await
    }
|
||||
}
|
||||
|
||||
impl DiskCollector {
|
||||
/// Generate metrics for a physical drive and its filesystems
|
||||
fn generate_physical_drive_metrics(
|
||||
&self,
|
||||
metrics: &mut Vec<Metric>,
|
||||
drive: &PhysicalDrive,
|
||||
timestamp: u64,
|
||||
status_tracker: &mut StatusTracker
|
||||
) {
|
||||
let drive_name = &drive.device;
|
||||
|
||||
// Calculate drive totals
|
||||
let total_capacity: u64 = drive.filesystems.iter().map(|fs| fs.total_bytes).sum();
|
||||
let total_used: u64 = drive.filesystems.iter().map(|fs| fs.used_bytes).sum();
|
||||
let total_available = total_capacity.saturating_sub(total_used);
|
||||
let usage_percent = if total_capacity > 0 {
|
||||
(total_used as f64 / total_capacity as f64) * 100.0
|
||||
} else { 0.0 };
|
||||
|
||||
// Drive health status
|
||||
let health_status = if drive.health_status == "PASSED" { Status::Ok }
|
||||
else if drive.health_status == "FAILED" { Status::Critical }
|
||||
else { Status::Unknown };
|
||||
|
||||
// Usage status
|
||||
let usage_status = if usage_percent >= self.config.usage_critical_percent as f64 {
|
||||
Status::Critical
|
||||
} else if usage_percent >= self.config.usage_warning_percent as f64 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
|
||||
let drive_status = if health_status == Status::Critical { Status::Critical } else { usage_status };
|
||||
|
||||
// Drive info metrics
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_health", drive_name),
|
||||
value: MetricValue::String(drive.health_status.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("{}: {}", drive_name, drive.health_status)),
|
||||
status: health_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Drive temperature
|
||||
if let Some(temp) = drive.temperature {
|
||||
let temp_status = self.calculate_temperature_status(
|
||||
&format!("disk_{}_temperature", drive_name), temp, status_tracker
|
||||
);
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_temperature", drive_name),
|
||||
value: MetricValue::Float(temp),
|
||||
unit: Some("°C".to_string()),
|
||||
description: Some(format!("{}: {:.0}°C", drive_name, temp)),
|
||||
status: temp_status,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
|
||||
// Drive wear level
|
||||
if let Some(wear) = drive.wear_level {
|
||||
let wear_status = if wear >= self.config.wear_critical_percent { Status::Critical }
|
||||
else if wear >= self.config.wear_warning_percent { Status::Warning }
|
||||
else { Status::Ok };
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_wear_percent", drive_name),
|
||||
value: MetricValue::Float(wear),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("{}: {:.0}% wear", drive_name, wear)),
|
||||
status: wear_status,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
|
||||
// Drive capacity metrics
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_total_gb", drive_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(total_capacity)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("{}: {}", drive_name, self.bytes_to_human_readable(total_capacity))),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_used_gb", drive_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(total_used)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("{}: {}", drive_name, self.bytes_to_human_readable(total_used))),
|
||||
status: drive_status.clone(),
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_available_gb", drive_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(total_available)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("{}: {}", drive_name, self.bytes_to_human_readable(total_available))),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_usage_percent", drive_name),
|
||||
value: MetricValue::Float(usage_percent as f32),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("{}: {:.1}%", drive_name, usage_percent)),
|
||||
status: drive_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Pool type indicator
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_pool_type", drive_name),
|
||||
value: MetricValue::String(format!("drive ({})", drive.filesystems.len())),
|
||||
unit: None,
|
||||
description: Some(format!("Type: physical drive")),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Individual filesystem metrics
|
||||
for filesystem in &drive.filesystems {
|
||||
let fs_name = if filesystem.mount_point == "/" {
|
||||
"root".to_string()
|
||||
} else {
|
||||
filesystem.mount_point.trim_start_matches('/').replace('/', "_")
|
||||
};
|
||||
|
||||
let fs_usage_percent = if filesystem.total_bytes > 0 {
|
||||
(filesystem.used_bytes as f64 / filesystem.total_bytes as f64) * 100.0
|
||||
} else { 0.0 };
|
||||
|
||||
let fs_status = if fs_usage_percent >= self.config.usage_critical_percent as f64 {
|
||||
Status::Critical
|
||||
} else if fs_usage_percent >= self.config.usage_warning_percent as f64 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_fs_{}_usage_percent", drive_name, fs_name),
|
||||
value: MetricValue::Float(fs_usage_percent as f32),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("{}: {:.0}%", filesystem.mount_point, fs_usage_percent)),
|
||||
status: fs_status.clone(),
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_fs_{}_used_gb", drive_name, fs_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(filesystem.used_bytes)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("{}: {}", filesystem.mount_point, self.bytes_to_human_readable(filesystem.used_bytes))),
|
||||
status: fs_status.clone(),
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_fs_{}_total_gb", drive_name, fs_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(filesystem.total_bytes)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("{}: {}", filesystem.mount_point, self.bytes_to_human_readable(filesystem.total_bytes))),
|
||||
status: fs_status.clone(),
|
||||
timestamp,
|
||||
});
|
||||
|
||||
let fs_available = filesystem.total_bytes.saturating_sub(filesystem.used_bytes);
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_fs_{}_available_gb", drive_name, fs_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(fs_available)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("{}: {}", filesystem.mount_point, self.bytes_to_human_readable(fs_available))),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_fs_{}_mount_point", drive_name, fs_name),
|
||||
value: MetricValue::String(filesystem.mount_point.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Mount: {}", filesystem.mount_point)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate metrics for a mergerfs pool
|
||||
fn generate_mergerfs_pool_metrics(
|
||||
&self,
|
||||
metrics: &mut Vec<Metric>,
|
||||
pool: &MergerfsPool,
|
||||
timestamp: u64,
|
||||
status_tracker: &mut StatusTracker
|
||||
) {
|
||||
// Use consistent pool naming: extract mount point without leading slash
|
||||
let pool_name = if pool.mount_point == "/" {
|
||||
"root".to_string()
|
||||
} else {
|
||||
pool.mount_point.trim_start_matches('/').replace('/', "_")
|
||||
};
|
||||
|
||||
if pool_name.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let usage_percent = if pool.total_bytes > 0 {
|
||||
(pool.used_bytes as f64 / pool.total_bytes as f64) * 100.0
|
||||
} else { 0.0 };
|
||||
|
||||
// Calculate pool health based on drive health
|
||||
let failed_data = pool.data_drives.iter()
|
||||
.filter(|d| d.health_status != "PASSED")
|
||||
.count();
|
||||
let failed_parity = pool.parity_drives.iter()
|
||||
.filter(|d| d.health_status != "PASSED")
|
||||
.count();
|
||||
|
||||
let pool_health = match (failed_data, failed_parity) {
|
||||
(0, 0) => Status::Ok,
|
||||
(1, 0) | (0, 1) => Status::Warning,
|
||||
_ => Status::Critical,
|
||||
};
|
||||
|
||||
let usage_status = if usage_percent >= self.config.usage_critical_percent as f64 {
|
||||
Status::Critical
|
||||
} else if usage_percent >= self.config.usage_warning_percent as f64 {
|
||||
Status::Warning
|
||||
} else {
|
||||
Status::Ok
|
||||
};
|
||||
|
||||
let pool_status = if pool_health == Status::Critical { Status::Critical } else { usage_status };
|
||||
|
||||
// Pool metrics
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_mount_point", pool_name),
|
||||
value: MetricValue::String(pool.mount_point.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("Mount: {}", pool.mount_point)),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_pool_type", pool_name),
|
||||
value: MetricValue::String(format!("mergerfs ({}+{})", pool.data_drives.len(), pool.parity_drives.len())),
|
||||
unit: None,
|
||||
description: Some("Type: mergerfs".to_string()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_pool_health", pool_name),
|
||||
value: MetricValue::String(match pool_health {
|
||||
Status::Ok => "healthy".to_string(),
|
||||
Status::Warning => "degraded".to_string(),
|
||||
Status::Critical => "critical".to_string(),
|
||||
_ => "unknown".to_string(),
|
||||
}),
|
||||
unit: None,
|
||||
description: Some("Pool health".to_string()),
|
||||
status: pool_health,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_total_gb", pool_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(pool.total_bytes)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Total: {}", self.bytes_to_human_readable(pool.total_bytes))),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_used_gb", pool_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(pool.used_bytes)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Used: {}", self.bytes_to_human_readable(pool.used_bytes))),
|
||||
status: pool_status.clone(),
|
||||
timestamp,
|
||||
});
|
||||
|
||||
let available_bytes = pool.total_bytes.saturating_sub(pool.used_bytes);
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_available_gb", pool_name),
|
||||
value: MetricValue::Float(self.bytes_to_gb(available_bytes)),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some(format!("Available: {}", self.bytes_to_human_readable(available_bytes))),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_usage_percent", pool_name),
|
||||
value: MetricValue::Float(usage_percent as f32),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("Usage: {:.1}%", usage_percent)),
|
||||
status: pool_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
// Individual drive metrics
|
||||
for drive in &pool.data_drives {
|
||||
self.generate_pool_drive_metrics(metrics, &pool_name, &drive.device, drive, timestamp, status_tracker);
|
||||
}
|
||||
|
||||
for drive in &pool.parity_drives {
|
||||
self.generate_pool_drive_metrics(metrics, &pool_name, &drive.device, drive, timestamp, status_tracker);
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate metrics for drives in mergerfs pools
|
||||
fn generate_pool_drive_metrics(
|
||||
&self,
|
||||
metrics: &mut Vec<Metric>,
|
||||
pool_name: &str,
|
||||
drive_role: &str,
|
||||
drive: &DriveInfo,
|
||||
timestamp: u64,
|
||||
status_tracker: &mut StatusTracker
|
||||
) {
|
||||
let drive_health = if drive.health_status == "PASSED" { Status::Ok }
|
||||
else if drive.health_status == "FAILED" { Status::Critical }
|
||||
else { Status::Unknown };
|
||||
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_{}_health", pool_name, drive_role),
|
||||
value: MetricValue::String(drive.health_status.clone()),
|
||||
unit: None,
|
||||
description: Some(format!("{}: {}", drive.device, drive.health_status)),
|
||||
status: drive_health,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
if let Some(temp) = drive.temperature {
|
||||
let temp_status = self.calculate_temperature_status(
|
||||
&format!("disk_{}_{}_temperature", pool_name, drive_role), temp, status_tracker
|
||||
);
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_{}_temperature", pool_name, drive_role),
|
||||
value: MetricValue::Float(temp),
|
||||
unit: Some("°C".to_string()),
|
||||
description: Some(format!("{}: {:.0}°C", drive.device, temp)),
|
||||
status: temp_status,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(wear) = drive.wear_level {
|
||||
let wear_status = if wear >= self.config.wear_critical_percent { Status::Critical }
|
||||
else if wear >= self.config.wear_warning_percent { Status::Warning }
|
||||
else { Status::Ok };
|
||||
metrics.push(Metric {
|
||||
name: format!("disk_{}_{}_wear_percent", pool_name, drive_role),
|
||||
value: MetricValue::Float(wear),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some(format!("{}: {:.0}% wear", drive.device, wear)),
|
||||
status: wear_status,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
/// SMART data for a drive
|
||||
#[derive(Debug, Clone)]
|
||||
struct SmartData {
|
||||
health: String,
|
||||
temperature_celsius: Option<f32>,
|
||||
wear_percent: Option<f32>,
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{registry, Metric, MetricValue, Status, StatusTracker, HysteresisThresholds};
|
||||
use cm_dashboard_shared::{AgentData, TmpfsData, HysteresisThresholds};
|
||||
|
||||
use tracing::debug;
|
||||
|
||||
@@ -10,34 +10,19 @@ use crate::config::MemoryConfig;
|
||||
///
|
||||
/// EFFICIENCY OPTIMIZATIONS:
|
||||
/// - Single /proc/meminfo read for all memory metrics
|
||||
/// - Minimal string parsing with split operations
|
||||
/// - Pre-calculated KB to GB conversion
|
||||
/// - No regex or complex parsing
|
||||
/// - <0.1ms collection time target
|
||||
/// - Minimal string allocations
|
||||
/// - No process spawning for basic metrics
|
||||
/// - <0.5ms collection time target
|
||||
pub struct MemoryCollector {
|
||||
usage_thresholds: HysteresisThresholds,
|
||||
}
|
||||
|
||||
/// Memory information parsed from /proc/meminfo
|
||||
#[derive(Debug, Default)]
|
||||
struct MemoryInfo {
|
||||
total_kb: u64,
|
||||
available_kb: u64,
|
||||
free_kb: u64,
|
||||
buffers_kb: u64,
|
||||
cached_kb: u64,
|
||||
swap_total_kb: u64,
|
||||
swap_free_kb: u64,
|
||||
}
|
||||
|
||||
impl MemoryCollector {
|
||||
pub fn new(config: MemoryConfig) -> Self {
|
||||
// Create hysteresis thresholds with 5% gap for memory usage
|
||||
let usage_thresholds = HysteresisThresholds::with_custom_gaps(
|
||||
// Create hysteresis thresholds with 10% gap for recovery
|
||||
let usage_thresholds = HysteresisThresholds::new(
|
||||
config.usage_warning_percent,
|
||||
5.0, // 5% gap for warning recovery
|
||||
config.usage_critical_percent,
|
||||
5.0, // 5% gap for critical recovery
|
||||
);
|
||||
|
||||
Self {
|
||||
@@ -45,11 +30,6 @@ impl MemoryCollector {
|
||||
}
|
||||
}
|
||||
|
||||
/// Calculate memory usage status using hysteresis thresholds
|
||||
fn calculate_usage_status(&self, metric_name: &str, usage_percent: f32, status_tracker: &mut StatusTracker) -> Status {
|
||||
status_tracker.calculate_with_hysteresis(metric_name, usage_percent, &self.usage_thresholds)
|
||||
}
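// Hedged sketch of the hysteresis behaviour relied on above (thresholds are
// illustrative; the real values come from MemoryConfig): with a warning
// threshold of 80% and a 5% recovery gap, usage must reach >= 80% to enter
// Warning, but must fall below 75% before the status returns to Ok. This
// prevents status flapping when usage hovers around a threshold.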
|
||||
|
||||
/// Parse /proc/meminfo efficiently
|
||||
/// Format: "MemTotal: 16384000 kB"
|
||||
async fn parse_meminfo(&self) -> Result<MemoryInfo, CollectorError> {
|
||||
@@ -96,212 +76,133 @@ impl MemoryCollector {
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
/// Convert KB to GB efficiently (avoiding floating point in hot path)
|
||||
fn kb_to_gb(kb: u64) -> f32 {
|
||||
kb as f32 / 1_048_576.0 // 1024 * 1024
|
||||
}
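// Worked example (assumed input): kb_to_gb(16_384_000) = 16_384_000 / 1_048_576 ≈ 15.6 GB.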
|
||||
|
||||
/// Calculate memory metrics from parsed info
|
||||
fn calculate_metrics(&self, info: &MemoryInfo, status_tracker: &mut StatusTracker) -> Vec<Metric> {
|
||||
let mut metrics = Vec::with_capacity(6);
|
||||
|
||||
/// Populate memory data directly into AgentData
|
||||
async fn populate_memory_data(&self, info: &MemoryInfo, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
// Calculate derived values
|
||||
let used_kb = info.total_kb - info.available_kb;
|
||||
let usage_percent = (used_kb as f32 / info.total_kb as f32) * 100.0;
|
||||
let usage_status = self.calculate_usage_status(registry::MEMORY_USAGE_PERCENT, usage_percent, status_tracker);
|
||||
let available = info.available_kb;
|
||||
let used = info.total_kb - available;
|
||||
let usage_percent = (used as f32 / info.total_kb as f32) * 100.0;
|
||||
|
||||
let swap_used_kb = info.swap_total_kb - info.swap_free_kb;
|
||||
// Populate basic memory fields
|
||||
agent_data.system.memory.usage_percent = usage_percent;
|
||||
agent_data.system.memory.total_gb = info.total_kb as f32 / (1024.0 * 1024.0);
|
||||
agent_data.system.memory.used_gb = used as f32 / (1024.0 * 1024.0);
|
||||
|
||||
// Convert to GB for metrics
|
||||
let total_gb = Self::kb_to_gb(info.total_kb);
|
||||
let used_gb = Self::kb_to_gb(used_kb);
|
||||
let available_gb = Self::kb_to_gb(info.available_kb);
|
||||
let swap_total_gb = Self::kb_to_gb(info.swap_total_kb);
|
||||
let swap_used_gb = Self::kb_to_gb(swap_used_kb);
|
||||
// Populate swap data if available
|
||||
agent_data.system.memory.swap_total_gb = info.swap_total_kb as f32 / (1024.0 * 1024.0);
|
||||
agent_data.system.memory.swap_used_gb = (info.swap_total_kb - info.swap_free_kb) as f32 / (1024.0 * 1024.0);
|
||||
|
||||
// Memory usage percentage (primary metric with status)
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_USAGE_PERCENT.to_string(),
|
||||
MetricValue::Float(usage_percent),
|
||||
usage_status,
|
||||
)
|
||||
.with_description("Memory usage percentage".to_string())
|
||||
.with_unit("%".to_string()),
|
||||
);
|
||||
|
||||
// Total memory
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(total_gb),
|
||||
Status::Ok, // Total memory doesn't have status
|
||||
)
|
||||
.with_description("Total system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Used memory
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_USED_GB.to_string(),
|
||||
MetricValue::Float(used_gb),
|
||||
Status::Ok, // Used memory absolute value doesn't have status
|
||||
)
|
||||
.with_description("Used system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Available memory
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_AVAILABLE_GB.to_string(),
|
||||
MetricValue::Float(available_gb),
|
||||
Status::Ok, // Available memory absolute value doesn't have status
|
||||
)
|
||||
.with_description("Available system memory".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
// Swap metrics (only if swap exists)
|
||||
if info.swap_total_kb > 0 {
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_SWAP_TOTAL_GB.to_string(),
|
||||
MetricValue::Float(swap_total_gb),
|
||||
Status::Ok,
|
||||
)
|
||||
.with_description("Total swap space".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
|
||||
metrics.push(
|
||||
Metric::new(
|
||||
registry::MEMORY_SWAP_USED_GB.to_string(),
|
||||
MetricValue::Float(swap_used_gb),
|
||||
Status::Ok,
|
||||
)
|
||||
.with_description("Used swap space".to_string())
|
||||
.with_unit("GB".to_string()),
|
||||
);
|
||||
}
|
||||
|
||||
// Monitor tmpfs (/tmp) usage
|
||||
if let Ok(tmpfs_metrics) = self.get_tmpfs_metrics(status_tracker) {
|
||||
metrics.extend(tmpfs_metrics);
|
||||
}
|
||||
|
||||
metrics
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get tmpfs (/tmp) usage metrics
|
||||
fn get_tmpfs_metrics(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
use std::process::Command;
|
||||
/// Populate tmpfs data into AgentData
|
||||
async fn populate_tmpfs_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
// Discover all tmpfs mount points
|
||||
let tmpfs_mounts = self.discover_tmpfs_mounts()?;
|
||||
|
||||
let output = Command::new("df")
|
||||
.arg("--block-size=1")
|
||||
.arg("/tmp")
|
||||
if tmpfs_mounts.is_empty() {
|
||||
debug!("No tmpfs mounts found to monitor");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Get usage data for all tmpfs mounts at once using df
|
||||
let mut df_args = vec!["df", "--output=target,size,used", "--block-size=1"];
|
||||
df_args.extend(tmpfs_mounts.iter().map(|s| s.as_str()));
|
||||
|
||||
let df_output = std::process::Command::new(df_args[0])
|
||||
.args(&df_args[1..])
|
||||
.output()
|
||||
.map_err(|e| CollectorError::SystemRead {
|
||||
path: "/tmp".to_string(),
|
||||
path: "tmpfs mounts".to_string(),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Ok(Vec::new()); // Return empty if /tmp not available
|
||||
let df_str = String::from_utf8_lossy(&df_output.stdout);
|
||||
let df_lines: Vec<&str> = df_str.lines().skip(1).collect(); // Skip header
|
||||
|
||||
// Process each tmpfs mount
|
||||
for (i, mount_point) in tmpfs_mounts.iter().enumerate() {
|
||||
if i >= df_lines.len() {
|
||||
debug!("Not enough df output lines for tmpfs mount: {}", mount_point);
|
||||
continue;
|
||||
}
|
||||
|
||||
let parts: Vec<&str> = df_lines[i].split_whitespace().collect();
|
||||
if parts.len() < 3 {
|
||||
debug!("Invalid df output for tmpfs mount: {}", mount_point);
|
||||
continue;
|
||||
}
|
||||
|
||||
let total_bytes: u64 = parts[1].parse().unwrap_or(0);
|
||||
let used_bytes: u64 = parts[2].parse().unwrap_or(0);
|
||||
|
||||
if total_bytes == 0 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let total_gb = total_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
||||
let used_gb = used_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
||||
let usage_percent = (used_bytes as f32 / total_bytes as f32) * 100.0;
|
||||
|
||||
// Add to tmpfs list
|
||||
agent_data.system.memory.tmpfs.push(TmpfsData {
|
||||
mount: mount_point.clone(),
|
||||
usage_percent,
|
||||
used_gb,
|
||||
total_gb,
|
||||
});
|
||||
}
|
||||
|
||||
let output_str = String::from_utf8(output.stdout)
|
||||
.map_err(|e| CollectorError::Parse {
|
||||
value: "df output".to_string(),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
let lines: Vec<&str> = output_str.lines().collect();
|
||||
if lines.len() < 2 {
|
||||
return Ok(Vec::new());
|
||||
/// Discover all tmpfs mount points from /proc/mounts
|
||||
fn discover_tmpfs_mounts(&self) -> Result<Vec<String>, CollectorError> {
|
||||
let content = utils::read_proc_file("/proc/mounts")?;
|
||||
let mut tmpfs_mounts = Vec::new();
|
||||
|
||||
for line in content.lines() {
|
||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||
if fields.len() >= 3 && fields[2] == "tmpfs" {
|
||||
let mount_point = fields[1];
|
||||
|
||||
// Filter out system/internal tmpfs mounts that aren't useful for monitoring
|
||||
if self.should_monitor_tmpfs(mount_point) {
|
||||
tmpfs_mounts.push(mount_point.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let fields: Vec<&str> = lines[1].split_whitespace().collect();
|
||||
if fields.len() < 4 {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
debug!("Discovered {} tmpfs mounts: {:?}", tmpfs_mounts.len(), tmpfs_mounts);
|
||||
Ok(tmpfs_mounts)
|
||||
}
|
||||
|
||||
let total_bytes: u64 = fields[1].parse()
|
||||
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||
value: fields[1].to_string(),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
let used_bytes: u64 = fields[2].parse()
|
||||
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
||||
value: fields[2].to_string(),
|
||||
error: e.to_string(),
|
||||
})?;
|
||||
|
||||
let total_gb = total_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
||||
let used_gb = used_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
|
||||
let usage_percent = if total_bytes > 0 {
|
||||
(used_bytes as f32 / total_bytes as f32) * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
// Calculate status using same thresholds as main memory
|
||||
let tmp_status = self.calculate_usage_status("memory_tmp_usage_percent", usage_percent, status_tracker);
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "memory_tmp_usage_percent".to_string(),
|
||||
value: MetricValue::Float(usage_percent),
|
||||
unit: Some("%".to_string()),
|
||||
description: Some("tmpfs /tmp usage percentage".to_string()),
|
||||
status: tmp_status,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "memory_tmp_used_gb".to_string(),
|
||||
value: MetricValue::Float(used_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some("tmpfs /tmp used space".to_string()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
metrics.push(Metric {
|
||||
name: "memory_tmp_total_gb".to_string(),
|
||||
value: MetricValue::Float(total_gb),
|
||||
unit: Some("GB".to_string()),
|
||||
description: Some("tmpfs /tmp total space".to_string()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
|
||||
Ok(metrics)
|
||||
/// Determine if a tmpfs mount point should be monitored
|
||||
fn should_monitor_tmpfs(&self, mount_point: &str) -> bool {
|
||||
// Include commonly useful tmpfs mounts
|
||||
matches!(mount_point,
|
||||
"/tmp" | "/var/tmp" | "/dev/shm" | "/run" | "/var/log"
|
||||
) || mount_point.starts_with("/run/user/") // User session tmpfs
|
||||
}
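// Assumed examples for the filter above:
//   should_monitor_tmpfs("/tmp")           -> true
//   should_monitor_tmpfs("/run/user/1000") -> true
//   should_monitor_tmpfs("/sys/fs/cgroup") -> false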
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for MemoryCollector {
|
||||
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
debug!("Collecting memory metrics");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
// Parse memory info from /proc/meminfo
|
||||
let info = self.parse_meminfo().await?;
|
||||
|
||||
// Calculate all metrics from parsed info
|
||||
let metrics = self.calculate_metrics(&info, status_tracker);
|
||||
// Populate memory data directly
|
||||
self.populate_memory_data(&info, agent_data).await?;
|
||||
|
||||
// Collect tmpfs data
|
||||
self.populate_tmpfs_data(agent_data).await?;
|
||||
|
||||
let duration = start.elapsed();
|
||||
debug!(
|
||||
"Memory collection completed in {:?} with {} metrics",
|
||||
duration,
|
||||
metrics.len()
|
||||
);
|
||||
debug!("Memory collection completed in {:?}", duration);
|
||||
|
||||
// Efficiency check: warn if collection takes too long
|
||||
if duration.as_millis() > 1 {
|
||||
@@ -311,10 +212,18 @@ impl Collector for MemoryCollector {
|
||||
);
|
||||
}
|
||||
|
||||
// Store performance metrics
|
||||
// Performance tracking handled by cache system
|
||||
|
||||
Ok(metrics)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// Internal structure for parsing /proc/meminfo
|
||||
#[derive(Default)]
|
||||
struct MemoryInfo {
|
||||
total_kb: u64,
|
||||
available_kb: u64,
|
||||
free_kb: u64,
|
||||
buffers_kb: u64,
|
||||
cached_kb: u64,
|
||||
swap_total_kb: u64,
|
||||
swap_free_kb: u64,
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, StatusTracker};
|
||||
use cm_dashboard_shared::{AgentData};
|
||||
|
||||
|
||||
pub mod backup;
|
||||
@@ -13,13 +13,11 @@ pub mod systemd;
|
||||
pub use error::CollectorError;
|
||||
|
||||
|
||||
/// Base trait for all collectors with extreme efficiency requirements
|
||||
/// Base trait for all collectors with direct structured data output
|
||||
#[async_trait]
|
||||
pub trait Collector: Send + Sync {
|
||||
/// Collect all metrics this collector provides
|
||||
async fn collect(&self, status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError>;
|
||||
|
||||
|
||||
/// Collect data and populate AgentData directly with status evaluation
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError>;
|
||||
}
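// Hedged sketch (not part of the commit) of what a minimal collector looks like
// against the structured trait, assuming collect_structured is the only required
// method after this migration and that AgentData exposes the hostname field used
// elsewhere in this diff:
//
// struct HostnameCollector;
//
// #[async_trait]
// impl Collector for HostnameCollector {
//     async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
//         // Populate typed fields directly; no string metrics are emitted.
//         agent_data.hostname = std::fs::read_to_string("/etc/hostname")
//             .map(|s| s.trim().to_string())
//             .unwrap_or_else(|_| "unknown".to_string());
//         Ok(())
//     }
// }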
|
||||
|
||||
/// CPU efficiency rules for all collectors
|
||||
|
||||
@@ -1,172 +1,100 @@
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||
use cm_dashboard_shared::AgentData;
|
||||
use std::fs;
|
||||
use std::process::Command;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
use crate::config::NixOSConfig;
|
||||
|
||||
/// NixOS system information collector
|
||||
/// NixOS system information collector with structured data output
|
||||
///
|
||||
/// Collects NixOS-specific system information including:
|
||||
/// - NixOS version and build information
|
||||
/// This collector gathers NixOS-specific information like:
|
||||
/// - System generation/build information
|
||||
/// - Version information
|
||||
/// - Agent version from Nix store path
|
||||
pub struct NixOSCollector {
|
||||
config: NixOSConfig,
|
||||
}
|
||||
|
||||
impl NixOSCollector {
|
||||
pub fn new(_config: NixOSConfig) -> Self {
|
||||
Self {}
|
||||
pub fn new(config: NixOSConfig) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Collect NixOS system information and populate AgentData
|
||||
async fn collect_nixos_info(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
debug!("Collecting NixOS system information");
|
||||
|
||||
/// Get agent hash from binary path
|
||||
fn get_agent_hash(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||
// Get the path of the current executable
|
||||
let exe_path = std::env::current_exe()?;
|
||||
let exe_str = exe_path.to_string_lossy();
|
||||
|
||||
// Extract Nix store hash from path like /nix/store/fn804fh332mp8gz06qawminpj20xl25h-cm-dashboard-0.1.0/bin/cm-dashboard-agent
|
||||
if let Some(store_path) = exe_str.strip_prefix("/nix/store/") {
|
||||
if let Some(dash_pos) = store_path.find('-') {
|
||||
return Ok(store_path[..dash_pos].to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to "unknown" if not in Nix store
|
||||
Ok("unknown".to_string())
|
||||
// Set hostname (this is universal, not NixOS-specific)
|
||||
agent_data.hostname = self.get_hostname().await.unwrap_or_else(|| "unknown".to_string());
|
||||
|
||||
// Set agent version from environment or Nix store path
|
||||
agent_data.agent_version = self.get_agent_version().await;
|
||||
|
||||
// Set current timestamp
|
||||
agent_data.timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get configuration hash from deployed nix store system
|
||||
/// Get git commit hash from rebuild process
|
||||
fn get_git_commit(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||
let commit_file = "/var/lib/cm-dashboard/git-commit";
|
||||
match std::fs::read_to_string(commit_file) {
|
||||
Ok(content) => {
|
||||
let commit_hash = content.trim();
|
||||
if commit_hash.len() >= 7 {
|
||||
Ok(commit_hash.to_string())
|
||||
} else {
|
||||
Err("Git commit hash too short".into())
|
||||
}
|
||||
}
|
||||
Err(e) => Err(format!("Failed to read git commit file: {}", e).into())
|
||||
}
|
||||
}
|
||||
|
||||
fn get_config_hash(&self) -> Result<String, Box<dyn std::error::Error>> {
|
||||
// Read the symlink target of /run/current-system to get nix store path
|
||||
let output = Command::new("readlink")
|
||||
.arg("/run/current-system")
|
||||
.output()?;
|
||||
|
||||
if !output.status.success() {
|
||||
return Err("readlink command failed".into());
|
||||
}
|
||||
|
||||
let binding = String::from_utf8_lossy(&output.stdout);
|
||||
let store_path = binding.trim();
|
||||
|
||||
// Extract hash from nix store path
|
||||
// Format: /nix/store/HASH-nixos-system-HOSTNAME-VERSION
|
||||
if let Some(hash_part) = store_path.strip_prefix("/nix/store/") {
|
||||
if let Some(hash) = hash_part.split('-').next() {
|
||||
if hash.len() >= 8 {
|
||||
// Return first 8 characters of nix store hash
|
||||
return Ok(hash[..8].to_string());
|
||||
/// Get system hostname
|
||||
async fn get_hostname(&self) -> Option<String> {
|
||||
match fs::read_to_string("/etc/hostname") {
|
||||
Ok(hostname) => Some(hostname.trim().to_string()),
|
||||
Err(_) => {
|
||||
// Fallback to hostname command
|
||||
match Command::new("hostname").output() {
|
||||
Ok(output) => Some(String::from_utf8_lossy(&output.stdout).trim().to_string()),
|
||||
Err(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err("Could not extract hash from nix store path".into())
|
||||
}
|
||||
|
||||
/// Get agent version from Nix store path or environment
|
||||
async fn get_agent_version(&self) -> String {
|
||||
// Try to extract version from the current executable path (Nix store)
|
||||
if let Ok(current_exe) = std::env::current_exe() {
|
||||
if let Some(exe_path) = current_exe.to_str() {
|
||||
if exe_path.starts_with("/nix/store/") {
|
||||
// Extract version from Nix store path
|
||||
// Path format: /nix/store/hash-cm-dashboard-agent-v0.1.138/bin/cm-dashboard-agent
|
||||
if let Some(store_part) = exe_path.strip_prefix("/nix/store/") {
|
||||
if let Some(dash_pos) = store_part.find('-') {
|
||||
let package_part = &store_part[dash_pos + 1..];
|
||||
if let Some(bin_pos) = package_part.find("/bin/") {
|
||||
let package_name = &package_part[..bin_pos];
|
||||
// Extract version from package name
|
||||
if let Some(version_start) = package_name.rfind("-v") {
|
||||
return package_name[version_start + 1..].to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to environment variable or default
|
||||
std::env::var("CM_DASHBOARD_VERSION").unwrap_or_else(|_| "unknown".to_string())
|
||||
}
|
||||
|
||||
/// Get NixOS system generation (build) information
|
||||
async fn get_nixos_generation(&self) -> Option<String> {
|
||||
match Command::new("nixos-version").output() {
|
||||
Ok(output) => {
|
||||
let version_str = String::from_utf8_lossy(&output.stdout);
|
||||
Some(version_str.trim().to_string())
|
||||
}
|
||||
Err(_) => None,
|
||||
}
|
||||
}
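// The nixos-version output is stored verbatim; it typically looks something like
// "24.05.20240601.abcdef (Uakari)" (assumed example; the exact format depends on the channel).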
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for NixOSCollector {
|
||||
|
||||
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
||||
debug!("Collecting NixOS system information");
|
||||
let mut metrics = Vec::new();
|
||||
let timestamp = chrono::Utc::now().timestamp() as u64;
|
||||
|
||||
// Collect git commit information (shows what's actually deployed)
|
||||
match self.get_git_commit() {
|
||||
Ok(git_commit) => {
|
||||
metrics.push(Metric {
|
||||
name: "system_nixos_build".to_string(),
|
||||
value: MetricValue::String(git_commit),
|
||||
unit: None,
|
||||
description: Some("Git commit hash of deployed configuration".to_string()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get git commit: {}", e);
|
||||
metrics.push(Metric {
|
||||
name: "system_nixos_build".to_string(),
|
||||
value: MetricValue::String("unknown".to_string()),
|
||||
unit: None,
|
||||
description: Some("Git commit hash (failed to detect)".to_string()),
|
||||
status: Status::Unknown,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Collect config hash
|
||||
match self.get_config_hash() {
|
||||
Ok(hash) => {
|
||||
metrics.push(Metric {
|
||||
name: "system_config_hash".to_string(),
|
||||
value: MetricValue::String(hash),
|
||||
unit: None,
|
||||
description: Some("NixOS deployed configuration hash".to_string()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get config hash: {}", e);
|
||||
metrics.push(Metric {
|
||||
name: "system_config_hash".to_string(),
|
||||
value: MetricValue::String("unknown".to_string()),
|
||||
unit: None,
|
||||
description: Some("Deployed config hash (failed to detect)".to_string()),
|
||||
status: Status::Unknown,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Collect agent hash
|
||||
match self.get_agent_hash() {
|
||||
Ok(hash) => {
|
||||
metrics.push(Metric {
|
||||
name: "system_agent_hash".to_string(),
|
||||
value: MetricValue::String(hash),
|
||||
unit: None,
|
||||
description: Some("Agent Nix store hash".to_string()),
|
||||
status: Status::Ok,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to get agent hash: {}", e);
|
||||
metrics.push(Metric {
|
||||
name: "system_agent_hash".to_string(),
|
||||
value: MetricValue::String("unknown".to_string()),
|
||||
unit: None,
|
||||
description: Some("Agent hash (failed to detect)".to_string()),
|
||||
status: Status::Unknown,
|
||||
timestamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Collected {} NixOS metrics", metrics.len());
|
||||
Ok(metrics)
|
||||
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
||||
self.collect_nixos_info(agent_data).await
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
||||
use cm_dashboard_shared::{AgentData, ServiceData};
|
||||
use std::process::Command;
|
||||
use std::sync::RwLock;
|
||||
use std::time::Instant;
|
||||
@@ -9,7 +9,7 @@ use tracing::debug;
|
||||
use super::{Collector, CollectorError};
|
||||
use crate::config::SystemdConfig;
|
||||
|
||||
/// Systemd collector for monitoring systemd services
|
||||
/// Systemd collector for monitoring systemd services with structured data output
|
||||
pub struct SystemdCollector {
|
||||
/// Cached state with thread-safe interior mutability
|
||||
state: RwLock<ServiceCacheState>,
|
||||
@@ -18,848 +18,205 @@ pub struct SystemdCollector {
|
||||
}
|
||||
|
||||
/// Internal state for service caching
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone)]
|
||||
struct ServiceCacheState {
|
||||
/// Interesting services to monitor (cached after discovery)
|
||||
monitored_services: Vec<String>,
|
||||
/// Cached service status information from discovery
|
||||
service_status_cache: std::collections::HashMap<String, ServiceStatusInfo>,
|
||||
/// Last time services were discovered
|
||||
last_discovery_time: Option<Instant>,
|
||||
/// How often to rediscover services (5 minutes)
|
||||
discovery_interval_seconds: u64,
|
||||
/// Cached nginx site latency metrics
|
||||
nginx_site_metrics: Vec<Metric>,
|
||||
/// Last time nginx sites were checked
|
||||
last_nginx_check_time: Option<Instant>,
|
||||
/// How often to check nginx site latency (configurable)
|
||||
nginx_check_interval_seconds: u64,
|
||||
/// Last collection time for performance tracking
|
||||
last_collection: Option<Instant>,
|
||||
/// Cached service data
|
||||
services: Vec<ServiceInfo>,
|
||||
}
|
||||
|
||||
/// Cached service status information from systemctl list-units
|
||||
/// Internal service information
|
||||
#[derive(Debug, Clone)]
|
||||
struct ServiceStatusInfo {
|
||||
load_state: String,
|
||||
active_state: String,
|
||||
sub_state: String,
|
||||
struct ServiceInfo {
|
||||
name: String,
|
||||
status: String, // "active", "inactive", "failed", etc.
|
||||
memory_mb: f32, // Memory usage in MB
|
||||
disk_gb: f32, // Disk usage in GB (usually 0 for services)
|
||||
}
|
||||
|
||||
impl SystemdCollector {
|
||||
pub fn new(config: SystemdConfig) -> Self {
|
||||
let state = ServiceCacheState {
|
||||
last_collection: None,
|
||||
services: Vec::new(),
|
||||
};
|
||||
|
||||
Self {
|
||||
state: RwLock::new(ServiceCacheState {
|
||||
monitored_services: Vec::new(),
|
||||
service_status_cache: std::collections::HashMap::new(),
|
||||
last_discovery_time: None,
|
||||
discovery_interval_seconds: config.interval_seconds,
|
||||
nginx_site_metrics: Vec::new(),
|
||||
last_nginx_check_time: None,
|
||||
nginx_check_interval_seconds: config.nginx_check_interval_seconds,
|
||||
}),
|
||||
state: RwLock::new(state),
|
||||
config,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get monitored services, discovering them if needed or cache is expired
|
||||
fn get_monitored_services(&self) -> Result<Vec<String>> {
|
||||
// Check if we need discovery without holding the lock
|
||||
let needs_discovery = {
|
||||
let state = self.state.read().unwrap();
|
||||
match state.last_discovery_time {
|
||||
None => true, // First time
|
||||
Some(last_time) => {
|
||||
let elapsed = last_time.elapsed().as_secs();
|
||||
elapsed >= state.discovery_interval_seconds
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
if needs_discovery {
|
||||
debug!("Discovering systemd services (cache expired or first run)");
|
||||
// Call discover_services_internal which doesn't update state
|
||||
match self.discover_services_internal() {
|
||||
Ok((services, status_cache)) => {
|
||||
// Update state with discovered services in a separate scope
|
||||
if let Ok(mut state) = self.state.write() {
|
||||
state.monitored_services = services.clone();
|
||||
state.service_status_cache = status_cache;
|
||||
state.last_discovery_time = Some(Instant::now());
|
||||
debug!(
|
||||
"Auto-discovered {} services to monitor: {:?}",
|
||||
state.monitored_services.len(),
|
||||
state.monitored_services
|
||||
);
|
||||
return Ok(services);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("Failed to discover services, using cached list: {}", e);
|
||||
// Continue with existing cached services if discovery fails
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return cached services
|
||||
let state = self.state.read().unwrap();
|
||||
Ok(state.monitored_services.clone())
|
||||
}
|
||||
|
||||
/// Get nginx site metrics, checking them if cache is expired
|
||||
fn get_nginx_site_metrics(&self) -> Vec<Metric> {
|
||||
let mut state = self.state.write().unwrap();
|
||||
|
||||
// Check if we need to refresh nginx site metrics
|
||||
let needs_refresh = match state.last_nginx_check_time {
|
||||
None => true, // First time
|
||||
Some(last_time) => {
|
||||
let elapsed = last_time.elapsed().as_secs();
|
||||
elapsed >= state.nginx_check_interval_seconds
|
||||
}
|
||||
};
|
||||
|
||||
if needs_refresh {
|
||||
// Only check nginx sites if nginx service is active
|
||||
if state.monitored_services.iter().any(|s| s.contains("nginx")) {
|
||||
debug!(
|
||||
"Refreshing nginx site latency metrics (interval: {}s)",
|
||||
state.nginx_check_interval_seconds
|
||||
);
|
||||
let fresh_metrics = self.get_nginx_sites();
|
||||
state.nginx_site_metrics = fresh_metrics;
|
||||
state.last_nginx_check_time = Some(Instant::now());
|
||||
}
|
||||
}
|
||||
|
||||
state.nginx_site_metrics.clone()
|
||||
}
|
||||
|
||||
/// Auto-discover interesting services to monitor (internal version that doesn't update state)
|
||||
fn discover_services_internal(&self) -> Result<(Vec<String>, std::collections::HashMap<String, ServiceStatusInfo>)> {
|
||||
debug!("Starting systemd service discovery with status caching");
|
||||
|
||||
// First: Get all service unit files (includes services that have never been started)
|
||||
let unit_files_output = Command::new("systemctl")
|
||||
.arg("list-unit-files")
|
||||
.arg("--type=service")
|
||||
.arg("--no-pager")
|
||||
.arg("--plain")
|
||||
.output()?;
|
||||
|
||||
if !unit_files_output.status.success() {
|
||||
return Err(anyhow::anyhow!("systemctl list-unit-files command failed"));
|
||||
}
|
||||
|
||||
// Second: Get runtime status of all units
|
||||
let units_status_output = Command::new("systemctl")
|
||||
.arg("list-units")
|
||||
.arg("--type=service")
|
||||
.arg("--all")
|
||||
.arg("--no-pager")
|
||||
.arg("--plain")
|
||||
.output()?;
|
||||
|
||||
if !units_status_output.status.success() {
|
||||
return Err(anyhow::anyhow!("systemctl list-units command failed"));
|
||||
}
|
||||
|
||||
let unit_files_str = String::from_utf8(unit_files_output.stdout)?;
|
||||
let units_status_str = String::from_utf8(units_status_output.stdout)?;
|
||||
let mut services = Vec::new();
|
||||
|
||||
// Use configuration instead of hardcoded values
|
||||
let excluded_services = &self.config.excluded_services;
|
||||
let service_name_filters = &self.config.service_name_filters;
|
||||
|
||||
// Parse all service unit files to get complete service list
|
||||
let mut all_service_names = std::collections::HashSet::new();
|
||||
|
||||
for line in unit_files_str.lines() {
|
||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||
if fields.len() >= 2 && fields[0].ends_with(".service") {
|
||||
let service_name = fields[0].trim_end_matches(".service");
|
||||
all_service_names.insert(service_name.to_string());
|
||||
debug!("Found service unit file: {}", service_name);
|
||||
}
|
||||
}
|
||||
|
||||
// Parse runtime status for all units
|
||||
let mut status_cache = std::collections::HashMap::new();
|
||||
for line in units_status_str.lines() {
|
||||
let fields: Vec<&str> = line.split_whitespace().collect();
|
||||
if fields.len() >= 4 && fields[0].ends_with(".service") {
|
||||
let service_name = fields[0].trim_end_matches(".service");
|
||||
|
||||
// Extract status information from systemctl list-units output
|
||||
let load_state = fields.get(1).unwrap_or(&"unknown").to_string();
|
||||
let active_state = fields.get(2).unwrap_or(&"unknown").to_string();
|
||||
let sub_state = fields.get(3).unwrap_or(&"unknown").to_string();
|
||||
|
||||
// Cache the status information
|
||||
status_cache.insert(service_name.to_string(), ServiceStatusInfo {
|
||||
load_state: load_state.clone(),
|
||||
active_state: active_state.clone(),
|
||||
sub_state: sub_state.clone(),
|
||||
});
|
||||
|
||||
debug!("Got runtime status for service: {} (load:{}, active:{}, sub:{})", service_name, load_state, active_state, sub_state);
|
||||
}
|
||||
}
|
||||
|
||||
// For services found in unit files but not in runtime status, set default inactive status
|
||||
for service_name in &all_service_names {
|
||||
if !status_cache.contains_key(service_name) {
|
||||
status_cache.insert(service_name.to_string(), ServiceStatusInfo {
|
||||
load_state: "not-loaded".to_string(),
|
||||
active_state: "inactive".to_string(),
|
||||
sub_state: "dead".to_string(),
|
||||
});
|
||||
debug!("Service {} found in unit files but not runtime - marked as inactive", service_name);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Now process all discovered services
|
||||
for service_name in &all_service_names {
|
||||
debug!("Processing service: '{}'", service_name);
|
||||
|
||||
// Skip excluded services first
|
||||
let mut is_excluded = false;
|
||||
for excluded in excluded_services {
|
||||
if service_name.contains(excluded) {
|
||||
debug!(
|
||||
"EXCLUDING service '{}' because it matches pattern '{}'",
|
||||
service_name, excluded
|
||||
);
|
||||
is_excluded = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if is_excluded {
|
||||
debug!("Skipping excluded service: '{}'", service_name);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if this service matches our filter patterns (supports wildcards)
|
||||
for pattern in service_name_filters {
|
||||
if self.matches_pattern(service_name, pattern) {
|
||||
debug!(
|
||||
"INCLUDING service '{}' because it matches pattern '{}'",
|
||||
service_name, pattern
|
||||
);
|
||||
services.push(service_name.to_string());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Service discovery completed: found {} matching services: {:?}", services.len(), services);
|
||||
if services.is_empty() {
|
||||
debug!("No services found matching the configured filters - this may indicate a parsing issue");
|
||||
}
|
||||
|
||||
Ok((services, status_cache))
|
||||
}
|
||||
|
||||
/// Check if service name matches pattern (supports wildcards like nginx*)
|
||||
fn matches_pattern(&self, service_name: &str, pattern: &str) -> bool {
|
||||
if pattern.contains('*') {
|
||||
// Wildcard pattern matching
|
||||
if pattern.ends_with('*') {
|
||||
// Pattern like "nginx*" - match if service starts with "nginx"
|
||||
let prefix = &pattern[..pattern.len() - 1];
|
||||
service_name.starts_with(prefix)
|
||||
} else if pattern.starts_with('*') {
|
||||
// Pattern like "*backup" - match if service ends with "backup"
|
||||
let suffix = &pattern[1..];
|
||||
service_name.ends_with(suffix)
|
||||
} else {
|
||||
// Pattern like "nginx*backup" - simple glob matching
|
||||
self.simple_glob_match(service_name, pattern)
|
||||
}
|
||||
} else {
|
||||
// Exact match (existing behavior)
|
||||
service_name == pattern
|
||||
}
|
||||
}
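// Assumed examples of how the wildcard matching above behaves:
//   matches_pattern("nginx",       "nginx*")  -> true
//   matches_pattern("nginx-proxy", "nginx*")  -> true
//   matches_pattern("borgbackup",  "*backup") -> true
//   matches_pattern("sshd",        "nginx*")  -> false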
|
||||
|
||||
/// Simple glob pattern matching for patterns with * in middle
|
||||
fn simple_glob_match(&self, text: &str, pattern: &str) -> bool {
|
||||
let parts: Vec<&str> = pattern.split('*').collect();
|
||||
if parts.is_empty() {
|
||||
return false;
|
||||
}
|
||||
|
||||
let mut pos = 0;
|
||||
for (i, part) in parts.iter().enumerate() {
|
||||
if part.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if i == 0 {
|
||||
// First part must match at start
|
||||
if !text[pos..].starts_with(part) {
|
||||
return false;
|
||||
}
|
||||
pos += part.len();
|
||||
} else if i == parts.len() - 1 {
|
||||
// Last part must match at end
|
||||
return text[pos..].ends_with(part);
|
||||
} else {
|
||||
// Middle part must be found somewhere
|
||||
if let Some(found_pos) = text[pos..].find(part) {
|
||||
pos += found_pos + part.len();
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
true
|
||||
}
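// Assumed example for the middle-wildcard case handled here:
//   simple_glob_match("nginx-site-backup", "nginx*backup") -> true
//   simple_glob_match("nginx-site-check",  "nginx*backup") -> false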

/// Get service status from cache (if available) or fallback to systemctl
fn get_service_status(&self, service: &str) -> Result<(String, String)> {
// Try to get status from cache first
if let Ok(state) = self.state.read() {
if let Some(cached_info) = state.service_status_cache.get(service) {
let active_status = cached_info.active_state.clone();
let detailed_info = format!(
"LoadState={}\nActiveState={}\nSubState={}",
cached_info.load_state,
cached_info.active_state,
cached_info.sub_state
);
return Ok((active_status, detailed_info));
}
}

// Fallback to systemctl if not in cache (shouldn't happen during normal operation)
debug!("Service '{}' not found in cache, falling back to systemctl", service);
let output = Command::new("systemctl")
.arg("is-active")
.arg(format!("{}.service", service))
.output()?;

let active_status = String::from_utf8(output.stdout)?.trim().to_string();

// Get more detailed info
let output = Command::new("systemctl")
.arg("show")
.arg(format!("{}.service", service))
.arg("--property=LoadState,ActiveState,SubState")
.output()?;

let detailed_info = String::from_utf8(output.stdout)?;
Ok((active_status, detailed_info))
}

/// Calculate service status, taking user-stopped services into account
fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
match active_status.to_lowercase().as_str() {
"active" => Status::Ok,
"inactive" | "dead" => {
debug!("Service '{}' is inactive - treating as Inactive status", service_name);
Status::Inactive
},
"failed" | "error" => Status::Critical,
"activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => {
debug!("Service '{}' is transitioning - treating as Pending", service_name);
Status::Pending
},
_ => Status::Unknown,
}
}

/// Get service memory usage (if available)
fn get_service_memory(&self, service: &str) -> Option<f32> {
let output = Command::new("systemctl")
.arg("show")
.arg(format!("{}.service", service))
.arg("--property=MemoryCurrent")
.output()
.ok()?;

let output_str = String::from_utf8(output.stdout).ok()?;
for line in output_str.lines() {
if line.starts_with("MemoryCurrent=") {
let memory_str = line.trim_start_matches("MemoryCurrent=");
if let Ok(memory_bytes) = memory_str.parse::<u64>() {
return Some(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
}
}
}
None
}
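
// Example (assumed systemctl output): a line "MemoryCurrent=104857600" parses to
// 100.0 MB, while "MemoryCurrent=[not set]" fails the u64 parse and yields None.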

/// Get directory size in GB with permission-aware logging
fn get_directory_size(&self, dir: &str) -> Option<f32> {
let output = Command::new("sudo").arg("du").arg("-sb").arg(dir).output().ok()?;

if !output.status.success() {
// Log permission errors for debugging but don't spam logs
let stderr = String::from_utf8_lossy(&output.stderr);
if stderr.contains("Permission denied") {
debug!("Permission denied accessing directory: {}", dir);
} else {
debug!("Failed to get size for directory {}: {}", dir, stderr);
}
return None;
}

let output_str = String::from_utf8(output.stdout).ok()?;
let size_str = output_str.split_whitespace().next()?;
if let Ok(size_bytes) = size_str.parse::<u64>() {
let size_gb = size_bytes as f32 / (1024.0 * 1024.0 * 1024.0);
// Return size even if very small (minimum 0.001 GB = 1MB for visibility)
if size_gb > 0.0 {
Some(size_gb.max(0.001))
} else {
None
}
} else {
None
}
}
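
// Example (hypothetical): `sudo du -sb /var/lib/gitea` printing "6442450944\t/var/lib/gitea"
// parses to 6442450944 bytes, which is reported here as roughly 6.0 GB.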

/// Get service disk usage - simplified and configuration-driven
fn get_service_disk_usage(&self, service: &str) -> Option<f32> {
// 1. Check if service has configured directories (exact match only)
if let Some(dirs) = self.config.service_directories.get(service) {
// Service has configured paths - use the first accessible one
for dir in dirs {
if let Some(size) = self.get_directory_size(dir) {
return Some(size);
}
}
// If configured paths failed, return 0.0 so the entry still shows as 0
return Some(0.0);
}

// 2. No configured path - use systemctl WorkingDirectory
let output = Command::new("systemctl")
.arg("show")
.arg(format!("{}.service", service))
.arg("--property=WorkingDirectory")
.output()
.ok()?;

let output_str = String::from_utf8(output.stdout).ok()?;
for line in output_str.lines() {
if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") {
let dir = line.trim_start_matches("WorkingDirectory=");
if !dir.is_empty() && dir != "/" {
return self.get_directory_size(dir);
}
}
}

None
}

}

#[async_trait]
impl Collector for SystemdCollector {

async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
/// Collect service data and populate AgentData
async fn collect_service_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
let start_time = Instant::now();
debug!("Collecting systemd services metrics");

let mut metrics = Vec::new();

// Get cached services (discovery only happens when needed)
let monitored_services = match self.get_monitored_services() {
Ok(services) => services,
Err(e) => {
debug!("Failed to get monitored services: {}", e);
return Ok(metrics);
}
};

// Collect individual metrics for each monitored service (status, memory, disk only)
for service in &monitored_services {
match self.get_service_status(service) {
Ok((active_status, _detailed_info)) => {
let status = self.calculate_service_status(service, &active_status);

// Individual service status metric
metrics.push(Metric {
name: format!("service_{}_status", service),
value: MetricValue::String(active_status.clone()),
unit: None,
description: Some(format!("Service {} status", service)),
status,
timestamp: chrono::Utc::now().timestamp() as u64,
});

// Service memory usage (if available)
if let Some(memory_mb) = self.get_service_memory(service) {
metrics.push(Metric {
name: format!("service_{}_memory_mb", service),
value: MetricValue::Float(memory_mb),
unit: Some("MB".to_string()),
description: Some(format!("Service {} memory usage", service)),
status: Status::Ok,
timestamp: chrono::Utc::now().timestamp() as u64,
});
}

// Service disk usage (comprehensive detection)
if let Some(disk_gb) = self.get_service_disk_usage(service) {
metrics.push(Metric {
name: format!("service_{}_disk_gb", service),
value: MetricValue::Float(disk_gb),
unit: Some("GB".to_string()),
description: Some(format!("Service {} disk usage", service)),
status: Status::Ok,
timestamp: chrono::Utc::now().timestamp() as u64,
});
}

// Sub-service metrics for specific services
if service.contains("nginx") && active_status == "active" {
metrics.extend(self.get_nginx_site_metrics());
}

if service.contains("docker") && active_status == "active" {
metrics.extend(self.get_docker_containers());
}
}
Err(e) => {
debug!("Failed to get status for service {}: {}", service, e);
}
}
// Get systemd services status
let services = self.get_systemd_services().await?;

// Update cached state
{
let mut state = self.state.write().unwrap();
state.last_collection = Some(start_time);
state.services = services.clone();
}

let collection_time = start_time.elapsed();
debug!(
"Systemd collection completed in {:?} with {} individual service metrics",
collection_time,
metrics.len()
);

Ok(metrics)
}

}

impl SystemdCollector {
/// Get nginx sites with latency checks
fn get_nginx_sites(&self) -> Vec<Metric> {
let mut metrics = Vec::new();
let timestamp = chrono::Utc::now().timestamp() as u64;

// Discover nginx sites from configuration
let sites = self.discover_nginx_sites();

for (site_name, url) in &sites {
match self.check_site_latency(url) {
Ok(latency_ms) => {
let status = if latency_ms < self.config.nginx_latency_critical_ms {
Status::Ok
} else {
Status::Critical
};

metrics.push(Metric {
name: format!("service_nginx_{}_latency_ms", site_name),
value: MetricValue::Float(latency_ms),
unit: Some("ms".to_string()),
description: Some(format!("Response time for {}", url)),
status,
timestamp,
});
}
Err(_) => {
// Site is unreachable
metrics.push(Metric {
name: format!("service_nginx_{}_latency_ms", site_name),
value: MetricValue::Float(-1.0), // Use -1 to indicate error
unit: Some("ms".to_string()),
description: Some(format!("Response time for {} (unreachable)", url)),
status: Status::Critical,
timestamp,
});
}
}
// Populate AgentData with service information
for service in services {
agent_data.services.push(ServiceData {
name: service.name,
status: service.status,
memory_mb: service.memory_mb,
disk_gb: service.disk_gb,
user_stopped: false, // TODO: Integrate with service tracker
});
}

metrics
let elapsed = start_time.elapsed();
debug!("Systemd collection completed in {:?} with {} services", elapsed, agent_data.services.len());

Ok(())
}

/// Get docker containers as sub-services
fn get_docker_containers(&self) -> Vec<Metric> {
let mut metrics = Vec::new();
let timestamp = chrono::Utc::now().timestamp() as u64;
/// Get systemd services information
async fn get_systemd_services(&self) -> Result<Vec<ServiceInfo>, CollectorError> {
let mut services = Vec::new();

// Check if docker is available
let output = Command::new("docker")
.arg("ps")
.arg("--format")
.arg("{{.Names}},{{.Status}}")
.output();
// Get basic service status from systemctl
let status_output = Command::new("systemctl")
.args(&["list-units", "--type=service", "--no-pager", "--plain"])
.output()
.map_err(|e| CollectorError::SystemRead {
path: "systemctl list-units".to_string(),
error: e.to_string(),
})?;

let output = match output {
Ok(out) if out.status.success() => out,
_ => return metrics, // Docker not available or failed
};

let output_str = match String::from_utf8(output.stdout) {
Ok(s) => s,
Err(_) => return metrics,
};

for line in output_str.lines() {
if line.trim().is_empty() {
let status_str = String::from_utf8_lossy(&status_output.stdout);

// Parse service status
for line in status_str.lines() {
if line.trim().is_empty() || line.contains("UNIT") {
continue;
}

let parts: Vec<&str> = line.split(',').collect();
if parts.len() >= 2 {
let container_name = parts[0].trim();
let status_str = parts[1].trim();
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() >= 4 {
let service_name = parts[0].trim_end_matches(".service");
let load_state = parts[1];
let active_state = parts[2];
let sub_state = parts[3];

let status = if status_str.contains("Up") {
Status::Ok
} else if status_str.contains("Exited") {
Status::Warning
} else {
Status::Critical
};
// Skip if not loaded
if load_state != "loaded" {
continue;
}

metrics.push(Metric {
name: format!("service_docker_{}_status", container_name),
value: MetricValue::String(status_str.to_string()),
unit: None,
description: Some(format!("Docker container {} status", container_name)),
status,
timestamp,
});
}
}
// Filter services based on configuration
if self.config.service_name_filters.is_empty() || self.config.service_name_filters.contains(&service_name.to_string()) {
// Get memory usage for this service
let memory_mb = self.get_service_memory_usage(service_name).await.unwrap_or(0.0);

let service_info = ServiceInfo {
name: service_name.to_string(),
status: self.normalize_service_status(active_state, sub_state),
memory_mb,
disk_gb: 0.0, // Services typically don't have disk usage
};

metrics
}

/// Check site latency using HTTP GET requests
fn check_site_latency(&self, url: &str) -> Result<f32, Box<dyn std::error::Error>> {
use std::time::Duration;
use std::time::Instant;

let start = Instant::now();

// Create HTTP client with timeouts from configuration
let client = reqwest::blocking::Client::builder()
.timeout(Duration::from_secs(self.config.http_timeout_seconds))
.connect_timeout(Duration::from_secs(self.config.http_connect_timeout_seconds))
.redirect(reqwest::redirect::Policy::limited(10))
.build()?;

// Make GET request and measure latency
let response = client.get(url).send()?;
let latency = start.elapsed().as_millis() as f32;

// Check if response is successful (2xx or 3xx status codes)
if response.status().is_success() || response.status().is_redirection() {
Ok(latency)
} else {
Err(format!(
"HTTP request failed for {} with status: {}",
url,
response.status()
)
.into())
}
}

/// Discover nginx sites from configuration files (like the old working implementation)
fn discover_nginx_sites(&self) -> Vec<(String, String)> {
use tracing::debug;

// Use the same approach as the old working agent: get nginx config from systemd
let config_content = match self.get_nginx_config_from_systemd() {
Some(content) => content,
None => {
debug!("Could not get nginx config from systemd, trying nginx -T fallback");
match self.get_nginx_config_via_command() {
Some(content) => content,
None => {
debug!("Could not get nginx config via any method");
return Vec::new();
}
services.push(service_info);
}
}
};
}

// Parse the config content to extract sites
self.parse_nginx_config_for_sites(&config_content)
Ok(services)
}

/// Get nginx config from systemd service definition (NixOS compatible)
fn get_nginx_config_from_systemd(&self) -> Option<String> {
use tracing::debug;

let output = std::process::Command::new("systemctl")
.args(["show", "nginx", "--property=ExecStart", "--no-pager"])
/// Get memory usage for a specific service
async fn get_service_memory_usage(&self, service_name: &str) -> Result<f32, CollectorError> {
let output = Command::new("systemctl")
.args(&["show", &format!("{}.service", service_name), "--property=MemoryCurrent"])
.output()
.ok()?;
.map_err(|e| CollectorError::SystemRead {
path: format!("memory usage for {}", service_name),
error: e.to_string(),
})?;

if !output.status.success() {
debug!("Failed to get nginx ExecStart from systemd");
return None;
}

let stdout = String::from_utf8_lossy(&output.stdout);
debug!("systemctl show nginx output: {}", stdout);

// Parse ExecStart to extract -c config path
for line in stdout.lines() {
if line.starts_with("ExecStart=") {
debug!("Found ExecStart line: {}", line);
// Handle both traditional and NixOS systemd formats
if let Some(config_path) = self.extract_config_path_from_exec_start(line) {
debug!("Extracted config path: {}", config_path);
// Read the config file
return std::fs::read_to_string(&config_path)
.map_err(|e| debug!("Failed to read config file {}: {}", config_path, e))
.ok();
}
}
}

None
}

/// Extract config path from ExecStart line
fn extract_config_path_from_exec_start(&self, exec_start: &str) -> Option<String> {
use tracing::debug;

// Remove ExecStart= prefix
let exec_part = exec_start.strip_prefix("ExecStart=")?;
debug!("Parsing exec part: {}", exec_part);

// Handle NixOS format: ExecStart={ path=...; argv[]=...nginx -c /config; ... }
if exec_part.contains("argv[]=") {
// Extract the part after argv[]=
let argv_start = exec_part.find("argv[]=")?;
let argv_part = &exec_part[argv_start + 7..]; // Skip "argv[]="
debug!("Found NixOS argv part: {}", argv_part);

// Look for -c flag followed by config path
if let Some(c_pos) = argv_part.find(" -c ") {
let after_c = &argv_part[c_pos + 4..];
// Find the config path (until next space or semicolon)
let config_path = after_c.split([' ', ';']).next()?;
return Some(config_path.to_string());
}
} else {
// Handle traditional format: ExecStart=/path/nginx -c /config
debug!("Parsing traditional format");
if let Some(c_pos) = exec_part.find(" -c ") {
let after_c = &exec_part[c_pos + 4..];
let config_path = after_c.split_whitespace().next()?;
return Some(config_path.to_string());
}
}

None
}
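
// Illustrative NixOS-style input (paths elided): a line such as
// ExecStart={ path=/nix/store/...-nginx/bin/nginx ; argv[]=/nix/store/...-nginx/bin/nginx -c /nix/store/...-nginx.conf ; ... }
// would yield the token following " -c ", cut at the next space or ';'.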

/// Fallback: get nginx config via nginx -T command
fn get_nginx_config_via_command(&self) -> Option<String> {
use tracing::debug;

let output = std::process::Command::new("nginx")
.args(["-T"])
.output()
.ok()?;

if !output.status.success() {
debug!("nginx -T failed");
return None;
}

Some(String::from_utf8_lossy(&output.stdout).to_string())
}

/// Parse nginx config content to extract server names and build site list
fn parse_nginx_config_for_sites(&self, config_content: &str) -> Vec<(String, String)> {
use tracing::debug;
let mut sites = Vec::new();
let lines: Vec<&str> = config_content.lines().collect();
let mut i = 0;

debug!("Parsing nginx config with {} lines", lines.len());

while i < lines.len() {
let line = lines[i].trim();
if line.starts_with("server") && line.contains("{") {
if let Some(server_name) = self.parse_server_block(&lines, &mut i) {
let url = format!("https://{}", server_name);
sites.push((server_name.clone(), url));
}
}
i += 1;
}

debug!("Discovered {} nginx sites total", sites.len());
sites
}
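
// Illustrative (assumed nginx config): a block like
//   server { server_name example.org www.example.org; ... }
// becomes the site ("example.org", "https://example.org"); the first qualifying name wins.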

/// Parse a server block to extract the primary server_name
fn parse_server_block(&self, lines: &[&str], start_index: &mut usize) -> Option<String> {
use tracing::debug;
let mut server_names = Vec::new();
let mut has_redirect = false;
let mut i = *start_index + 1;
let mut brace_count = 1;

// Parse until we close the server block
while i < lines.len() && brace_count > 0 {
let trimmed = lines[i].trim();

// Track braces
brace_count += trimmed.matches('{').count();
brace_count -= trimmed.matches('}').count();

// Extract server_name
if trimmed.starts_with("server_name") {
if let Some(names_part) = trimmed.strip_prefix("server_name") {
let names_clean = names_part.trim().trim_end_matches(';');
for name in names_clean.split_whitespace() {
if name != "_"
&& !name.is_empty()
&& name.contains('.')
&& !name.starts_with('$')
{
server_names.push(name.to_string());
debug!("Found server_name in block: {}", name);
let output_str = String::from_utf8_lossy(&output.stdout);

for line in output_str.lines() {
if line.starts_with("MemoryCurrent=") {
if let Some(mem_str) = line.strip_prefix("MemoryCurrent=") {
if mem_str != "[not set]" {
if let Ok(memory_bytes) = mem_str.parse::<u64>() {
return Ok(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
}
}
}
}
}

Ok(0.0)
}

// Check for redirects (skip redirect-only servers)
if trimmed.contains("return") && (trimmed.contains("301") || trimmed.contains("302")) {
has_redirect = true;
/// Normalize service status to standard values
fn normalize_service_status(&self, active_state: &str, sub_state: &str) -> String {
match (active_state, sub_state) {
("active", "running") => "active".to_string(),
("active", _) => "active".to_string(),
("inactive", "dead") => "inactive".to_string(),
("inactive", _) => "inactive".to_string(),
("failed", _) => "failed".to_string(),
("activating", _) => "starting".to_string(),
("deactivating", _) => "stopping".to_string(),
_ => format!("{}:{}", active_state, sub_state),
}
}
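
// For example, ("active", "running") and ("active", "exited") both normalize to "active",
// ("activating", _) becomes "starting", and an unrecognised pair falls back to "active_state:sub_state".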

/// Check if service collection cache should be updated
fn should_update_cache(&self) -> bool {
let state = self.state.read().unwrap();

match state.last_collection {
None => true,
Some(last) => {
let cache_duration = std::time::Duration::from_secs(30);
last.elapsed() > cache_duration
}

i += 1;
}
}

*start_index = i - 1;

if !server_names.is_empty() && !has_redirect {
return Some(server_names[0].clone());

/// Get cached service data if available and fresh
fn get_cached_services(&self) -> Option<Vec<ServiceInfo>> {
if !self.should_update_cache() {
let state = self.state.read().unwrap();
Some(state.services.clone())
} else {
None
}

None
}
}
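
// Net effect: collect_structured (below) reuses the cached service list for up to ~30 seconds
// (see should_update_cache) before re-querying systemctl for fresh data.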

#[async_trait]
impl Collector for SystemdCollector {
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
// Use cached data if available and fresh
if let Some(cached_services) = self.get_cached_services() {
debug!("Using cached systemd services data");
for service in cached_services {
agent_data.services.push(ServiceData {
name: service.name,
status: service.status,
memory_mb: service.memory_mb,
disk_gb: service.disk_gb,
user_stopped: false, // TODO: Integrate with service tracker
});
}
Ok(())
} else {
// Collect fresh data
self.collect_service_data(agent_data).await
}
}
}