All checks were successful
Build and Release / build-and-release (push) Successful in 1m15s
- Add disk wear percentage collection from SMART data in backup script - Add backup_disk_wear_percent metric to backup collector with thresholds - Display wear percentage in backup widget disk section - Fix storage section overflow handling to use consistent "X more below" logic - Update maintenance mode to return pending status instead of unknown
481 lines
18 KiB
Rust
481 lines
18 KiB
Rust
use async_trait::async_trait;
|
|
use chrono::Utc;
|
|
use cm_dashboard_shared::{Metric, MetricValue, Status, StatusTracker};
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::HashMap;
|
|
use tokio::fs;
|
|
|
|
use super::{Collector, CollectorError};
|
|
use tracing::error;
|
|
|
|
/// Backup collector that reads TOML status files for borgbackup metrics
#[derive(Debug, Clone)]
pub struct BackupCollector {
    /// Path to the TOML status file written by the backup script
    /// (defaults to `/var/lib/backup/backup-status.toml` in `new`).
    pub backup_status_file: String,
    /// Maximum age of the last successful backup, in hours, before the
    /// overall status is downgraded to Warning.
    pub max_age_hours: u64,
}
|
|
|
|
impl BackupCollector {
|
|
pub fn new(backup_status_file: Option<String>, max_age_hours: u64) -> Self {
|
|
Self {
|
|
backup_status_file: backup_status_file
|
|
.unwrap_or_else(|| "/var/lib/backup/backup-status.toml".to_string()),
|
|
max_age_hours,
|
|
}
|
|
}
|
|
|
|
async fn read_backup_status(&self) -> Result<Option<BackupStatusToml>, CollectorError> {
|
|
// Check if we're in maintenance mode
|
|
if std::fs::metadata("/tmp/cm-maintenance").is_ok() {
|
|
// Return special maintenance mode status
|
|
let maintenance_status = BackupStatusToml {
|
|
backup_name: "maintenance".to_string(),
|
|
start_time: chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC").to_string(),
|
|
current_time: chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC").to_string(),
|
|
duration_seconds: 0,
|
|
status: "pending".to_string(),
|
|
last_updated: chrono::Utc::now().format("%Y-%m-%d %H:%M:%S UTC").to_string(),
|
|
disk_space: None,
|
|
disk_product_name: None,
|
|
disk_serial_number: None,
|
|
disk_wear_percent: None,
|
|
services: HashMap::new(),
|
|
};
|
|
return Ok(Some(maintenance_status));
|
|
}
|
|
|
|
// Check if backup status file exists
|
|
if !std::path::Path::new(&self.backup_status_file).exists() {
|
|
return Ok(None); // File doesn't exist, but this is not an error
|
|
}
|
|
|
|
let content = fs::read_to_string(&self.backup_status_file)
|
|
.await
|
|
.map_err(|e| CollectorError::SystemRead {
|
|
path: self.backup_status_file.clone(),
|
|
error: e.to_string(),
|
|
})?;
|
|
|
|
let backup_status = toml::from_str(&content).map_err(|e| CollectorError::Parse {
|
|
value: "backup status TOML".to_string(),
|
|
error: e.to_string(),
|
|
})?;
|
|
|
|
Ok(Some(backup_status))
|
|
}
|
|
|
|
fn calculate_backup_status(&self, backup_status: &BackupStatusToml) -> Status {
|
|
// Parse the start time to check age - handle both RFC3339 and local timestamp formats
|
|
let start_time = match chrono::DateTime::parse_from_rfc3339(&backup_status.start_time) {
|
|
Ok(dt) => dt.with_timezone(&Utc),
|
|
Err(_) => {
|
|
// Try parsing as naive datetime and assume UTC
|
|
match chrono::NaiveDateTime::parse_from_str(
|
|
&backup_status.start_time,
|
|
"%Y-%m-%dT%H:%M:%S%.f",
|
|
) {
|
|
Ok(naive_dt) => naive_dt.and_utc(),
|
|
Err(_) => {
|
|
error!(
|
|
"Failed to parse backup timestamp: {}",
|
|
backup_status.start_time
|
|
);
|
|
return Status::Unknown;
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
let hours_since_backup = Utc::now().signed_duration_since(start_time).num_hours();
|
|
|
|
// Check overall backup status
|
|
match backup_status.status.as_str() {
|
|
"success" => {
|
|
if hours_since_backup > self.max_age_hours as i64 {
|
|
Status::Warning // Backup too old
|
|
} else {
|
|
Status::Ok
|
|
}
|
|
}
|
|
"failed" => Status::Critical,
|
|
"warning" => Status::Warning, // Backup completed with warnings
|
|
"running" => Status::Ok, // Currently running is OK
|
|
"pending" => Status::Pending, // Maintenance mode or backup starting
|
|
_ => Status::Unknown,
|
|
}
|
|
}
|
|
|
|
fn calculate_service_status(&self, service: &ServiceStatus) -> Status {
|
|
match service.status.as_str() {
|
|
"completed" => {
|
|
if service.exit_code == 0 {
|
|
Status::Ok
|
|
} else {
|
|
Status::Critical
|
|
}
|
|
}
|
|
"failed" => Status::Critical,
|
|
"disabled" => Status::Warning, // Service intentionally disabled
|
|
"running" => Status::Ok,
|
|
_ => Status::Unknown,
|
|
}
|
|
}
|
|
|
|
fn bytes_to_gb(bytes: u64) -> f32 {
|
|
bytes as f32 / (1024.0 * 1024.0 * 1024.0)
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Collector for BackupCollector {
|
|
|
|
async fn collect(&self, _status_tracker: &mut StatusTracker) -> Result<Vec<Metric>, CollectorError> {
|
|
let backup_status_option = self.read_backup_status().await?;
|
|
let mut metrics = Vec::new();
|
|
let timestamp = chrono::Utc::now().timestamp() as u64;
|
|
|
|
// If no backup status file exists, return minimal metrics indicating no backup system
|
|
let backup_status = match backup_status_option {
|
|
Some(status) => status,
|
|
None => {
|
|
// No backup system configured - return minimal "unknown" metrics
|
|
metrics.push(Metric {
|
|
name: "backup_overall_status".to_string(),
|
|
value: MetricValue::String("no_backup_system".to_string()),
|
|
status: Status::Unknown,
|
|
timestamp,
|
|
description: Some("No backup system configured (no status file found)".to_string()),
|
|
unit: None,
|
|
});
|
|
return Ok(metrics);
|
|
}
|
|
};
|
|
|
|
// Overall backup status
|
|
let overall_status = self.calculate_backup_status(&backup_status);
|
|
metrics.push(Metric {
|
|
name: "backup_overall_status".to_string(),
|
|
value: MetricValue::String(match overall_status {
|
|
Status::Ok => "ok".to_string(),
|
|
Status::Inactive => "inactive".to_string(),
|
|
Status::Pending => "pending".to_string(),
|
|
Status::Warning => "warning".to_string(),
|
|
Status::Critical => "critical".to_string(),
|
|
Status::Unknown => "unknown".to_string(),
|
|
Status::Offline => "offline".to_string(),
|
|
}),
|
|
status: overall_status,
|
|
timestamp,
|
|
description: Some(format!(
|
|
"Backup: {} at {}",
|
|
backup_status.status, backup_status.start_time
|
|
)),
|
|
unit: None,
|
|
});
|
|
|
|
// Backup duration
|
|
metrics.push(Metric {
|
|
name: "backup_duration_seconds".to_string(),
|
|
value: MetricValue::Integer(backup_status.duration_seconds),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Duration of last backup run".to_string()),
|
|
unit: Some("seconds".to_string()),
|
|
});
|
|
|
|
// Last backup timestamp - use last_updated (when backup finished) instead of start_time
|
|
let last_updated_dt_result =
|
|
chrono::DateTime::parse_from_rfc3339(&backup_status.last_updated)
|
|
.map(|dt| dt.with_timezone(&Utc))
|
|
.or_else(|_| {
|
|
// Try parsing as naive datetime and assume UTC
|
|
chrono::NaiveDateTime::parse_from_str(
|
|
&backup_status.last_updated,
|
|
"%Y-%m-%dT%H:%M:%S%.f",
|
|
)
|
|
.map(|naive_dt| naive_dt.and_utc())
|
|
});
|
|
|
|
if let Ok(last_updated_dt) = last_updated_dt_result {
|
|
metrics.push(Metric {
|
|
name: "backup_last_run_timestamp".to_string(),
|
|
value: MetricValue::Integer(last_updated_dt.timestamp()),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Timestamp of last backup completion".to_string()),
|
|
unit: Some("unix_timestamp".to_string()),
|
|
});
|
|
} else {
|
|
error!(
|
|
"Failed to parse backup timestamp for last_run_timestamp: {}",
|
|
backup_status.last_updated
|
|
);
|
|
}
|
|
|
|
// Individual service metrics
|
|
for (service_name, service) in &backup_status.services {
|
|
let service_status = self.calculate_service_status(service);
|
|
|
|
// Service status
|
|
metrics.push(Metric {
|
|
name: format!("backup_service_{}_status", service_name),
|
|
value: MetricValue::String(match service_status {
|
|
Status::Ok => "ok".to_string(),
|
|
Status::Inactive => "inactive".to_string(),
|
|
Status::Pending => "pending".to_string(),
|
|
Status::Warning => "warning".to_string(),
|
|
Status::Critical => "critical".to_string(),
|
|
Status::Unknown => "unknown".to_string(),
|
|
Status::Offline => "offline".to_string(),
|
|
}),
|
|
status: service_status,
|
|
timestamp,
|
|
description: Some(format!(
|
|
"Backup service {} status: {}",
|
|
service_name, service.status
|
|
)),
|
|
unit: None,
|
|
});
|
|
|
|
// Service exit code
|
|
metrics.push(Metric {
|
|
name: format!("backup_service_{}_exit_code", service_name),
|
|
value: MetricValue::Integer(service.exit_code),
|
|
status: if service.exit_code == 0 {
|
|
Status::Ok
|
|
} else {
|
|
Status::Critical
|
|
},
|
|
timestamp,
|
|
description: Some(format!("Exit code for backup service {}", service_name)),
|
|
unit: None,
|
|
});
|
|
|
|
// Repository archive count
|
|
metrics.push(Metric {
|
|
name: format!("backup_service_{}_archive_count", service_name),
|
|
value: MetricValue::Integer(service.archive_count),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some(format!("Number of archives in {} repository", service_name)),
|
|
unit: Some("archives".to_string()),
|
|
});
|
|
|
|
// Repository size in GB
|
|
let repo_size_gb = Self::bytes_to_gb(service.repo_size_bytes);
|
|
metrics.push(Metric {
|
|
name: format!("backup_service_{}_repo_size_gb", service_name),
|
|
value: MetricValue::Float(repo_size_gb),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some(format!("Repository size for {} in GB", service_name)),
|
|
unit: Some("GB".to_string()),
|
|
});
|
|
|
|
// Repository path for reference
|
|
metrics.push(Metric {
|
|
name: format!("backup_service_{}_repo_path", service_name),
|
|
value: MetricValue::String(service.repo_path.clone()),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some(format!("Repository path for {}", service_name)),
|
|
unit: None,
|
|
});
|
|
}
|
|
|
|
// Total number of services
|
|
metrics.push(Metric {
|
|
name: "backup_total_services".to_string(),
|
|
value: MetricValue::Integer(backup_status.services.len() as i64),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Total number of backup services".to_string()),
|
|
unit: Some("services".to_string()),
|
|
});
|
|
|
|
// Calculate total repository size
|
|
let total_size_bytes: u64 = backup_status
|
|
.services
|
|
.values()
|
|
.map(|s| s.repo_size_bytes)
|
|
.sum();
|
|
let total_size_gb = Self::bytes_to_gb(total_size_bytes);
|
|
metrics.push(Metric {
|
|
name: "backup_total_repo_size_gb".to_string(),
|
|
value: MetricValue::Float(total_size_gb),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Total size of all backup repositories".to_string()),
|
|
unit: Some("GB".to_string()),
|
|
});
|
|
|
|
// Disk space metrics for backup directory
|
|
if let Some(ref disk_space) = backup_status.disk_space {
|
|
metrics.push(Metric {
|
|
name: "backup_disk_total_gb".to_string(),
|
|
value: MetricValue::Float(disk_space.total_gb as f32),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Total disk space available for backups".to_string()),
|
|
unit: Some("GB".to_string()),
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: "backup_disk_used_gb".to_string(),
|
|
value: MetricValue::Float(disk_space.used_gb as f32),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Used disk space on backup drive".to_string()),
|
|
unit: Some("GB".to_string()),
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: "backup_disk_available_gb".to_string(),
|
|
value: MetricValue::Float(disk_space.available_gb as f32),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Available disk space on backup drive".to_string()),
|
|
unit: Some("GB".to_string()),
|
|
});
|
|
|
|
metrics.push(Metric {
|
|
name: "backup_disk_usage_percent".to_string(),
|
|
value: MetricValue::Float(disk_space.usage_percent as f32),
|
|
status: if disk_space.usage_percent >= 95.0 {
|
|
Status::Critical
|
|
} else if disk_space.usage_percent >= 85.0 {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
},
|
|
timestamp,
|
|
description: Some("Backup disk usage percentage".to_string()),
|
|
unit: Some("percent".to_string()),
|
|
});
|
|
|
|
// Add disk identification metrics if available from disk_space
|
|
if let Some(ref product_name) = disk_space.product_name {
|
|
metrics.push(Metric {
|
|
name: "backup_disk_product_name".to_string(),
|
|
value: MetricValue::String(product_name.clone()),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Backup disk product name from SMART data".to_string()),
|
|
unit: None,
|
|
});
|
|
}
|
|
|
|
if let Some(ref serial_number) = disk_space.serial_number {
|
|
metrics.push(Metric {
|
|
name: "backup_disk_serial_number".to_string(),
|
|
value: MetricValue::String(serial_number.clone()),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Backup disk serial number from SMART data".to_string()),
|
|
unit: None,
|
|
});
|
|
}
|
|
}
|
|
|
|
// Add standalone disk identification metrics from TOML fields
|
|
if let Some(ref product_name) = backup_status.disk_product_name {
|
|
metrics.push(Metric {
|
|
name: "backup_disk_product_name".to_string(),
|
|
value: MetricValue::String(product_name.clone()),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Backup disk product name from SMART data".to_string()),
|
|
unit: None,
|
|
});
|
|
}
|
|
|
|
if let Some(ref serial_number) = backup_status.disk_serial_number {
|
|
metrics.push(Metric {
|
|
name: "backup_disk_serial_number".to_string(),
|
|
value: MetricValue::String(serial_number.clone()),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some("Backup disk serial number from SMART data".to_string()),
|
|
unit: None,
|
|
});
|
|
}
|
|
|
|
if let Some(wear_percent) = backup_status.disk_wear_percent {
|
|
let wear_status = if wear_percent >= 90.0 {
|
|
Status::Critical
|
|
} else if wear_percent >= 75.0 {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
};
|
|
|
|
metrics.push(Metric {
|
|
name: "backup_disk_wear_percent".to_string(),
|
|
value: MetricValue::Float(wear_percent),
|
|
status: wear_status,
|
|
timestamp,
|
|
description: Some("Backup disk wear percentage from SMART data".to_string()),
|
|
unit: Some("percent".to_string()),
|
|
});
|
|
}
|
|
|
|
// Count services by status
|
|
let mut status_counts = HashMap::new();
|
|
for service in backup_status.services.values() {
|
|
*status_counts.entry(service.status.clone()).or_insert(0) += 1;
|
|
}
|
|
|
|
for (status_name, count) in status_counts {
|
|
metrics.push(Metric {
|
|
name: format!("backup_services_{}_count", status_name),
|
|
value: MetricValue::Integer(count),
|
|
status: Status::Ok,
|
|
timestamp,
|
|
description: Some(format!("Number of services with status: {}", status_name)),
|
|
unit: Some("services".to_string()),
|
|
});
|
|
}
|
|
|
|
Ok(metrics)
|
|
}
|
|
}
|
|
|
|
/// TOML structure for backup status file
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct BackupStatusToml {
    /// Name of the backup run ("maintenance" for the synthetic
    /// maintenance-mode status).
    pub backup_name: String,
    /// When the backup started; RFC 3339 or naive `%Y-%m-%dT%H:%M:%S%.f`
    /// are the formats the collector parses.
    pub start_time: String,
    /// Timestamp at which the status file was written.
    pub current_time: String,
    /// Duration of the last backup run, in seconds.
    pub duration_seconds: i64,
    /// Overall status string: "success", "failed", "warning", "running",
    /// or "pending" (anything else maps to Unknown).
    pub status: String,
    /// When the backup finished; used for the last-run-timestamp metric.
    pub last_updated: String,
    /// Disk usage details for the backup drive, if collected.
    pub disk_space: Option<DiskSpace>,
    /// Standalone disk product name from SMART data (alternative to the
    /// field nested in `disk_space`).
    pub disk_product_name: Option<String>,
    /// Standalone disk serial number from SMART data.
    pub disk_serial_number: Option<String>,
    /// Disk wear percentage from SMART data (0-100).
    pub disk_wear_percent: Option<f32>,
    /// Per-service backup results, keyed by service name.
    pub services: HashMap<String, ServiceStatus>,
}
|
|
|
|
/// Disk usage details for the backup drive, as written by the backup script.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct DiskSpace {
    /// Total capacity in bytes.
    pub total_bytes: u64,
    /// Used space in bytes.
    pub used_bytes: u64,
    /// Available space in bytes.
    pub available_bytes: u64,
    /// Total capacity in GB (precomputed by the script).
    pub total_gb: f64,
    /// Used space in GB.
    pub used_gb: f64,
    /// Available space in GB.
    pub available_gb: f64,
    /// Usage as a percentage (0-100); drives the Warning/Critical thresholds.
    pub usage_percent: f64,
    // Optional disk identification fields
    /// Disk product name from SMART data.
    pub product_name: Option<String>,
    /// Disk serial number from SMART data.
    pub serial_number: Option<String>,
}
|
|
|
|
/// Per-service backup result from the status TOML file.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ServiceStatus {
    /// Service status string: "completed", "failed", "disabled", or
    /// "running" (anything else maps to Unknown).
    pub status: String,
    /// Process exit code; non-zero marks a "completed" service as Critical.
    pub exit_code: i64,
    /// Filesystem path of the borg repository.
    pub repo_path: String,
    /// Number of archives in the repository.
    pub archive_count: i64,
    /// Repository size in bytes.
    pub repo_size_bytes: u64,
}
|