Christoffer Martinsson 2581435b10 Implement per-service disk usage monitoring
Replaced system-wide disk usage with accurate per-service tracking by scanning
service-specific directories. Services like sshd now correctly show minimal
disk usage instead of misleading system totals.

- Rename storage widget and add drive capacity/usage columns
- Move host display to main dashboard title for cleaner layout
- Replace separate alert displays with color-coded row highlighting
- Add per-service disk usage collection using du command (see the sketch below)
- Update services widget formatting to handle small disk values
- Restructure into workspace with dedicated agent and dashboard packages
2025-10-11 22:59:16 +02:00
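The per-service collection itself lives in the agent package rather than in the backup collector below, but a minimal, hypothetical sketch of the du-based approach might look like the following (the service_disk_usage_bytes helper and its directory arguments are illustrative assumptions, not the shipped code):

use std::process::Stdio;
use tokio::process::Command;

/// Hypothetical helper: sums `du -sb` over a service's data directories.
async fn service_disk_usage_bytes(dirs: &[&str]) -> std::io::Result<u64> {
    let mut total = 0u64;
    for &dir in dirs {
        // GNU `du -sb` prints "<bytes>\t<path>" for the directory subtree.
        let output = Command::new("du")
            .args(["-sb", dir])
            .stdout(Stdio::piped())
            .stderr(Stdio::null())
            .output()
            .await?;
        if output.status.success() {
            let stdout = String::from_utf8_lossy(&output.stdout);
            if let Some(bytes) = stdout.split_whitespace().next() {
                total += bytes.parse::<u64>().unwrap_or(0);
            }
        }
    }
    Ok(total)
}

Summing raw bytes (e.g. service_disk_usage_bytes(&["/var/lib/postgresql"]).await) leaves unit formatting to the dashboard, which is what the small-value formatting change in the services widget accounts for.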


use async_trait::async_trait;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use serde_json::json;
use std::process::Stdio;
use std::time::Duration;
use tokio::process::Command;
use tokio::time::timeout;
use super::{AgentType, Collector, CollectorError, CollectorOutput};
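/// Collects backup health from a restic repository and the systemd unit that
/// drives it: snapshot recency, repository size, service state, and recent
/// failures pulled from the journal.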
#[derive(Debug, Clone)]
pub struct BackupCollector {
pub enabled: bool,
pub interval: Duration,
pub restic_repo: Option<String>,
pub backup_service: String,
pub timeout_ms: u64,
}
impl BackupCollector {
pub fn new(
enabled: bool,
interval_ms: u64,
restic_repo: Option<String>,
backup_service: String,
) -> Self {
Self {
enabled,
interval: Duration::from_millis(interval_ms),
restic_repo,
backup_service,
timeout_ms: 30000, // 30 second timeout for backup operations
}
}
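    /// Runs `restic snapshots --json` and `restic stats --json` against the
    /// configured repository and condenses both into a single `ResticStats`.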
async fn get_restic_snapshots(&self) -> Result<ResticStats, CollectorError> {
let repo = self
.restic_repo
.as_ref()
.ok_or_else(|| CollectorError::ConfigError {
message: "No restic repository configured".to_string(),
})?;
let timeout_duration = Duration::from_millis(self.timeout_ms);
// Get restic snapshots
let output = timeout(
timeout_duration,
Command::new("restic")
.args(["-r", repo, "snapshots", "--json"])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output(),
)
.await
.map_err(|_| CollectorError::Timeout {
duration_ms: self.timeout_ms,
})?
.map_err(|e| CollectorError::CommandFailed {
command: format!("restic -r {} snapshots --json", repo),
message: e.to_string(),
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(CollectorError::CommandFailed {
command: format!("restic -r {} snapshots --json", repo),
message: stderr.to_string(),
});
}
let stdout = String::from_utf8_lossy(&output.stdout);
let snapshots: Vec<ResticSnapshot> =
serde_json::from_str(&stdout).map_err(|e| CollectorError::ParseError {
message: format!("Failed to parse restic snapshots: {}", e),
})?;
// Get repository stats
let stats_output = timeout(
timeout_duration,
Command::new("restic")
.args(["-r", repo, "stats", "--json"])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output(),
)
.await
.map_err(|_| CollectorError::Timeout {
duration_ms: self.timeout_ms,
})?
.map_err(|e| CollectorError::CommandFailed {
command: format!("restic -r {} stats --json", repo),
message: e.to_string(),
})?;
        let repo_size_bytes = if stats_output.status.success() {
            let stats_stdout = String::from_utf8_lossy(&stats_output.stdout);
            // `restic stats --json` reports repository totals only (no snapshot
            // count or timestamps), so parse it into the dedicated
            // `ResticRepoStats` struct instead of reusing `ResticStats`.
            serde_json::from_str::<ResticRepoStats>(&stats_stdout)
                .ok()
                .map(|s| s.total_size)
                .unwrap_or(0)
        } else {
            0
        };
        // Find the most recent snapshot
        let last_success = snapshots.iter().map(|s| s.time).max();
        Ok(ResticStats {
            total_size: repo_size_bytes,
            snapshot_count: snapshots.len() as u32,
            last_success,
        })
}
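    /// Queries systemd for the backup unit's current state. `ActiveState=active`
    /// reflects whether the unit is running right now, which for timer-driven
    /// oneshot services is typically only during a backup run.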
async fn get_backup_service_status(&self) -> Result<BackupServiceData, CollectorError> {
let timeout_duration = Duration::from_millis(self.timeout_ms);
// Get systemctl status for backup service
let status_output = timeout(
timeout_duration,
Command::new("systemctl")
.args([
"show",
&self.backup_service,
"--property=ActiveState,SubState,MainPID",
])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output(),
)
.await
.map_err(|_| CollectorError::Timeout {
duration_ms: self.timeout_ms,
})?
.map_err(|e| CollectorError::CommandFailed {
command: format!("systemctl show {}", self.backup_service),
message: e.to_string(),
})?;
let enabled = if status_output.status.success() {
let status_stdout = String::from_utf8_lossy(&status_output.stdout);
status_stdout.contains("ActiveState=active")
|| status_stdout.contains("SubState=running")
} else {
false
};
// Check for backup timer or service logs for last message
let last_message = self.get_last_backup_log_message().await.ok();
// Check for pending backup jobs (simplified - could check systemd timers)
let pending_jobs = 0; // TODO: Implement proper pending job detection
Ok(BackupServiceData {
enabled,
pending_jobs,
last_message,
})
}
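    /// Fetches the most recent journal line for the backup unit, if any.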
    async fn get_last_backup_log_message(&self) -> Result<String, CollectorError> {
        let timeout_duration = Duration::from_millis(self.timeout_ms);
        let output = timeout(
            timeout_duration,
            Command::new("journalctl")
                .args([
                    "-u",
                    &self.backup_service,
                    "--lines=1",
                    "--no-pager",
                    "--output=cat",
                ])
                .stdout(Stdio::piped())
                .stderr(Stdio::piped())
                .output(),
        )
        .await
        .map_err(|_| CollectorError::Timeout {
            duration_ms: self.timeout_ms,
        })?
        .map_err(|e| CollectorError::CommandFailed {
            command: format!("journalctl -u {} --lines=1", self.backup_service),
            message: e.to_string(),
        })?;
if output.status.success() {
let stdout = String::from_utf8_lossy(&output.stdout);
let message = stdout.trim().to_string();
if !message.is_empty() {
return Ok(message);
}
}
Err(CollectorError::ParseError {
message: "No log messages found".to_string(),
})
}
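    /// Scans the last week of journal entries for failure markers and returns
    /// the timestamp of the most recent match, if one exists.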
    async fn get_backup_logs_for_failures(&self) -> Result<Option<DateTime<Utc>>, CollectorError> {
        let timeout_duration = Duration::from_millis(self.timeout_ms);
        let output = timeout(
            timeout_duration,
            Command::new("journalctl")
                .args([
                    "-u",
                    &self.backup_service,
                    "--since",
                    "1 week ago",
                    // --grep takes a PCRE2 pattern: alternation is a bare `|`
                    // (`\|` would match a literal pipe), and an all-lowercase
                    // pattern matches case-insensitively, covering "ERROR" too.
                    "--grep=failed|error",
                    "--output=json",
                    "--lines=1",
                ])
                .stdout(Stdio::piped())
                .stderr(Stdio::piped())
                .output(),
        )
        .await
        .map_err(|_| CollectorError::Timeout {
            duration_ms: self.timeout_ms,
        })?
        .map_err(|e| CollectorError::CommandFailed {
            command: format!(
                "journalctl -u {} --since='1 week ago' --grep='failed|error'",
                self.backup_service
            ),
            message: e.to_string(),
        })?;
if output.status.success() {
let stdout = String::from_utf8_lossy(&output.stdout);
if let Ok(log_entry) = serde_json::from_str::<JournalEntry>(&stdout) {
if let Ok(timestamp) = log_entry.realtime_timestamp.parse::<i64>() {
let dt =
DateTime::from_timestamp_micros(timestamp).unwrap_or_else(|| Utc::now());
return Ok(Some(dt));
}
}
}
Ok(None)
}
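    /// Decision tree for the overall status: a repo with no snapshots, or a
    /// last success older than 48 hours, warns; a failure newer than the last
    /// success fails; an unreachable repo fails while the service is active
    /// and is otherwise reported as unknown.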
fn determine_backup_status(
&self,
restic_stats: &Result<ResticStats, CollectorError>,
service_data: &BackupServiceData,
last_failure: Option<DateTime<Utc>>,
) -> BackupStatus {
match restic_stats {
Ok(stats) => {
if let Some(last_success) = stats.last_success {
let hours_since_backup =
Utc::now().signed_duration_since(last_success).num_hours();
if hours_since_backup > 48 {
BackupStatus::Warning // More than 2 days since last backup
} else if let Some(failure) = last_failure {
if failure > last_success {
BackupStatus::Failed // Failure after last success
} else {
BackupStatus::Healthy
}
} else {
BackupStatus::Healthy
}
} else {
BackupStatus::Warning // No successful backups found
}
}
Err(_) => {
if service_data.enabled {
BackupStatus::Failed // Service enabled but can't access repo
} else {
BackupStatus::Unknown // Service disabled
}
}
}
}
}
#[async_trait]
impl Collector for BackupCollector {
fn name(&self) -> &str {
"backup"
}
fn agent_type(&self) -> AgentType {
AgentType::Backup
}
fn collect_interval(&self) -> Duration {
self.interval
}
fn is_enabled(&self) -> bool {
self.enabled
}
fn requires_root(&self) -> bool {
false // Depends on restic repo permissions
}
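    /// Gathers restic and service state, derives the overall status, and
    /// packages everything as this collector's JSON payload.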
async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
// Get restic repository stats
let restic_stats = self.get_restic_snapshots().await;
// Get backup service status
let service_data = self
.get_backup_service_status()
.await
.unwrap_or(BackupServiceData {
enabled: false,
pending_jobs: 0,
last_message: None,
});
// Check for recent failures
let last_failure = self.get_backup_logs_for_failures().await.unwrap_or(None);
// Determine overall backup status
let overall_status =
self.determine_backup_status(&restic_stats, &service_data, last_failure);
        let backup_info = match &restic_stats {
            Ok(stats) => BackupInfo {
                last_success: stats.last_success,
                last_failure,
                size_gb: stats.total_size as f32 / (1024.0 * 1024.0 * 1024.0),
                snapshot_count: stats.snapshot_count,
            },
            Err(_) => BackupInfo {
                last_success: None,
                last_failure,
                size_gb: 0.0,
                snapshot_count: 0,
            },
        };
let backup_metrics = json!({
"overall_status": overall_status,
"backup": backup_info,
"service": service_data,
"timestamp": Utc::now()
});
Ok(CollectorOutput {
agent_type: AgentType::Backup,
data: backup_metrics,
timestamp: Utc::now(),
})
}
}
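// Minimal view of an entry from `restic snapshots --json`; only the timestamp
// is needed to find the most recent snapshot.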
#[derive(Debug, Deserialize)]
struct ResticSnapshot {
time: DateTime<Utc>,
}
// Matches the `total_size` field of `restic stats --json`; serde ignores the
// other fields in that JSON by default.
#[derive(Debug, Deserialize)]
struct ResticRepoStats {
    total_size: u64,
}

// Aggregated repository state assembled by `get_restic_snapshots`; built in
// code, so it needs no `Deserialize` derive.
#[derive(Debug)]
struct ResticStats {
    total_size: u64,
    snapshot_count: u32,
    last_success: Option<DateTime<Utc>>,
}
#[derive(Debug, Serialize)]
struct BackupServiceData {
enabled: bool,
pending_jobs: u32,
last_message: Option<String>,
}
#[derive(Debug, Serialize)]
struct BackupInfo {
last_success: Option<DateTime<Utc>>,
last_failure: Option<DateTime<Utc>>,
size_gb: f32,
snapshot_count: u32,
}
#[derive(Debug, Serialize)]
enum BackupStatus {
Healthy,
Warning,
Failed,
Unknown,
}
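// journald's JSON output encodes `__REALTIME_TIMESTAMP` as microseconds since
// the Unix epoch in a decimal string.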
#[derive(Debug, Deserialize)]
struct JournalEntry {
#[serde(rename = "__REALTIME_TIMESTAMP")]
realtime_timestamp: String,
}